In [3]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable, so packages
# that live one level above this notebook can be imported.
parent_dir = Path.cwd().parents[0]
module_path = str(parent_dir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)


In [4]:
from feito.utils.reconstruct_reads import ReconstructReads

## Reconstruct reads
Given the (1) index and (2) basecalled portions of reads, concatenate these portions to reconstruct the reads, which will later be mapped to the reference transcriptomes.


In [31]:
import pandas as pd 
# from tqdm.notebook import tqdm_notebook as tqdm
from tqdm import tqdm
from Bio import SeqIO
from collections import namedtuple, defaultdict

In [5]:
# Locations of the basecalling artefacts (relative paths).
PATH_INDEX = "../output-old/basecalling/simplenet-index.csv"
PATH_READS = "../output-old/basecalling/simplenet-basecalled_reads.fa"
PATH_RECONSTRUCTED_READS = "../output-old/basecalling/simplenet-full_reads.fa"

# Load the index table. The step-by-step cells further down read
# `index.read_id`, `index.subsignal_id` and `index.shape`, so `index` has to
# exist when the notebook is run top to bottom on a fresh kernel.
# NOTE(review): the tab separator is taken from the earlier commented-out
# call — confirm it matches the file.
index = pd.read_csv(PATH_INDEX, sep="\t")

# Callable helper from feito.utils.reconstruct_reads, configured with the
# index, the basecalled portions and the output path for the full reads.
reconstruct_reads = ReconstructReads(
    path_index=PATH_INDEX,
    path_basecalled_reads=PATH_READS,
    path_reconstructed_reads=PATH_RECONSTRUCTED_READS
)

100%|██████████| 30540/30540 [00:00<00:00, 542615.03it/s]


In [None]:
# Run the ReconstructReads instance configured above. What it does is defined
# in feito.utils.reconstruct_reads (not visible in this notebook);
# presumably it writes PATH_RECONSTRUCTED_READS — TODO confirm.
reconstruct_reads()

In [33]:
# Build two lookups from the index table:
#   index_by_readid : read_id  -> [(idx, subsignal_id), ...] in table order
#   id2info         : int(idx) -> (read_id, int(subsignal_id))
# NOTE(review): `index.index` is the DataFrame's row index, not a column
# called "index" — confirm that the row position is the intended portion id.
index_by_readid = defaultdict(list)
id2info = {}

index_rows = zip(
    index.read_id,
    index.index.tolist(),
    index.subsignal_id.tolist(),
)
for read_id, idx, subsignal_id in tqdm(index_rows, total=index.shape[0]):
    id2info[int(idx)] = (read_id, int(subsignal_id))
    index_by_readid[read_id].append((idx, subsignal_id))

100%|██████████| 30540/30540 [00:00<00:00, 280456.55it/s]


In [34]:
# Inspect the read_id -> [(idx, subsignal_id), ...] lookup.
index_by_readid

defaultdict(list,
            {'f8a47b45-b500-43b2-a8dc-d42512ed24c8': [(0, 0),
              (1, 1),
              (2, 2),
              (3, 3),
              (4, 4)],
             '3432765b-b762-4b5c-8ae2-aaa8b5c6413b': [(5, 0),
              (6, 1),
              (7, 2),
              (8, 3),
              (9, 4),
              (10, 5)],
             '610835c7-79d5-4fec-9eef-2497514cdf49': [(11, 0),
              (12, 1),
              (13, 2),
              (14, 3),
              (15, 4)],
             'efa6c9fe-f576-4b5b-b152-966e649c7481': [(16, 0),
              (17, 1),
              (18, 2),
              (19, 3),
              (20, 4),
              (21, 5),
              (22, 6),
              (23, 7),
              (24, 8),
              (25, 9),
              (26, 10),
              (27, 11),
              (28, 12)],
             '90c7d6f5-f141-4076-9b0b-15e76c6e4b6d': [(29, 0),
              (30, 1),
              (31, 2),
              (32, 3),
              (33, 4),
   

In [35]:
# Inspect the idx -> (read_id, subsignal_id) lookup.
id2info

{0: ('f8a47b45-b500-43b2-a8dc-d42512ed24c8', 0),
 1: ('f8a47b45-b500-43b2-a8dc-d42512ed24c8', 1),
 2: ('f8a47b45-b500-43b2-a8dc-d42512ed24c8', 2),
 3: ('f8a47b45-b500-43b2-a8dc-d42512ed24c8', 3),
 4: ('f8a47b45-b500-43b2-a8dc-d42512ed24c8', 4),
 5: ('3432765b-b762-4b5c-8ae2-aaa8b5c6413b', 0),
 6: ('3432765b-b762-4b5c-8ae2-aaa8b5c6413b', 1),
 7: ('3432765b-b762-4b5c-8ae2-aaa8b5c6413b', 2),
 8: ('3432765b-b762-4b5c-8ae2-aaa8b5c6413b', 3),
 9: ('3432765b-b762-4b5c-8ae2-aaa8b5c6413b', 4),
 10: ('3432765b-b762-4b5c-8ae2-aaa8b5c6413b', 5),
 11: ('610835c7-79d5-4fec-9eef-2497514cdf49', 0),
 12: ('610835c7-79d5-4fec-9eef-2497514cdf49', 1),
 13: ('610835c7-79d5-4fec-9eef-2497514cdf49', 2),
 14: ('610835c7-79d5-4fec-9eef-2497514cdf49', 3),
 15: ('610835c7-79d5-4fec-9eef-2497514cdf49', 4),
 16: ('efa6c9fe-f576-4b5b-b152-966e649c7481', 0),
 17: ('efa6c9fe-f576-4b5b-b152-966e649c7481', 1),
 18: ('efa6c9fe-f576-4b5b-b152-966e649c7481', 2),
 19: ('efa6c9fe-f576-4b5b-b152-966e649c7481', 3),
 20: ('efa

In [36]:
# Group the basecalled portions by the read they belong to.
#   reads_by_portions : read_id -> [(order_portion, seq), ...]
# The read id and the portion order of each FASTA record are looked up in
# `id2info` using the integer stored in the record's header.

# Preview limit on the number of FASTA records consumed; set to None to
# process every record in PATH_READS.
MAX_PORTIONS = 100

reads_by_portions = defaultdict(list)

with open(PATH_READS, "r") as fp:
    # Count records with enumerate(): a manual `nreads=+1` assigns +1 on each
    # iteration instead of incrementing, so the limit would never be reached.
    for nreads, read in enumerate(SeqIO.parse(fp, "fasta"), start=1):
        if MAX_PORTIONS is not None and nreads > MAX_PORTIONS:
            # NOTE: stopping here can leave the last read with only some of
            # its portions, since the limit counts portions, not whole reads.
            break
        idx = int(read.description)  # header must be the integer portion index
        seq = read.seq

        read_id, order_portion = id2info[idx]

        reads_by_portions[read_id].append((order_portion, seq))

In [37]:
# Rebuild each full read: order its portions by their position within the
# read, then concatenate the sequences into one string.
#   reconstructed_reads : read_id -> full sequence (str)
reconstructed_reads = {
    rid: "".join(
        str(portion[1])
        for portion in sorted(portions, key=lambda portion: portion[0])
    )
    for rid, portions in reads_by_portions.items()
}

In [38]:
# Inspect the read_id -> reconstructed sequence mapping.
reconstructed_reads

{'f8a47b45-b500-43b2-a8dc-d42512ed24c8': 'AGAGAGAAAAAAAGAAAGAGAGAGAGAGAGAGGAGAAGAAAGAGAGAGAAGAGAGAGAAGAGAGAAAGAGAGAGAGGAGAAAGAGAGAGAAGAAGAGAGAGAAGAGAAGAGAAGAGAAGGAAGAGAAGAAGAGA',
 '3432765b-b762-4b5c-8ae2-aaa8b5c6413b': 'GAGAGAAGAGAGAGAGAGAAAAAAAGAGAGAGAGAGAAGAGAAGAGAAGAAGAAAAACGAAAGAAAAGAGAAGAGAGAAGAAAGAAGAGAGAAGAGAGAA',
 '610835c7-79d5-4fec-9eef-2497514cdf49': 'GAAGAAAAAAAAAAAAAAAGAAAGAGAGAAAAGAAGAGAGAGAGGAGAAGAGGAGAGGAGAAGGAGAGAGAAGAGAGAGAAGAAGAGAGAGAG',
 'efa6c9fe-f576-4b5b-b152-966e649c7481': 'ACGAAAAAACAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAGAGAGAAGAGGGGGAGAGAAGAAGAAAAAAAAAGAGAGGAGAAGAGAGAGAAGAGAGAGAGAGGAGAGAGAGAGGAGAGGAGAAGGAGGAGAGAGAGAGGAAGAAGACGAGAGGAGAGAGAGAGACGAGAAAGAGGAGAGAGACGAGAGAAGAGAGAAAGAAGAGAAGAGAGAGAGACGAGAGAGAGAGAGAGGAGAGAGAGAGAGAGAGAGAAAGAGAGAGGAGAGAGGAGAGAGAGAAGAGAAGAAGAGAGAGAAG',
 '90c7d6f5-f141-4076-9b0b-15e76c6e4b6d': 'GGAGAAAAAAGAGAGAGAAGGAGAGAGAGGAAGAGAGAAGAGAGAAGAGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGACGGAGGAGAGAGAGAGAAAGAGAAAAAAAGACAAGAAGAGAGAGAGAAGAGAAAAA

In [41]:
### reconstructed reads to fasta
# One two-line FASTA record per read: ">read_id" header, then the sequence.
# NOTE(review): the file is opened in append mode, so re-running this cell
# adds the same records again — confirm that appending is intended.
with open(PATH_RECONSTRUCTED_READS, "a") as fasta_out:
    for readid, read in reconstructed_reads.items():
        fasta_out.write(f">{readid}\n{read}\n")

___