In [None]:
# default_exp core

# pytrim2

> A Python program for trimming and demultiplexing nanopore reads.

In [None]:
#hide

from nbdev.showdoc import *

In [None]:
#export

# Import dependencies
from Bio import SeqIO
from Bio import Align
from Bio.Seq import Seq
import numpy as np

In [None]:
#export

def findAlingments(record_dict, barcode_primer, inward_end, max_alignments):
    """Locally align a barcode/primer against the start of every read.

    Parameters
    ----------
    record_dict : mapping
        Maps record ids to records exposing a sliceable `.seq`
        (the notebook passes the result of `SeqIO.index`).
    barcode_primer : sequence
        Barcode/primer to search for; its length normalizes the score.
    inward_end : int
        Only the prefix `seq[0:inward_end]` of each read is searched.
    max_alignments : int
        Reads yielding more optimal alignments than this are left as an
        all-zero row.

    Returns
    -------
    numpy.ndarray
        Shape `(len(record_dict), max_alignments + 2)`. Per row:
        columns `0 .. max_alignments-1` hold the end coordinate (within the
        read prefix) of each optimal alignment, zero-padded; column `-2` is
        the number of alignments; column `-1` is
        `round(score / len(barcode_primer) * 100)`.
    """
    record_keys = list(record_dict.keys())

    aligner = Align.PairwiseAligner()
    aligner.match_score = 1.0
    aligner.mismatch_score = 0
    aligner.gap_score = -2
    aligner.mode = "local"

    al_array = np.zeros((len(record_keys), max_alignments + 2))

    for i, key in enumerate(record_keys):
        seq = record_dict[key].seq[0:inward_end]
        if len(seq) == 0:
            # Nothing to align against; keep the row zeroed instead of
            # letting the aligner reject the empty sequence.
            continue

        alignments = aligner.align(seq, barcode_primer)
        try:
            len_alignments = len(alignments)
        except OverflowError:
            # Biopython raises when the alignment count does not fit in an
            # integer index; that is certainly more than max_alignments.
            continue
        if len_alignments > max_alignments:
            continue

        # `aligned[0]` lists the (start, end) blocks on the read side; a
        # gapped alignment has several blocks, so the end of the alignment
        # is the end of the LAST block, not the first.
        ends = [alignment.aligned[0][-1][1] for alignment in alignments]

        al_array[i, 0:len(ends)] = ends  # ends of each alignment
        al_array[i, -2] = len_alignments  # number of alignments
        # normalized local alignment score (percent of barcode length)
        al_array[i, -1] = np.around(alignments.score / len(barcode_primer) * 100)

    return al_array
    
    

In [None]:
#hide

# Smoke test: search one barcode in the first 200 bases of each test read.
record_dict = SeqIO.index("test_data/test.fasta", "fasta")
barcode_primer = Seq("CGCTCAGTTC")
ends = findAlingments(
    record_dict=record_dict,
    barcode_primer=barcode_primer,
    inward_end=200,
    max_alignments=10,
)
print(ends)
# Number of reads whose normalized score (last column) exceeds 85.
(ends[:, -1] > 85).sum()

[[ 90. 187.   0.   0.   0.   0.   0.   0.   0.   0.   2.  70.]
 [ 16.  34.  59.  72. 153.   0.   0.   0.   0.   0.   5.  60.]
 [ 36. 149. 156. 179.   0.   0.   0.   0.   0.   0.   4.  60.]]


0

In [None]:
def align_barcodes(primer_dict, record_dict, inward_end, max_alignments):
    """Run `findAlingments` once per primer against every read.

    Parameters
    ----------
    primer_dict : mapping
        Maps primer ids to records exposing a `.seq` attribute
        (the notebook passes the result of `SeqIO.index`).
    record_dict : mapping
        Reads; handed unchanged to `findAlingments`.
    inward_end : int
        Forwarded to `findAlingments` as the read-prefix length to search.
    max_alignments : int
        Forwarded to `findAlingments`.

    Returns
    -------
    list
        One `findAlingments` result per primer, in `primer_dict.keys()` order.
    """
    # `inward_end` is forwarded as given; it used to be ignored in favour of
    # a hard-coded 200.
    return [
        findAlingments(record_dict, primer_dict[key].seq, inward_end, max_alignments)
        for key in list(primer_dict.keys())
    ]

In [None]:
#hide

# Align every primer in the test set against the small FASTA test reads.
primer_dict = SeqIO.index("test_data/test_primer.fasta", "fasta")
record_dict = SeqIO.index("test_data/test.fasta", "fasta")
align_barcodes(
    primer_dict=primer_dict,
    record_dict=record_dict,
    inward_end=200,
    max_alignments=5,
)

[array([[ 90., 187.,   0.,   0.,   0.,   2.,  70.],
        [ 16.,  34.,  59.,  72., 153.,   5.,  60.],
        [ 36., 149., 156., 179.,   0.,   4.,  60.]]),
 array([[ 43., 197.,   0.,   0.,   0.,   2.,  60.],
        [121.,   0.,   0.,   0.,   0.,   1.,  70.],
        [ 97., 186.,   0.,   0.,   0.,   2.,  60.]]),
 array([[  0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  8.,   0.,   0.,   0.,   0.,   1.,  60.],
        [138.,   0.,   0.,   0.,   0.,   1.,  70.]]),
 array([[ 94., 189.,   0.,   0.,   0.,   2.,  50.],
        [ 76.,   0.,   0.,   0.,   0.,   1.,  70.],
        [126.,   0.,   0.,   0.,   0.,   1.,  70.]]),
 array([[103., 118., 166.,   0.,   0.,   3.,  60.],
        [ 86., 198.,   0.,   0.,   0.,   2.,  60.],
        [108., 117.,   0.,   0.,   0.,   2.,  60.]]),
 array([[  0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [ 14.,  33.,  56.,  89., 177.,   5.,  50.],
        [ 24.,  35.,   0.,   0.,   0.,   2.,  60.]])]

In [None]:
#slow

# Full-size run: align every primer against the FASTQ test reads.
primer_dict = SeqIO.index("test_data/test_primer.fasta", "fasta")
record_dict = SeqIO.index("test_data/test.fastq", "fastq")
alginment_arrays = align_barcodes(primer_dict, record_dict, 200, 10)

for i in alginment_arrays:
    # Rows whose normalized score (last column) is at least 85.
    high_scoring = i[:, -1] >= 85
    print(np.sum(high_scoring))
    # Index with the boolean mask directly: placing np.where(...) in the
    # first index slot adds a spurious leading axis to the printed array.
    print(i[high_scoring, :])

1756
8514
903
1575
2116
76
