In [1]:
# default_exp core

# pytrim2

> A Python program for trimming and demultiplexing nanopore reads.

In [210]:
#hide

from nbdev.showdoc import *

In [211]:
#export

# Import dependencies
from Bio import SeqIO
from Bio import Align
from Bio.Seq import Seq
import numpy as np

In [220]:
#export

def findAlingments(record_dict, barcode_primer, inward_end, max_alignments):
    """Locally align a barcode/primer against the start of every read.

    Parameters
    ----------
    record_dict : mapping
        Maps read ids to records exposing a ``.seq`` attribute
        (e.g. the object returned by ``SeqIO.index``).
    barcode_primer : sequence
        Barcode/primer sequence that is searched for in each read.
    inward_end : int
        Only the first ``inward_end`` bases of each read are searched.
    max_alignments : int
        Reads with more optimal alignments than this are left as an
        all-zero row.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_reads, max_alignments + 2)``, one row per
        read in the order of ``record_dict.keys()``:

        * columns ``0 .. n_hits - 1``: end position (in the read) of each
          optimal alignment,
        * column ``-3``: largest of those end positions,
        * column ``-2``: number of optimal alignments,
        * column ``-1``: alignment score as a rounded percentage of the
          primer length.

        Rows stay all-zero when the read slice is empty, when no alignment
        is found, or when there are more than ``max_alignments`` alignments.

    Notes
    -----
    NOTE(review): column ``-3`` is the same column as index
    ``max_alignments - 1``, so when exactly ``max_alignments`` alignments are
    found the last end position is overwritten by the maximum. The shape is
    kept as-is for backward compatibility.
    """
    aligner = Align.PairwiseAligner()
    aligner.match_score = 1.0
    aligner.mismatch_score = 0
    aligner.gap_score = -2
    aligner.mode = "local"

    record_keys = list(record_dict.keys())
    primer_length = len(barcode_primer)

    al_array = np.zeros((len(record_keys), max_alignments + 2))

    for i, key in enumerate(record_keys):
        seq = record_dict[key].seq[0:inward_end]
        if len(seq) == 0:
            # Nothing to align against; keep the all-zero row.
            continue

        alignments = aligner.align(seq, barcode_primer)
        try:
            len_alignments = len(alignments)
        except OverflowError:
            # Too many optimal alignments to count: certainly more than
            # max_alignments, so treat the read like any other ambiguous one.
            continue

        # Zero hits previously crashed in max([]); too many hits are skipped.
        if len_alignments == 0 or len_alignments > max_alignments:
            continue

        # End (in the read) of the LAST aligned block, i.e. the end of the
        # whole alignment even when it contains a gap.
        ends = [alignment.aligned[0][-1][1] for alignment in alignments]

        al_array[i, 0:len(ends)] = ends  # ends of each alignment
        al_array[i, -3] = max(ends)  # maximum position over all alignments
        al_array[i, -2] = len_alignments  # number of alignments
        # normalized local alignment score
        al_array[i, -1] = np.around(alignments.score / primer_length * 100)

    return al_array
    
    

In [221]:
#hide

# Smoke test: search one barcode in the first 200 bases of the test reads.
record_dict = SeqIO.index("test_data/test.fasta", "fasta")
barcode_primer = Seq("CGCTCAGTTC")
ends = findAlingments(
    record_dict,
    barcode_primer,
    inward_end=200,
    max_alignments=10,
)

In [214]:
#export

def align_barcodes(primer_dict, record_dict, inward_end, max_alignments):
    """Run ``findAlingments`` for every primer against every read.

    Parameters
    ----------
    primer_dict : mapping
        Maps primer ids to records exposing a ``.seq`` attribute.
    record_dict : mapping
        Maps read ids to records; passed through to ``findAlingments``.
    inward_end : int
        Number of leading bases of each read to search; passed through to
        ``findAlingments`` (previously ignored in favour of a fixed 200).
    max_alignments : int
        Passed through to ``findAlingments``.

    Returns
    -------
    list
        One ``findAlingments`` result per primer, in the order of
        ``primer_dict.keys()``.
    """
    return [
        findAlingments(
            record_dict, primer_dict[primer_key].seq, inward_end, max_alignments
        )
        for primer_key in list(primer_dict.keys())
    ]

In [215]:
#hide

# Align every test primer against the test reads and display the result.
primer_dict = SeqIO.index("test_data/test_primer.fasta", "fasta")
record_dict = SeqIO.index("test_data/test.fasta", "fasta")
alginment_arrays = align_barcodes(
    primer_dict,
    record_dict,
    inward_end=200,
    max_alignments=7,
)
alginment_arrays

[array([[ 24.,   0.,   0.,   0.,   0.,   0.,  24.,   1., 100.],
        [ 16.,  59.,  72., 153.,   0.,   0., 153.,   4.,  60.],
        [149., 156., 179.,   0.,   0.,   0., 179.,   3.,  60.]]),
 array([[ 43., 197.,   0.,   0.,   0.,   0., 197.,   2.,  60.],
        [ 31.,   0.,   0.,   0.,   0.,   0.,  31.,   1., 100.],
        [ 28.,  97., 186.,   0.,   0.,   0., 186.,   3.,  60.]]),
 array([[ 75., 100., 103., 109.,   0.,   0., 109.,   4.,  50.],
        [  8.,  29.,   0.,   0.,   0.,   0.,  29.,   2.,  60.],
        [ 29.,   0.,   0.,   0.,   0.,   0.,  29.,   1., 100.]]),
 array([[ 94., 189.,   0.,   0.,   0.,   0., 189.,   2.,  50.],
        [ 76.,   0.,   0.,   0.,   0.,   0.,  76.,   1.,  70.],
        [126.,   0.,   0.,   0.,   0.,   0., 126.,   1.,  70.]]),
 array([[ 27., 103., 118.,   0.,   0.,   0., 118.,   3.,  60.],
        [ 30.,  86., 198.,   0.,   0.,   0., 198.,   3.,  60.],
        [108., 117.,   0.,   0.,   0.,   0., 117.,   2.,  60.]]),
 array([[ 18.,   0.,   0.,   0

In [195]:
# Debug cell: print the largest value found in each row of every
# per-primer array in alginment_arrays.
for i in alginment_arrays:
    for j in i:
        print(np.max(j))
        # NOTE(review): the two commented-out lines below are the only place
        # in the visible notebook where `x` is assigned.
        #x = np.max(j[1:,range(0, j.shape[1] - 2)])
        #print(x)

100.0
0.0
0.0
197.0
100.0
0.0
0.0
60.0
100.0
189.0
76.0
126.0
0.0
0.0
117.0
70.0
0.0
0.0


In [177]:
# NOTE(review): no active code visible in this notebook assigns `x` (only a
# commented-out line does), so this cell presumably relies on leftover kernel
# state and would fail after Restart & Run All — confirm or remove.
x

array([[18.,  0.,  1., 70.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

In [148]:
# hide 
# For each primer: count reads with a score >= 85 and exactly one alignment,
# then show every row whose score is >= 85.
for primer_hits in alginment_arrays:
    high_score = primer_hits[:, -1] >= 85
    single_hit = primer_hits[:, -2] == 1
    print((high_score & single_hit).sum())
    print(primer_hits[np.where(high_score), :])

1
[[[ 24.   0.   1. 100.]]]
1
[[[ 31.   0.   1. 100.]]]
1
[[[ 29.   0.   1. 100.]]]
0
[]
0
[]
0
[]


In [None]:
#slow

# Full run on the FASTQ test reads: align every test primer, then report
# per primer how many reads score >= 85 and show those rows.
primer_dict = SeqIO.index("test_data/test_primer.fasta", "fasta")
record_dict = SeqIO.index("test_data/test.fastq", "fastq")
alginment_arrays = align_barcodes(
    primer_dict,
    record_dict,
    inward_end=200,
    max_alignments=10,
)

for primer_hits in alginment_arrays:
    high_score = primer_hits[:, -1] >= 85
    print(high_score.sum())
    print(primer_hits[np.where(high_score), :])