In [42]:
import pandas as pd
import numpy as np
import scipy
import scipy.sparse
import scipy.stats
import os
import scipy.io as sio
import regex as re
from collections import Counter, defaultdict
#from pylab import *
#import matplotlib.pyplot as plt
import sys 
#%matplotlib inline
import gzip

from skbio import TabularMSA, DNA
from skbio.alignment import local_pairwise_align_ssw

# Integer codes for the four nucleotides; each code is the index of the
# matching letter in CONST_NT_MAP.
CONST_A, CONST_C, CONST_G, CONST_T = range(4)

# Nucleotide letters ordered by their integer code.
CONST_NT_MAP = list('ACGT')

def distance(astring, bstring) :
    """Count the positions at which two strings disagree.

    The strings are compared character by character over the length of the
    shorter one; every surplus character of the longer string counts as one
    additional mismatch.

    Returns the total number of mismatches as an int.
    """
    mismatches = sum(1 for a_char, b_char in zip(astring, bstring) if a_char != b_char)
    length_gap = abs(len(astring) - len(bstring))
    return mismatches + length_gap

def reverse_complement(seq) :
    """Return the reverse complement of an upper-case DNA string.

    Only the characters 'A', 'C', 'G' and 'T' are translated. Any other
    character (for example 'N' or lower-case letters) is silently dropped,
    so the result can be shorter than the input.
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement[base] for base in reversed(seq) if base in complement)

def get_hamming_neighbor_1(seq, seq_map, start_r, end_r) :
    """Find a member of seq_map that differs from seq at a single position.

    Each position in range(start_r, end_r) is replaced in turn by every
    letter of CONST_NT_MAP (including the letter already there, so seq
    itself is also tested). Candidates are tried position by position, then
    in CONST_NT_MAP order.

    Returns the first candidate found in seq_map, or None when there is none.
    """
    candidates = (
        seq[:pos] + nt + seq[pos+1:]
        for pos in range(start_r, end_r)
        for nt in CONST_NT_MAP
    )
    return next((candidate for candidate in candidates if candidate in seq_map), None)

def get_hamming_neighbor_2(seq, seq_map, start_r, end_r) :
    """Find a member of seq_map that differs from seq at up to two positions.

    Every ordered pair of positions lo < hi inside range(start_r, end_r) is
    tried, and both positions are replaced by every combination of letters
    from CONST_NT_MAP (the original letters included).

    Returns the first candidate found in seq_map, or None when there is none.
    """
    candidates = (
        seq[:lo] + nt_lo + seq[lo+1:hi] + nt_hi + seq[hi+1:]
        for lo in range(start_r, end_r)
        for hi in range(lo + 1, end_r)
        for nt_lo in CONST_NT_MAP
        for nt_hi in CONST_NT_MAP
    )
    return next((candidate for candidate in candidates if candidate in seq_map), None)

In [37]:
# Load the DNA library table; its 'barcode' and 'sequence' columns are used below.
dna_file_prefix = 'doubledope_dna_hamming2'
dna_df = pd.read_csv(dna_file_prefix + '.csv', sep=',')

dna_barcode_list = list(dna_df.barcode)
dna_sequence_list = list(dna_df.sequence)

# dna_barcode_map maps every barcode to itself (used for membership tests),
# dna_sequence_map maps every barcode to its sequence. For a repeated
# barcode the last row wins.
dna_barcode_map = {library_barcode: library_barcode for library_barcode in dna_barcode_list}
dna_sequence_map = dict(zip(dna_barcode_list, dna_sequence_list))

In [38]:
# Report how many barcodes each lookup holds; both are built from the same keys.
for lookup in (dna_barcode_map, dna_sequence_map) :
    print(len(lookup))

2551978
2551978


In [39]:

#Remove sequences suspected of internal priming conflicts

# Fuzzy patterns in the `regex` module's syntax: each one matches a run of
# A's while tolerating up to the stated number of substitutions ({s<=N}).
misprime_regexes = [
    #re.compile(r"(AAAAAAA)"),
    re.compile(r"(AAAAAAAAAAAA){s<=2}"),#12 A
    re.compile(r"(AAAAAAAAAAAAAAAA){s<=4}"),
    re.compile(r"(AAAAAAAAAAAAAAAAAAAA){s<=5}")
]

for i, curr_barcode in enumerate(dna_barcode_list) :
    if i % 500000 == 0 :
        print('Removing mispriming suspects at sequence ' + str(i))

    # The maps are filtered in place. A barcode that is already gone (this
    # cell was re-run, or the barcode occurs twice in the list) is skipped
    # instead of raising KeyError.
    curr_seq = dna_sequence_map.get(curr_barcode)
    if curr_seq is None :
        continue

    # One matching pattern is enough to discard the library member.
    if any(misprime_regex.search(curr_seq) for misprime_regex in misprime_regexes) :
        dna_barcode_map.pop(curr_barcode, None)
        del dna_sequence_map[curr_barcode]

print(len(dna_barcode_map))
print(len(dna_sequence_map))


Removing mispriming suspects at sequence 0
Removing mispriming suspects at sequence 500000
Removing mispriming suspects at sequence 1000000
Removing mispriming suspects at sequence 1500000
Removing mispriming suspects at sequence 2000000
Removing mispriming suspects at sequence 2500000
2123419
2123419


In [41]:

# Rebuild the parallel barcode/sequence lists from the barcodes still present
# in dna_barcode_map, keeping the dictionary's iteration order.
dna_barcode_list = list(dna_barcode_map)
dna_sequence_list = [dna_sequence_map[kept_barcode] for kept_barcode in dna_barcode_list]

new_columns = ['barcode', 'sequence']

df = pd.DataFrame({'barcode':    dna_barcode_list,
                   'sequence':   dna_sequence_list})

print(len(df))

# Write the filtered library next to the input file, without the index column.
df.to_csv(dna_file_prefix + '_antimisprime.csv', sep=',', header=True, columns=new_columns, index=False)


2123419


In [56]:
# Gzipped FASTQ inputs. In the read-processing cell r1 supplies the barcode,
# r2 the RNA read and i1 the UMI.
r1_rna, r2_rna, i1_rna = 'r1.fq.gz', 'r2.fq.gz', 'i1.fq.gz'

# Fixed reference sequence (upper-cased). The read-processing cell aligns a
# read against it when the barcode's own sequence yields no alignment.
distal_utr_lower = 'gtgccttctagttgccagccatctgttgtttgcccctcccccgtgccttccttgaccctggaaggtgccactcccactgtcctttcctaataaaatgaggaaattgcatcgcattgtctgagtaggtgtcattctattctggggggtggggtggggcaggacagcaaggg'
distal_utr = distal_utr_lower.upper()
#distal_utr = 'cctaataaaatgaggaaattgcatcgcattgtctgagtaggtgtcattctattctggggggtggggtggggcaggacagcaaggg'.upper()

print(distal_utr)

# scikit-bio sequence object, built once and reused for every alignment.
distal_utr_DNA = DNA(distal_utr)

GTGCCTTCTAGTTGCCAGCCATCTGTTGTTTGCCCCTCCCCCGTGCCTTCCTTGACCCTGGAAGGTGCCACTCCCACTGTCCTTTCCTAATAAAATGAGGAAATTGCATCGCATTGTCTGAGTAGGTGTCATTCTATTCTGGGGGGTGGGGTGGGGCAGGACAGCAAGGG


In [57]:
# Print the lengths of two sequence fragments; they appear to be the leading
# and trailing segments of the distal_utr literal (verify before relying on it).
for fragment in ('gtgccttctagttgccagccatctgttgtttgcccctcccccgtgccttccttgaccctggaaggtgccactcccactgtccttt',
                 'tgaggaaattgcatcgcattgtctgagtaggtgtcattctattctggggggtggggtggggcaggacagcaaggg') :
    print(len(fragment))

85
75


In [73]:
# Map every RNA read triplet to a library member and estimate its polyA position.
#
# The three FASTQ files are read in lock-step, four lines per record:
#   r1 -> barcode, r2 -> RNA read (reverse-complemented), i1 -> UMI.
# A barcode is accepted when it is in dna_barcode_map, either exactly or
# after one substitution. The read is then aligned to that barcode's
# sequence, with distal_utr_DNA as the fallback target, and one CSV row is
# written per mapped read.

# Set > 0 to resume: records with an index below this value are skipped and
# results are appended to the existing output file.
start_from_count = 0

file_action = 'w'
if start_from_count > 0 :
    file_action = 'a'


head, seq, pr, q = ({} for i in range(4))
count = 0

total_matched_count = 0
total_mapped_count = 0

matched_on_dict_count = 0
matched_on_hamming1_count = 0
matched_on_hamming2_count = 0

# Passed to local_pairwise_align_ssw, which returns None for alignments that
# fail this filter.
score_filter = 30

print('Processing RNA reads.')

# The with-block closes all four files even when the loop is interrupted or
# raises. The names f, i1 and out stay defined afterwards.
with gzip.open(r1_rna, 'rt') as r1_handle, \
     gzip.open(r2_rna, 'rt') as r2_handle, \
     gzip.open(i1_rna, 'rt') as i1, \
     open('doubledope_rna_mapped_hammingsearch1_all.csv', file_action) as out :

    f = {0: r1_handle, 1: r2_handle}

    # Only a fresh file gets the header; writing it again when appending
    # would put a header line in the middle of the data.
    if file_action == 'w' :
        out.write('barcode,umi,rna_read,align_start_ref,align_end_ref,align_start_read,align_end_read,polya_pos,align_score\n')

    while True:
        # One FASTQ record = header, sequence, separator and quality line.
        for i in range(2):
            head[i] = f[i].readline().rstrip()
            seq[i] = f[i].readline().rstrip()
            pr[i] = f[i].readline().rstrip()
            q[i] = f[i].readline().rstrip()

        headi = i1.readline().rstrip()
        seqi = i1.readline().rstrip()
        pri = i1.readline().rstrip()
        qi = i1.readline().rstrip()

        if len(seq[0]) == 0:
            break # End of File

        if count < start_from_count :
            count += 1
            continue

        barcode = seq[0]
        umi = seqi
        rna_seq = reverse_complement(seq[1])

        # Exact lookup first, then a single-substitution search over
        # positions 0..19 of the barcode.
        barcode_key = barcode
        matched = False
        if barcode in dna_barcode_map :
            matched = True
            matched_on_dict_count += 1
        else :
            barcode_h1 = get_hamming_neighbor_1(barcode, dna_barcode_map, 0, 20)
            if barcode_h1 is not None :
                matched = True
                barcode_key = barcode_h1
                matched_on_hamming1_count += 1
            # Two-substitution search, currently disabled:
            # else :
            #     barcode_h2 = get_hamming_neighbor_2(barcode, dna_barcode_map, 0, 20)
            #     if barcode_h2 != None :
            #         matched = True
            #         barcode_key = barcode_h2
            #         matched_on_hamming2_count += 1

        if matched :
            total_matched_count += 1

            rna_seq_DNA = DNA(rna_seq)
            polya_pos = -1
            start_end_positions = None
            score = -10000

            ref_seq = dna_sequence_map[barcode_key]

            # Align against the barcode's own sequence first; fall back to
            # the distal UTR, whose hits get a fixed offset of 207
            # (NOTE(review): meaning of 207 is not visible here - confirm).
            polya_offset = 0
            candidate_tuple = local_pairwise_align_ssw(DNA(ref_seq),rna_seq_DNA,score_filter=score_filter)
            if candidate_tuple is None :
                candidate_tuple = local_pairwise_align_ssw(distal_utr_DNA,rna_seq_DNA,score_filter=score_filter)
                polya_offset = 207 #+ 85##+ 280

            if candidate_tuple is not None :
                _, score, start_end_positions = candidate_tuple
                # Reference end of the alignment plus the read positions
                # that follow the aligned region; the literal 50 is
                # presumably the read length - TODO confirm.
                polya_pos = start_end_positions[0][1] + (50 - 1 - start_end_positions[1][1]) + polya_offset #+ 73

            if polya_pos != -1 :
                total_mapped_count += 1

                # Column order matches the header written above.
                out.write(','.join([
                    barcode_key,
                    umi,
                    rna_seq,
                    str(start_end_positions[0][0]),
                    str(start_end_positions[0][1]),
                    str(start_end_positions[1][0]),
                    str(start_end_positions[1][1]),
                    str(polya_pos),
                    str(score),
                ]) + '\n')

        if count % 1000000 == 0:
            print('Count: ' + str(count))
            print('Reads matched against barcodes: ' + str(total_matched_count))
            print('Reads mapped to reference: ' + str(total_mapped_count))

            print('Matched on dictionary: ' + str(matched_on_dict_count))
            print('Matched on hamming 1: ' + str(matched_on_hamming1_count))
            print('Matched on hamming 2: ' + str(matched_on_hamming2_count))

        count += 1

print('COMPLETE')
print('Reads matched against barcodes: ' + str(total_matched_count))
print('Reads mapped to reference: ' + str(total_mapped_count))

print('Matched on dictionary: ' + str(matched_on_dict_count))
print('Matched on hamming 1: ' + str(matched_on_hamming1_count))
print('Matched on hamming 2: ' + str(matched_on_hamming2_count))

Processing RNA reads.
Count: 0
Reads matched against barcodes: 0
Reads mapped to reference: 0
Matched on dictionary: 0
Matched on hamming 1: 0
Matched on hamming 2: 0
Count: 1000000
Reads matched against barcodes: 251463
Reads mapped to reference: 148826
Matched on dictionary: 237348
Matched on hamming 1: 14115
Matched on hamming 2: 0
Count: 2000000
Reads matched against barcodes: 504626
Reads mapped to reference: 266146
Matched on dictionary: 479797
Matched on hamming 1: 24829
Matched on hamming 2: 0
Count: 3000000
Reads matched against barcodes: 745495
Reads mapped to reference: 356175
Matched on dictionary: 707509
Matched on hamming 1: 37986
Matched on hamming 2: 0
Count: 4000000
Reads matched against barcodes: 982284
Reads mapped to reference: 480572
Matched on dictionary: 930592
Matched on hamming 1: 51692
Matched on hamming 2: 0
Count: 5000000
Reads matched against barcodes: 1219763
Reads mapped to reference: 623413
Matched on dictionary: 1154522
Matched on hamming 1: 65241
Match

Count: 47000000
Reads matched against barcodes: 11455961
Reads mapped to reference: 7715065
Matched on dictionary: 10742654
Matched on hamming 1: 713307
Matched on hamming 2: 0
Count: 48000000
Reads matched against barcodes: 11698418
Reads mapped to reference: 7887591
Matched on dictionary: 10969621
Matched on hamming 1: 728797
Matched on hamming 2: 0
Count: 49000000
Reads matched against barcodes: 11943983
Reads mapped to reference: 8064555
Matched on dictionary: 11199152
Matched on hamming 1: 744831
Matched on hamming 2: 0
Count: 50000000
Reads matched against barcodes: 12188820
Reads mapped to reference: 8246401
Matched on dictionary: 11427646
Matched on hamming 1: 761174
Matched on hamming 2: 0
Count: 51000000
Reads matched against barcodes: 12434244
Reads mapped to reference: 8427280
Matched on dictionary: 11657171
Matched on hamming 1: 777073
Matched on hamming 2: 0
Count: 52000000
Reads matched against barcodes: 12681205
Reads mapped to reference: 8611009
Matched on dictionary: 

Count: 93000000
Reads matched against barcodes: 22601829
Reads mapped to reference: 15646848
Matched on dictionary: 21173321
Matched on hamming 1: 1428508
Matched on hamming 2: 0
Count: 94000000
Reads matched against barcodes: 22842829
Reads mapped to reference: 15835472
Matched on dictionary: 21397858
Matched on hamming 1: 1444971
Matched on hamming 2: 0
Count: 95000000
Reads matched against barcodes: 23085258
Reads mapped to reference: 16024515
Matched on dictionary: 21623497
Matched on hamming 1: 1461761
Matched on hamming 2: 0
Count: 96000000
Reads matched against barcodes: 23327517
Reads mapped to reference: 16211788
Matched on dictionary: 21849232
Matched on hamming 1: 1478285
Matched on hamming 2: 0
Count: 97000000
Reads matched against barcodes: 23570924
Reads mapped to reference: 16400306
Matched on dictionary: 22075771
Matched on hamming 1: 1495153
Matched on hamming 2: 0
Count: 98000000
Reads matched against barcodes: 23813648
Reads mapped to reference: 16588749
Matched on d

Count: 139000000
Reads matched against barcodes: 33694530
Reads mapped to reference: 23546622
Matched on dictionary: 31543617
Matched on hamming 1: 2150913
Matched on hamming 2: 0
Count: 140000000
Reads matched against barcodes: 33943565
Reads mapped to reference: 23684565
Matched on dictionary: 31778378
Matched on hamming 1: 2165187
Matched on hamming 2: 0
Count: 141000000
Reads matched against barcodes: 34181162
Reads mapped to reference: 23832019
Matched on dictionary: 31997559
Matched on hamming 1: 2183603
Matched on hamming 2: 0
Count: 142000000
Reads matched against barcodes: 34433258
Reads mapped to reference: 24028859
Matched on dictionary: 32235223
Matched on hamming 1: 2198035
Matched on hamming 2: 0
Count: 143000000
Reads matched against barcodes: 34672805
Reads mapped to reference: 24199356
Matched on dictionary: 32459487
Matched on hamming 1: 2213318
Matched on hamming 2: 0
Count: 144000000
Reads matched against barcodes: 34909536
Reads mapped to reference: 24362495
Matche

Count: 185000000
Reads matched against barcodes: 44759600
Reads mapped to reference: 31018557
Matched on dictionary: 41902328
Matched on hamming 1: 2857272
Matched on hamming 2: 0
Count: 186000000
Reads matched against barcodes: 44993279
Reads mapped to reference: 31188163
Matched on dictionary: 42120589
Matched on hamming 1: 2872690
Matched on hamming 2: 0
Count: 187000000
Reads matched against barcodes: 45228402
Reads mapped to reference: 31360509
Matched on dictionary: 42339795
Matched on hamming 1: 2888607
Matched on hamming 2: 0
Count: 188000000
Reads matched against barcodes: 45464354
Reads mapped to reference: 31530341
Matched on dictionary: 42560623
Matched on hamming 1: 2903731
Matched on hamming 2: 0
Count: 189000000
Reads matched against barcodes: 45699570
Reads mapped to reference: 31695183
Matched on dictionary: 42779771
Matched on hamming 1: 2919799
Matched on hamming 2: 0
Count: 190000000
Reads matched against barcodes: 45935287
Reads mapped to reference: 31858964
Matche

Count: 231000000
Reads matched against barcodes: 55679667
Reads mapped to reference: 38492774
Matched on dictionary: 52105198
Matched on hamming 1: 3574469
Matched on hamming 2: 0
Count: 232000000
Reads matched against barcodes: 55919335
Reads mapped to reference: 38651241
Matched on dictionary: 52329661
Matched on hamming 1: 3589674
Matched on hamming 2: 0
Count: 233000000
Reads matched against barcodes: 56162449
Reads mapped to reference: 38799553
Matched on dictionary: 52557656
Matched on hamming 1: 3604793
Matched on hamming 2: 0
Count: 234000000
Reads matched against barcodes: 56408224
Reads mapped to reference: 38938257
Matched on dictionary: 52789894
Matched on hamming 1: 3618330
Matched on hamming 2: 0
Count: 235000000
Reads matched against barcodes: 56655902
Reads mapped to reference: 39053237
Matched on dictionary: 53023785
Matched on hamming 1: 3632117
Matched on hamming 2: 0
Count: 236000000
Reads matched against barcodes: 56892748
Reads mapped to reference: 39190777
Matche

Count: 277000000
Reads matched against barcodes: 66866720
Reads mapped to reference: 45832957
Matched on dictionary: 62570517
Matched on hamming 1: 4296203
Matched on hamming 2: 0
Count: 278000000
Reads matched against barcodes: 67112424
Reads mapped to reference: 46021819
Matched on dictionary: 62799762
Matched on hamming 1: 4312662
Matched on hamming 2: 0
Count: 279000000
Reads matched against barcodes: 67358941
Reads mapped to reference: 46213885
Matched on dictionary: 63029638
Matched on hamming 1: 4329303
Matched on hamming 2: 0
Count: 280000000
Reads matched against barcodes: 67605928
Reads mapped to reference: 46406656
Matched on dictionary: 63260134
Matched on hamming 1: 4345794
Matched on hamming 2: 0
Count: 281000000
Reads matched against barcodes: 67853499
Reads mapped to reference: 46601715
Matched on dictionary: 63491638
Matched on hamming 1: 4361861
Matched on hamming 2: 0
Count: 282000000
Reads matched against barcodes: 68097249
Reads mapped to reference: 46781459
Matche

Count: 323000000
Reads matched against barcodes: 77952430
Reads mapped to reference: 53538480
Matched on dictionary: 72922050
Matched on hamming 1: 5030380
Matched on hamming 2: 0
Count: 324000000
Reads matched against barcodes: 78192583
Reads mapped to reference: 53718413
Matched on dictionary: 73146219
Matched on hamming 1: 5046364
Matched on hamming 2: 0
Count: 325000000
Reads matched against barcodes: 78432244
Reads mapped to reference: 53898119
Matched on dictionary: 73370605
Matched on hamming 1: 5061639
Matched on hamming 2: 0
Count: 326000000
Reads matched against barcodes: 78673222
Reads mapped to reference: 54080616
Matched on dictionary: 73594837
Matched on hamming 1: 5078385
Matched on hamming 2: 0
Count: 327000000
Reads matched against barcodes: 78915312
Reads mapped to reference: 54267251
Matched on dictionary: 73819999
Matched on hamming 1: 5095313
Matched on hamming 2: 0
Count: 328000000
Reads matched against barcodes: 79158194
Reads mapped to reference: 54453491
Matche

Count: 369000000
Reads matched against barcodes: 89019942
Reads mapped to reference: 60940581
Matched on dictionary: 83283866
Matched on hamming 1: 5736076
Matched on hamming 2: 0
Count: 370000000
Reads matched against barcodes: 89255839
Reads mapped to reference: 61105679
Matched on dictionary: 83504601
Matched on hamming 1: 5751238
Matched on hamming 2: 0
Count: 371000000
Reads matched against barcodes: 89490175
Reads mapped to reference: 61268579
Matched on dictionary: 83724363
Matched on hamming 1: 5765812
Matched on hamming 2: 0
Count: 372000000
Reads matched against barcodes: 89727012
Reads mapped to reference: 61435079
Matched on dictionary: 83946133
Matched on hamming 1: 5780879
Matched on hamming 2: 0
Count: 373000000
Reads matched against barcodes: 89963143
Reads mapped to reference: 61600941
Matched on dictionary: 84167353
Matched on hamming 1: 5795790
Matched on hamming 2: 0
Count: 374000000
Reads matched against barcodes: 90204258
Reads mapped to reference: 61769575
Matche

Count: 415000000
Reads matched against barcodes: 100164717
Reads mapped to reference: 68725550
Matched on dictionary: 93744301
Matched on hamming 1: 6420416
Matched on hamming 2: 0
Count: 416000000
Reads matched against barcodes: 100411432
Reads mapped to reference: 68873492
Matched on dictionary: 93977293
Matched on hamming 1: 6434139
Matched on hamming 2: 0
Count: 417000000
Reads matched against barcodes: 100659511
Reads mapped to reference: 69013697
Matched on dictionary: 94210653
Matched on hamming 1: 6448858
Matched on hamming 2: 0
Count: 418000000
Reads matched against barcodes: 100909831
Reads mapped to reference: 69157489
Matched on dictionary: 94448767
Matched on hamming 1: 6461064
Matched on hamming 2: 0
Count: 419000000
Reads matched against barcodes: 101143809
Reads mapped to reference: 69294624
Matched on dictionary: 94663446
Matched on hamming 1: 6480363
Matched on hamming 2: 0
Count: 420000000
Reads matched against barcodes: 101379163
Reads mapped to reference: 69461194


Count: 461000000
Reads matched against barcodes: 111236984
Reads mapped to reference: 76379705
Matched on dictionary: 104137472
Matched on hamming 1: 7099512
Matched on hamming 2: 0
Count: 462000000
Reads matched against barcodes: 111475449
Reads mapped to reference: 76562210
Matched on dictionary: 104360130
Matched on hamming 1: 7115319
Matched on hamming 2: 0
Count: 463000000
Reads matched against barcodes: 111715050
Reads mapped to reference: 76744904
Matched on dictionary: 104583913
Matched on hamming 1: 7131137
Matched on hamming 2: 0
Count: 464000000
Reads matched against barcodes: 111954418
Reads mapped to reference: 76923427
Matched on dictionary: 104807599
Matched on hamming 1: 7146819
Matched on hamming 2: 0
Count: 465000000
Reads matched against barcodes: 112186074
Reads mapped to reference: 77093117
Matched on dictionary: 105024934
Matched on hamming 1: 7161140
Matched on hamming 2: 0
Count: 466000000
Reads matched against barcodes: 112415722
Reads mapped to reference: 7726

In [51]:
# Close the output file and the three FASTQ handles used by the
# read-processing cell; closing an already closed file has no effect.
for handle in (out, f[0], f[1], i1) :
    handle.close()

In [72]:

# Spot check: show the library sequence stored for one barcode ...
example_barcode = 'CTGGTAACGACCTTGCTGCA'
print(dna_sequence_map[example_barcode])

# ... and the length of one example sequence string.
example_read = 'CATTTGTCGCATCCAGAGATGGAAAGGAGGAGGCGTCTGC'
print(len(example_read))


CATTACTCGCATCCAGAGATGGAAAGGAGGAGGCCTCACCAAAAAATCAAAGTGACCATAGACGAATAGGAAATCAGAGAGTTGGCCAGCCAATTAAGCCCCTAAACTCACCGGCCTCCCCTCCCAAAAAAATTGATGACTCACGACGCGAACTCAAGTCTTTTTGTTCAT
40
125


In [None]:

CATTACTCGCATCCAGAGATGGAAAGGAGGAGGCCTCACCAAAAAATCAAAGTGACCATAGACGAATAGGAAATCAGAGAGTTGGCCAGCCAATTAAGCCCCTAAACTCACCGGCCTCCCC
CATTTGTCGCATCCAGAGATGGAAAGGAGGAGGCGTCTGC

39

#TODO: also write the alignment start position to the output file, then re-run the 10MIL test





