In [4]:
import pandas as pd
import numpy as np
import os
#import re
import regex as re
from collections import Counter, defaultdict
import sys
import pickle

CONST_A = 0
CONST_C = 1
CONST_G = 2
CONST_T = 3

CONST_NT_MAP = ['A', 'C', 'G', 'T']

def reverse_complement(seq) :
    rc_seq = ''
    for i in range(0, len(seq)) :
        if seq[i] == 'A' :
            rc_seq = 'T' + rc_seq
        elif seq[i] == 'C' :
            rc_seq = 'G' + rc_seq
        elif seq[i] == 'G' :
            rc_seq = 'C' + rc_seq
        elif seq[i] == 'T' :
            rc_seq = 'A' + rc_seq
    return rc_seq

def distance(astring, bstring) :
    distance = 0
    
    limit = len(astring)
    diff = len(bstring) - len(astring)
    if len(bstring) < len(astring) :
        limit = len(bstring)
        diff = len(astring) - len(bstring)
    
    for i in range(limit) :
        if astring[i] != bstring[i] :
            distance += 1
    return distance + diff

def increment_bp_map(seq, bp_map, magnitude=1) :
    for i in range(0, len(seq)) :
        if seq[i] == 'A' :
            bp_map[i][CONST_A] += magnitude
        elif seq[i] == 'C' :
            bp_map[i][CONST_C] += magnitude
        elif seq[i] == 'G' :
            bp_map[i][CONST_G] += magnitude
        elif seq[i] == 'T' :
            bp_map[i][CONST_T] += magnitude
    return bp_map

def get_consensus_sequence(bp_map) :
    seq = ''
    for i in range(0, len(bp_map)) :
        max_count = 0
        max_j = 0
        for j in range(0, 4) :
            if bp_map[i][j] > max_count :
                max_count = bp_map[i][j]
                max_j = j
        seq += CONST_NT_MAP[max_j]
    return seq


In [5]:
r1 = 'r1_dna.fq'
r2 = 'r2_dna.fq'

In [6]:
tag_one = re.compile(r"(CATTACTCGCATCCA){s<=2}")#s<=1
tag_two = re.compile(r"(CAGCCAATTAAGCC){s<=2}")#s<=1

In [7]:
f = {}
f[0] = open(r1,'r')
f[1] = open(r2,'r')

head, seq, pr, q = ({} for i in range(4))
count = 0

n_passed = 0
n_failed = 0

f_out = open('dna_doubledope_raw_barcode_reads.csv', 'w')

barcode_sequence_count_dict = {}

while True:
    for i in range(2):
        head[i] = f[i].readline().rstrip()
        seq[i] = f[i].readline().rstrip()
        pr[i] = f[i].readline().rstrip()
        q[i] = f[i].readline().rstrip()
    if len(seq[0]) == 0:
        break # End of File
    
    scan_one_r1   = re.search(tag_one, seq[0][20:35])
    scan_two_r1   = re.search(tag_two, seq[0][106:120])
    #scan_one_r2rc = re.search(tag_one, rc_seq[29:44])
    #scan_two_r2rc = re.search(tag_two, rc_seq[115:129])
    
    rc_seq = reverse_complement(seq[1])
    if scan_one_r1 is not None and scan_two_r1 is not None and distance(seq[0][:191], rc_seq[9:200]) <= 2 :#<=1
        sequence = seq[0][20:191]
        barcode = seq[0][:20]
        
        f_out.write(barcode + ',' + str(n_passed) + '\n')
        
        if barcode not in barcode_sequence_count_dict :
            barcode_sequence_count_dict[barcode] = {}
        
        if sequence not in barcode_sequence_count_dict[barcode] :
            barcode_sequence_count_dict[barcode][sequence] = 0
        
        barcode_sequence_count_dict[barcode][sequence] += 1
        
        n_passed += 1
    else :
        n_failed += 1
            
    if (count % 1000000) == 0 :
        print(count)
        print('Num dna reads passed: ' + str(n_passed))
        print('Num dna reads failed: ' + str(n_failed))
    count += 1

print('COMPLETE')
print('Num dna reads passed: ' + str(n_passed))
print('Num dna reads failed: ' + str(n_failed))

pickle.dump(barcode_sequence_count_dict, open('barcode_sequence_count_dict.pickle', 'wb'))

f[0].close()
f[1].close()
f_out.close()

0
Num dna reads passed: 0
Num dna reads failed: 1
1000000
Num dna reads passed: 644591
Num dna reads failed: 355410
2000000
Num dna reads passed: 1302884
Num dna reads failed: 697117
3000000
Num dna reads passed: 1950318
Num dna reads failed: 1049683
4000000
Num dna reads passed: 2575455
Num dna reads failed: 1424546
5000000
Num dna reads passed: 3184032
Num dna reads failed: 1815969
6000000
Num dna reads passed: 3826062
Num dna reads failed: 2173939
7000000
Num dna reads passed: 4473896
Num dna reads failed: 2526105
8000000
Num dna reads passed: 5099515
Num dna reads failed: 2900486
9000000
Num dna reads passed: 5729395
Num dna reads failed: 3270606
10000000
Num dna reads passed: 6367827
Num dna reads failed: 3632174
11000000
Num dna reads passed: 7014495
Num dna reads failed: 3985506
12000000
Num dna reads passed: 7656587
Num dna reads failed: 4343414
13000000
Num dna reads passed: 8296389
Num dna reads failed: 4703612
14000000
Num dna reads passed: 8933237
Num dna reads failed: 5066

In [None]:

#barcode_sequence_count_dict = pickle.load(open('barcode_sequence_count_dict.pickle', 'rb'))


In [None]:
#Bartender command
#bartender_single_com -f dna_doubledope_raw_barcode_reads.csv -o doubledope_hamming2 -c 2 -d 2 -z 5 -t 4
#2551978 unique barcodes

In [20]:
#Read and map Bartender clustered barcodes, accumulate consensus sequences

bartender_prefix = 'doubledope_hamming2'

cluster_dict = pd.read_csv(bartender_prefix + '_cluster.csv', delimiter=',').set_index('Cluster.ID').to_dict(orient='index')
print(len(cluster_dict))

barcode_dict = pd.read_csv(bartender_prefix + '_barcode.csv', delimiter=',').set_index('Unique.reads').to_dict(orient='index')
print(len(barcode_dict))

2551978
2782080


In [11]:
cluster_sequence_count_dict = {}

i = 0
for raw_barcode in barcode_dict :
    if i % 500000 == 0 :
        print('Processing barcode ' + str(i))
    
    cluster_id = barcode_dict[raw_barcode]['Cluster.ID']
    cluster = cluster_dict[cluster_id]
    clustered_barcode = cluster['Center']
    
    raw_sequence_dict = barcode_sequence_count_dict[raw_barcode]
    
    if clustered_barcode not in cluster_sequence_count_dict :
        cluster_sequence_count_dict[clustered_barcode] = {}
    
    for raw_sequence in raw_sequence_dict :
        if raw_sequence not in cluster_sequence_count_dict[clustered_barcode] :
            cluster_sequence_count_dict[clustered_barcode][raw_sequence] = 0
        
        cluster_sequence_count_dict[clustered_barcode][raw_sequence] += raw_sequence_dict[raw_sequence]

    i += 1
    

2551978
2782080
Processing barcode 0
Processing barcode 500000
Processing barcode 1000000
Processing barcode 1500000
Processing barcode 2000000
Processing barcode 2500000


In [12]:
pickle.dump(cluster_sequence_count_dict, open('cluster_sequence_count_dict.pickle', 'wb'))

In [13]:
barcode_sequence_count_dict = None
cluster_dict = None
barcode_dict = None

In [16]:
#Basepair vote consensus sequences

seq_dict = {}
count_dict = {}

i = 0
for barcode in cluster_sequence_count_dict :
    if i % 500000 == 0 :
        print('Processing barcode ' + str(i))
    
    sequence_count_dict = cluster_sequence_count_dict[barcode]
    
    bp_map = np.zeros((171, 4))
    
    total_count = 0
    
    for sequence in sequence_count_dict :
        sequence_count = sequence_count_dict[sequence]
        for j in range(0, len(sequence)) :
            if sequence[j] == 'A' :
                bp_map[j, 0] += sequence_count
            elif sequence[j] == 'C' :
                bp_map[j, 1] += sequence_count
            elif sequence[j] == 'G' :
                bp_map[j, 2] += sequence_count
            elif sequence[j] == 'T' :
                bp_map[j, 3] += sequence_count
        
        total_count += sequence_count
    
    consensus_sequence = ''
    for j in range(0, 171) :
        max_i = int(np.argmax(bp_map[j, :]))
        
        if max_i == 0 :
            consensus_sequence += 'A'
        elif max_i == 1 :
            consensus_sequence += 'C'
        elif max_i == 2 :
            consensus_sequence += 'G'
        elif max_i == 3 :
            consensus_sequence += 'T'
    
    seq_dict[barcode] = consensus_sequence
    count_dict[barcode] = total_count
    
    i += 1

Processing barcode 0
Processing barcode 500000
Processing barcode 1000000
Processing barcode 1500000
Processing barcode 2000000
Processing barcode 2500000
165200


In [18]:
print(len(seq_dict))

2551978


In [19]:
dna_barcode_list = []
dna_seq_list = []
dna_count_list = []

for barcode in seq_dict :
    dna_barcode_list.append(barcode)
    dna_seq_list.append(seq_dict[barcode])
    dna_count_list.append(count_dict[barcode])

df = pd.DataFrame({'barcode':    dna_barcode_list,
                   'sequence':   dna_seq_list,
                   'read_count': dna_count_list})

df = df.sort_values(by='read_count', ascending=False)

print(len(df))

new_columns = ['barcode', 'sequence', 'read_count']

df.to_csv('doubledope_dna_hamming2.csv', sep=',', header=True, columns=new_columns, index=False)

2551978
