In [10]:
import pandas as pd
import numpy as np
import os
import regex as re
from collections import Counter, defaultdict
import sys

CONST_A = 0
CONST_C = 1
CONST_G = 2
CONST_T = 3

CONST_NT_MAP = ['A', 'C', 'G', 'T']

def reverse_complement(seq) :
    rc_seq = ''
    for i in range(0, len(seq)) :
        if seq[i] == 'A' :
            rc_seq = 'T' + rc_seq
        elif seq[i] == 'C' :
            rc_seq = 'G' + rc_seq
        elif seq[i] == 'G' :
            rc_seq = 'C' + rc_seq
        elif seq[i] == 'T' :
            rc_seq = 'A' + rc_seq
    return rc_seq

def remove_duplicates_round(df,hamm_thres=4,merge_counts=False):
    seqs = list(df.Seq.values)
    counts = list(df.Counts.values)
    c = 0
    while c<(len(counts)-1):
        if(distance(seqs[c],seqs[c+1]))<hamm_thres:
            if(counts[c]>counts[c+1]):
                if(merge_counts):
                    counts[c]+=counts[c+1]
                del counts[c+1],seqs[c+1]
            else:
                if(merge_counts):
                    counts[c+1]+=counts[c]
                del counts[c],seqs[c]
        else:
            c+=1
    return pd.DataFrame({'Seq':seqs,'Counts':counts})

def remove_all_duplicates(sequences,counts,hamming_thresh=4,merge_counts=False):
    df = pd.DataFrame({'Seq':sequences,'Counts':counts})
    seq_len = len(sequences[0])
    
    print('Removing hamming neighbors on dimension:')
    
    for i in range(seq_len):
        df = df.ix[(df.Seq.str.slice(seq_len-i)+df.Seq.str.slice(i)).sort_values().index]
        df = remove_duplicates_round(df,hamm_thres=hamming_thresh,merge_counts=merge_counts)
        print(i)
    return df

def distance(astring, bstring) :
    distance = 0
    
    limit = len(astring)
    diff = len(bstring) - len(astring)
    if len(bstring) < len(astring) :
        limit = len(bstring)
        diff = len(astring) - len(bstring)
    
    for i in range(limit) :
        if astring[i] != bstring[i] :
            distance += 1
    return distance + diff

def increment_bp_map(seq, bp_map, magnitude=1) :
    for i in range(0, len(seq)) :
        if seq[i] == 'A' :
            bp_map[i][CONST_A] += magnitude
        elif seq[i] == 'C' :
            bp_map[i][CONST_C] += magnitude
        elif seq[i] == 'G' :
            bp_map[i][CONST_G] += magnitude
        elif seq[i] == 'T' :
            bp_map[i][CONST_T] += magnitude
    return bp_map

def get_consensus_sequence(bp_map) :
    seq = ''
    for i in range(0, len(bp_map)) :
        max_count = 0
        max_j = 0
        for j in range(0, 4) :
            if bp_map[i][j] > max_count :
                max_count = bp_map[i][j]
                max_j = j
        seq += CONST_NT_MAP[max_j]
    return seq

def get_hamming_neighbor_1(seq, seq_map, start_r, end_r) :
    for i in range(start_r, end_r) :
        for base1 in CONST_NT_MAP :
            mut_seq = seq[:i] + base1 + seq[i+1:]
            if mut_seq in seq_map :
                return mut_seq
    return None

def get_hamming_neighbor_2(seq, seq_map, start_r, end_r) :
    for i in range(start_r, end_r) :
        for j in range(i + 1, end_r) :
            for base1 in CONST_NT_MAP :
                for base2 in CONST_NT_MAP :
                    mut_seq = seq[:i] + base1 + seq[i+1:j] + base2 + seq[j+1:]
                    if mut_seq in seq_map :
                        return mut_seq
    return None


In [3]:
r1 = 'r1_dna.fq'
r2 = 'r2_dna.fq'

In [4]:
tag_one = re.compile(r"(CATTACTCGCATCCA){s<=1}")
tag_two = re.compile(r"(CAGCCAATTAAGCC){s<=1}")

In [14]:
f = {}
f[0] = open(r1,'r')
f[1] = open(r2,'r')

head, seq, pr, q = ({} for i in range(4))
count = 0

dna_count_map = {}
dna_seq_map = {}

n_passed = 0
n_failed = 0

dna_bp_map = {}

matched_on_dict_count = 0
matched_on_hamming1_count = 0
matched_on_hamming2_count = 0

while True:
    for i in range(2):
        head[i] = f[i].readline()[:-1]
        seq[i] = f[i].readline()[:-1]
        pr[i] = f[i].readline()[:-1]
        q[i] = f[i].readline()[:-1]
    if len(seq[0]) == 0:
        break # End of File
    
    scan_one_r1   = re.search(tag_one, seq[0][20:35])
    scan_two_r1   = re.search(tag_two, seq[0][106:120])
    scan_one_r2rc = re.search(tag_one, seq[1][29:44])
    scan_two_r2rc = re.search(tag_two, seq[1][115:129])
    
    rc_seq = reverse_complement(seq[1])
    if scan_one_r1 != None and scan_two_r1 != None and distance(seq[0][:191], rc_seq[9:200]) <= 1:
        n_passed += 1
        
        sequence = seq[0][20:191]
        barcode = seq[0][:20]
        
        new_member = True
        barcode_key = barcode
        if barcode in dna_count_map :
            new_member = False
            matched_on_dict_count += 1
        else :
            barcode_h1 = get_hamming_neighbor_1(barcode, dna_count_map, 0, 20)
            if barcode_h1 != None :
                new_member = False
                barcode_key = barcode_h1
                matched_on_hamming1_count += 1
            '''else :
                barcode_h2 = get_hamming_neighbor_2(barcode, dna_count_map, 0, 20)
                if barcode_h2 != None :
                    new_member = False
                    barcode_key = barcode_h2
                    matched_on_hamming2_count += 1'''
        
        if new_member == True :
            dna_count_map[barcode_key] = 0
            dna_seq_map[barcode_key] = {}
                    
        dna_count_map[barcode_key] += 1
        
        if sequence not in dna_seq_map[barcode_key] :
            dna_seq_map[barcode_key][sequence] = 1
        else :
            dna_seq_map[barcode_key][sequence] += 1
    else :
        n_failed += 1
            
    if (count % 100000) == 0:
        print(count)
        print('Num dna reads passed: ' + str(n_passed))
        print('Num dna reads failed: ' + str(n_failed))
        print('Num unique barcodes: ' + str(len(dna_seq_map)))
        
        print('Matched on dictionary: ' + str(matched_on_dict_count))
        print('Matched on hamming 1: ' + str(matched_on_hamming1_count))
        print('Matched on hamming 2: ' + str(matched_on_hamming2_count))
    count += 1

print('COMPLETE')
print('Num dna reads passed: ' + str(n_passed))
print('Num dna reads failed: ' + str(n_failed))
print('Num unique barcodes: ' + str(len(dna_seq_map)))
        
print('Matched on dictionary: ' + str(matched_on_dict_count))
print('Matched on hamming 1: ' + str(matched_on_hamming1_count))
print('Matched on hamming 2: ' + str(matched_on_hamming2_count))

f[0].close()
f[1].close()

0
Num dna reads passed: 0
Num dna reads failed: 1
Num unique barcodes: 0
Matched on dictionary: 0
Matched on hamming 1: 0
Matched on hamming 2: 0
100000
Num dna reads passed: 51009
Num dna reads failed: 48992
Num unique barcodes: 50380
Matched on dictionary: 611
Matched on hamming 1: 18
Matched on hamming 2: 0
200000
Num dna reads passed: 111190
Num dna reads failed: 88811
Num unique barcodes: 108151
Matched on dictionary: 2969
Matched on hamming 1: 70
Matched on hamming 2: 0
300000
Num dna reads passed: 175943
Num dna reads failed: 124058
Num unique barcodes: 168529
Matched on dictionary: 7235
Matched on hamming 1: 179
Matched on hamming 2: 0
400000
Num dna reads passed: 241904
Num dna reads failed: 158097
Num unique barcodes: 228256
Matched on dictionary: 13296
Matched on hamming 1: 352
Matched on hamming 2: 0
500000
Num dna reads passed: 308247
Num dna reads failed: 191754
Num unique barcodes: 286447
Matched on dictionary: 21249
Matched on hamming 1: 551
Matched on hamming 2: 0
6000

In [15]:
seq_map = {}

for barcode_key in dna_seq_map :
    seq_bp_map = []
    
    for i in range(0, 171) :
        seq_bp_map.append([])
        for j in range(0, 4) :
            seq_bp_map[i].append(0)
    
    seq_list = list(dna_seq_map[barcode_key].keys())
    
    if dna_count_map[barcode_key] > 2 :
        for sequence in dna_seq_map[barcode_key] :
            seq_bp_map = increment_bp_map(sequence, seq_bp_map, magnitude=dna_seq_map[barcode_key][sequence])
        seq_map[barcode_key] = get_consensus_sequence(seq_bp_map)
    else :
        seq_map[barcode_key] = seq_list[0]


In [17]:
dna_barcode_list = []
dna_seq_list = []
dna_count_list = []

for barcode_key in dna_count_map :
    if dna_count_map[barcode_key] >= 2 :
        dna_barcode_list.append(barcode_key)
        dna_seq_list.append(seq_map[barcode_key])
        dna_count_list.append(dna_count_map[barcode_key])

df = pd.DataFrame({'barcode':    dna_barcode_list,
                   'sequence':   dna_seq_list,
                   'read_count': dna_count_list})

df = df.sort_values('read_count')

print(len(df))

new_columns = ['barcode', 'sequence', 'read_count']
df.to_csv('apa_sym_prx_dna_20160825.csv', sep=',', header=True, columns=new_columns, index=False)

2484516


In [19]:
dna_barcode_key_list = list(dna_count_map.keys())
dna_barcode_key_count_list = []
for barcode_key in dna_barcode_key_list :
    dna_barcode_key_count_list.append(dna_count_map[barcode_key])

print(len(dna_barcode_key_list))
print(len(dna_count_list))

hamming_thresh = 4

filtered_dna_df = remove_all_duplicates(dna_barcode_key_list, dna_barcode_key_count_list, hamming_thresh, merge_counts=False)

hamming_barcode_list = list(filtered_dna_df.Seq.values)
hamming_count_list = list(filtered_dna_df.Counts.values)

print(len(hamming_barcode_list))
print(len(hamming_count_list))
print('{:,}'.format(len(hamming_barcode_list)) + ' sequences with levenshtein d >= ' + str(hamming_thresh))


2917120
2484516
Removing hamming neighbors on dimension:
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
2845195
2845195
2,845,195 sequences with levenshtein d >= 4


In [20]:
filtered_barcode_list = []
filtered_seq_list = []
filtered_count_list = []

for barcode_key in hamming_barcode_list :
    if dna_count_map[barcode_key] >= 2 :
        filtered_barcode_list.append(barcode_key)
        filtered_seq_list.append(seq_map[barcode_key])
        filtered_count_list.append(dna_count_map[barcode_key])

df = pd.DataFrame({'barcode':    filtered_barcode_list,
                   'sequence':   filtered_seq_list,
                   'read_count': filtered_count_list})

df = df.sort_values('read_count')

print(len(df))

new_columns = ['barcode', 'sequence', 'read_count']
df.to_csv('apa_sym_prx_dna_hamming_20160825.csv', sep=',', header=True, columns=new_columns, index=False)

2434388
