In [1]:
import pandas as pd
import numpy as np
import os
import regex as re
from collections import Counter, defaultdict
import sys

# Integer codes for the four nucleotides; each code is also the position of
# the matching letter inside CONST_NT_MAP.
CONST_A, CONST_C, CONST_G, CONST_T = 0, 1, 2, 3

# Reverse lookup: nucleotide code -> nucleotide letter.
CONST_NT_MAP = list('ACGT')

def remove_duplicates_round(df,hamm_thres=4,merge_counts=False):
    """Collapse adjacent near-duplicate sequences in one pass over `df`.

    Rows are scanned in their current order. The currently retained row is
    compared with the next row using `distance`; when the two are closer than
    `hamm_thres`, only one of them survives:

    * the retained row, if its count is strictly larger than the next row's;
    * otherwise (including ties) the next row.

    The survivor is then compared against the following row, so a run of
    mutually close neighbors collapses to a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose `Seq` and `Counts` columns.
    hamm_thres : int
        Two neighbors are duplicates when `distance(a, b) < hamm_thres`.
    merge_counts : bool
        When True the survivor's count becomes the sum of both counts;
        when False the discarded row's count is dropped.

    Returns
    -------
    pandas.DataFrame
        New frame with columns 'Seq' and 'Counts' and a fresh default index.
    """
    seqs = list(df.Seq.values)
    counts = list(df.Counts.values)

    # Build the output incrementally instead of deleting from the middle of
    # the input lists: each `del` on a list is linear, which made a single
    # round quadratic in the worst case.
    kept_seqs = []
    kept_counts = []

    if seqs:
        cur_seq, cur_count = seqs[0], counts[0]
        for idx in range(1, len(seqs)):
            nxt_seq, nxt_count = seqs[idx], counts[idx]
            if distance(cur_seq, nxt_seq) < hamm_thres:
                if cur_count > nxt_count:
                    # Retained row wins; optionally absorb the loser's reads.
                    if merge_counts:
                        cur_count += nxt_count
                else:
                    # Next row wins (ties included) and becomes the new
                    # retained row for the following comparison.
                    if merge_counts:
                        nxt_count += cur_count
                    cur_seq, cur_count = nxt_seq, nxt_count
            else:
                # Not a duplicate: the retained row is final, move on.
                kept_seqs.append(cur_seq)
                kept_counts.append(cur_count)
                cur_seq, cur_count = nxt_seq, nxt_count
        kept_seqs.append(cur_seq)
        kept_counts.append(cur_count)

    return pd.DataFrame({'Seq':kept_seqs,'Counts':kept_counts})

def remove_all_duplicates(sequences,counts,hamming_thresh=4,merge_counts=False):
    """Repeatedly sort the sequences and drop near-duplicate neighbors.

    For every offset `i` in `range(len(sequences[0]))` the rows are reordered
    by a string key derived from each sequence, and
    `remove_duplicates_round` is applied to the reordered frame. Only rows
    that end up adjacent in at least one of these orderings are ever
    compared, so this is a heuristic filter rather than an all-pairs check.

    Parameters
    ----------
    sequences : sequence of str
        Sequences to filter; the length of the first one sets the number of
        rounds.
    counts : sequence
        Read count for each entry of `sequences`, in the same order.
    hamming_thresh : int
        Forwarded to `remove_duplicates_round` as `hamm_thres`.
    merge_counts : bool
        Forwarded to `remove_duplicates_round`.

    Returns
    -------
    pandas.DataFrame
        Frame with columns 'Seq' and 'Counts' holding the surviving rows.
        Empty input yields an empty frame.
    """
    df = pd.DataFrame({'Seq':sequences,'Counts':counts})

    # Nothing to compare; also avoids indexing sequences[0] on empty input.
    if len(sequences) == 0:
        return df

    seq_len = len(sequences[0])

    print('Removing hamming neighbors on dimension:')

    for i in range(seq_len):
        # NOTE(review): the key is (last i characters) + (suffix starting at i),
        # which is not a plain rotation of the sequence -- confirm this is the
        # intended ordering. Kept unchanged to preserve results.
        sort_key = df.Seq.str.slice(seq_len-i)+df.Seq.str.slice(i)
        # `.ix` was removed from pandas; `.loc` is equivalent here because the
        # frame always carries a default integer index at this point.
        df = df.loc[sort_key.sort_values().index]
        df = remove_duplicates_round(df,hamm_thres=hamming_thresh,merge_counts=merge_counts)
        print(i)
    return df

def distance(astring, bstring):
    """Return the number of mismatching positions between two strings.

    Characters are compared position by position over the shorter of the two
    lengths; every position by which the longer string exceeds the shorter
    one counts as one additional mismatch.
    """
    # zip stops at the shorter input, i.e. the overlapping prefix.
    mismatches = sum(1 for a_char, b_char in zip(astring, bstring) if a_char != b_char)
    length_gap = abs(len(bstring) - len(astring))
    return mismatches + length_gap





In [2]:
# Load the DNA and distal-RNA tables.
dna_file = pd.read_csv('apa_nextseq_v2_dna_filtered_20160922.csv', sep=',')
distal_rna_file = pd.read_csv('apa_nextseq_v2_distal_rna_lessfiltered_20160922.csv', sep=',')

# Pull each DNA column out as a plain Python list (one entry per row).
dna_upstream_list = dna_file['upstream_seq'].tolist()
dna_downstream_list = dna_file['downstream_seq'].tolist()
dna_pas_list = dna_file['pas_seq'].tolist()
dna_seq_list = dna_file['seq'].tolist()
dna_library_list = dna_file['library'].tolist()
dna_count_list = dna_file['read_count'].tolist()
dna_downstream_count_list = dna_file['unique_downstream_seq_count'].tolist()
dna_pas_count_list = dna_file['unique_pas_seq_count'].tolist()

# Same columns for the distal-RNA table.
distal_rna_upstream_list = distal_rna_file['upstream_seq'].tolist()
distal_rna_downstream_list = distal_rna_file['downstream_seq'].tolist()
distal_rna_pas_list = distal_rna_file['pas_seq'].tolist()
distal_rna_seq_list = distal_rna_file['seq'].tolist()
distal_rna_library_list = distal_rna_file['library'].tolist()
distal_rna_count_list = distal_rna_file['read_count'].tolist()
distal_rna_downstream_count_list = distal_rna_file['unique_downstream_seq_count'].tolist()
distal_rna_pas_count_list = distal_rna_file['unique_pas_seq_count'].tolist()

# Lookup tables keyed by upstream sequence. All of them receive the same keys
# in the same order.
dna_upstream_map = {}
dna_downstream_map = {}
dna_pas_map = {}
dna_seq_map = {}
dna_library_map = {}
dna_count_map = {}
dna_downstream_count_map = {}
dna_pas_count_map = {}
dna_or_distal_rna_map = {}  # 'dna' or 'distal_rna': which table the entry came from


def _register_library_member(upstream, downstream, pas, seq, library,
                             read_count, downstream_count, pas_count, source):
    """Store one table row in every lookup map under its upstream sequence."""
    dna_upstream_map[upstream] = upstream
    dna_downstream_map[upstream] = downstream
    dna_pas_map[upstream] = pas
    dna_seq_map[upstream] = seq
    dna_library_map[upstream] = int(library)
    dna_count_map[upstream] = int(read_count)
    dna_downstream_count_map[upstream] = int(downstream_count)
    dna_pas_count_map[upstream] = int(pas_count)
    dna_or_distal_rna_map[upstream] = source


# DNA rows are registered unconditionally; a repeated upstream sequence
# overwrites the earlier row.
for dna_row in zip(dna_upstream_list, dna_downstream_list, dna_pas_list, dna_seq_list,
                   dna_library_list, dna_count_list, dna_downstream_count_list,
                   dna_pas_count_list):
    _register_library_member(*dna_row, source='dna')

# Distal-RNA rows only fill in upstream sequences that are not registered yet,
# so DNA rows (and earlier RNA rows) take precedence.
for rna_row in zip(distal_rna_upstream_list, distal_rna_downstream_list, distal_rna_pas_list,
                   distal_rna_seq_list, distal_rna_library_list, distal_rna_count_list,
                   distal_rna_downstream_count_list, distal_rna_pas_count_list):
    if rna_row[0] not in dna_upstream_map:
        _register_library_member(*rna_row, source='distal_rna')

In [3]:
# Collect every registered upstream sequence together with its read count,
# in matching order, as input for the neighbor filter.
dna_upstream_key_list = list(dna_count_map.keys())
dna_count_list = [dna_count_map[upstream] for upstream in dna_upstream_key_list]

print(len(dna_upstream_key_list))

# Neighbors closer than this are treated as duplicates.
hamming_thresh = 5

filtered_dna_df = remove_all_duplicates(dna_upstream_key_list, dna_count_list, hamming_thresh, merge_counts=False)

hamming_upstream_list = list(filtered_dna_df.Seq.values)
hamming_count_list = list(filtered_dna_df.Counts.values)

print(len(hamming_upstream_list))
print(len(hamming_count_list))
# The metric used by the filter is a positional mismatch count (Hamming),
# not Levenshtein edit distance, so the summary is labelled accordingly.
print('{:,}'.format(len(hamming_upstream_list)) + ' sequences with hamming d >= ' + str(hamming_thresh))


1179765
Removing hamming neighbors on dimension:
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
1134457
1134457
1,134,457 sequences with levenshtein d >= 5


In [4]:
def _passes_read_filter(upstream):
    """Return True when `upstream` meets the read-count cutoff for its source table."""
    source = dna_or_distal_rna_map[upstream]
    reads = dna_count_map[upstream]
    downstream_reads = dna_downstream_count_map[upstream]
    if source == 'dna':
        return reads > 2 or (reads == 2 and downstream_reads == 1)
    if source == 'distal_rna':
        return reads > 5 or (reads > 4 and downstream_reads == 2)
    return False


# Upstream sequences that survived the neighbor filter and pass the cutoff.
kept_upstream_keys = [upstream for upstream in hamming_upstream_list if _passes_read_filter(upstream)]

filtered_upstream_list = [dna_upstream_map[key] for key in kept_upstream_keys]
filtered_downstream_list = [dna_downstream_map[key] for key in kept_upstream_keys]
filtered_pas_list = [dna_pas_map[key] for key in kept_upstream_keys]
filtered_seq_list = [dna_seq_map[key] for key in kept_upstream_keys]
filtered_count_list = [dna_count_map[key] for key in kept_upstream_keys]
filtered_lib_list = [dna_library_map[key] for key in kept_upstream_keys]

print(len(filtered_upstream_list))

df = pd.DataFrame({
    'upstream_seq': filtered_upstream_list,
    'pas_seq': filtered_pas_list,
    'downstream_seq': filtered_downstream_list,
    'seq': filtered_seq_list,
    'library': filtered_lib_list,
    'read_count': filtered_count_list,
})

# Ascending by read count before writing.
df = df.sort_values('read_count')

new_columns = ['upstream_seq', 'pas_seq', 'downstream_seq', 'seq', 'library', 'read_count']
df.to_csv('apa_nextseq_v2_dna_merged_20160922.csv', sep=',', header=True, columns=new_columns, index=False)


# Tally how many retained members belong to each library index (six slots).
lib_summary = [0] * 6
for lib in filtered_lib_list:
    lib_summary[lib] += 1

for lib_index, member_count in enumerate(lib_summary):
    print('Member count for library ' + str(lib_index) + ': ' + str(member_count))

1102005
Member count for library 0: 226063
Member count for library 1: 97691
Member count for library 2: 254088
Member count for library 3: 214623
Member count for library 4: 67313
Member count for library 5: 242227
