In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter, defaultdict
import sys
import matplotlib.pyplot as plt
from pybloom import BloomFilter

# Integer codes for the four nucleotides; each code is also the nucleotide's
# position in CONST_NT_MAP, so CONST_NT_MAP[CONST_G] == 'G'.
CONST_A, CONST_C, CONST_G, CONST_T = range(4)

CONST_NT_MAP = list('ACGT')

def reverse_complement(seq) :
    """Return the reverse complement of an upper-case DNA sequence.

    Parameters
    ----------
    seq : str
        Sequence over the alphabet A/C/G/T.

    Returns
    -------
    str
        The complemented sequence in reverse order. Characters other than
        'A', 'C', 'G' and 'T' (e.g. 'N' or lower-case letters) are skipped,
        so the result can be shorter than the input.
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # A single join is linear in len(seq); repeatedly prepending to a string
    # copies the partial result on every step and is quadratic.
    return ''.join(complement[nt] for nt in reversed(seq) if nt in complement)

def remove_duplicates_round(df,hamm_thres=4,merge_counts=False):
    """Collapse runs of neighbouring rows whose sequences are near-identical.

    Rows are scanned in their current order. Each row is compared (via
    `distance`) with the most recently kept row; when the distance is below
    `hamm_thres` only one of the two survives: the kept row if its count is
    strictly larger, otherwise the incoming row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `Seq` column and a `Counts` column.
    hamm_thres : int
        Two sequences are treated as duplicates when distance < hamm_thres.
    merge_counts : bool
        When True the count of the discarded row is added to the survivor.

    Returns
    -------
    pandas.DataFrame
        New frame with columns `Seq` and `Counts` and a fresh default index.
    """
    kept_seqs, kept_counts = [], []
    for seq, count in zip(df.Seq.values, df.Counts.values):
        if kept_seqs and distance(kept_seqs[-1], seq) < hamm_thres:
            if kept_counts[-1] > count:
                # The current representative wins; the incoming row is dropped.
                if merge_counts:
                    kept_counts[-1] += count
            else:
                # The incoming row wins (ties included) and replaces the representative.
                if merge_counts:
                    count += kept_counts[-1]
                kept_seqs[-1], kept_counts[-1] = seq, count
        else:
            kept_seqs.append(seq)
            kept_counts.append(count)
    return pd.DataFrame({'Seq':kept_seqs,'Counts':kept_counts})

def remove_all_duplicates(sequences,counts,hamming_thresh=4,merge_counts=False):
    """Repeatedly sort the library and collapse near-identical neighbours.

    One round is run per sequence position `i`. In round `i` the rows are
    ordered by the key `seq[seq_len-i:] + seq[i:]` (the last `i` characters
    followed by everything from position `i` on) and then passed to
    `remove_duplicates_round`. Progress is printed after every round.

    Parameters
    ----------
    sequences : sequence of str
        Library sequences; the length of the first one sets the number of rounds.
    counts : sequence
        Read counts aligned with `sequences`.
    hamming_thresh : int
        Forwarded to `remove_duplicates_round` as `hamm_thres`.
    merge_counts : bool
        Forwarded to `remove_duplicates_round`.

    Returns
    -------
    pandas.DataFrame
        Frame with columns `Seq` and `Counts` holding the surviving rows.
        An empty input yields an empty frame.
    """
    df = pd.DataFrame({'Seq':sequences,'Counts':counts})
    if len(df) == 0:
        # Nothing to deduplicate, and sequences[0] below would raise IndexError.
        return df
    seq_len = len(sequences[0])

    print('Removing hamming neighbors on dimension:')

    for i in range(seq_len):
        sort_key = df.Seq.str.slice(seq_len-i)+df.Seq.str.slice(i)
        # .loc does the label-based reorder that the removed .ix accessor did
        # here (the frame always carries a default integer index).
        df = df.loc[sort_key.sort_values().index]
        df = remove_duplicates_round(df,hamm_thres=hamming_thresh,merge_counts=merge_counts)
        print(i)
    return df



def key_shifted_sequence(seq, i) :
    """Return `seq` rotated left by `i` positions (seq[i:] followed by seq[:i])."""
    head, tail = seq[:i], seq[i:]
    return tail + head

def hamming_cluster_library(sequences, counts, total_counts, hamming_thresh=4) :
    """Cluster near-identical sequences over every rotation of the sort key.

    For each position of the first sequence, the library is sorted by the
    sequences rotated left by that many characters and one pass of
    `hamming_cluster_round` is run on the sorted list. The position index is
    printed after each pass.

    Parameters
    ----------
    sequences : list of str
        Library sequences; not modified (a new sorted list is built each pass).
    counts : sequence of dict
        Count tables keyed by sequence, handed to `hamming_cluster_round`.
    total_counts : dict
        Sequence -> total count, handed to `hamming_cluster_round`.
    hamming_thresh : int
        Distance threshold handed to `hamming_cluster_round`.

    Returns
    -------
    tuple
        (surviving sequences, counts) as returned by the last pass.
    """
    n_positions = len(sequences[0])

    print('Removing hamming neighbors on dimension:')
    for shift in range(n_positions) :
        # Sort on the left-rotated sequence so a different position leads the ordering each pass.
        rotated_order = sorted(sequences, key=lambda s: s[shift:] + s[:shift])

        sequences, counts = hamming_cluster_round(rotated_order, counts, total_counts, hamming_thresh)
        print(shift)
    return sequences, counts

def hamming_cluster_round(seqs, counts, total_counts, hamming_thresh) :
    """Run one clustering pass over an already sorted list of sequences.

    An anchor sequence is compared (via `distance`) with each following
    sequence. When the two are closer than `hamming_thresh`, the one with the
    smaller `total_counts` entry is discarded (the anchor wins ties) and its
    per-table counts are added to the survivor; the survivor becomes the
    anchor. Otherwise the later sequence becomes the new anchor.

    Parameters
    ----------
    seqs : list of str
        Sequences in the order they should be scanned.
    counts : sequence of dict
        Count tables keyed by sequence; updated in place with merged counts.
    total_counts : dict
        Sequence -> total count used to pick the survivor. It is only read,
        never updated with the merged counts.
    hamming_thresh : int
        Two sequences are merged when distance < hamming_thresh.

    Returns
    -------
    tuple
        (list of surviving sequences in scan order, the same `counts` object).
    """
    dropped = set()

    anchor = 0
    for probe in range(1, len(seqs)) :
        anchor_seq, probe_seq = seqs[anchor], seqs[probe]

        if not (distance(anchor_seq, probe_seq) < hamming_thresh) :
            anchor = probe
            continue

        if total_counts[anchor_seq] >= total_counts[probe_seq] :
            winner, loser = anchor_seq, probe_seq
        else :
            winner, loser = probe_seq, anchor_seq
            anchor = probe

        dropped.add(loser)
        for k in range(len(counts)) :
            counts[k][winner] += counts[k][loser]

    survivors = [seq for seq in seqs if seq not in dropped]
    return survivors, counts

def distance(astring, bstring) :
    """Return the number of mismatches between two strings.

    Positions are compared pairwise over the length of the shorter string;
    every extra character of the longer string counts as one more mismatch.
    """
    length_gap = abs(len(astring) - len(bstring))
    mismatches = sum(1 for a, b in zip(astring, bstring) if a != b)
    return mismatches + length_gap

def is_misprimed(seq, start_pos) :
    """Flag an A-rich stretch of `seq` around `start_pos`.

    'A's are counted cumulatively: up to two contiguous 'A's immediately
    upstream of `start_pos`, then the ten bases seq[start_pos:start_pos+10]
    one at a time. The position is flagged as soon as the running count
    reaches 3 within the first 4 window bases, 4 within the first 7,
    5 within the first 8, or 6 within the first 9.
    NOTE(review): presumably this detects internal priming on genomic
    A-stretches; confirm the thresholds against the protocol.

    Parameters
    ----------
    seq : str
        Sequence to inspect.
    start_pos : int
        Index of the first base of the 10-base window. Negative values are
        not validated; the window then follows Python's negative indexing.

    Returns
    -------
    bool
        True when one of the thresholds is met. False otherwise, including
        when fewer than 11 bases remain from `start_pos` to the end of `seq`.
    """
    if start_pos >= len(seq) - 10 :
        return False

    a_count = 0

    # Only look upstream at positions that exist: without the bounds checks
    # seq[-1] / seq[-2] would wrap around and count A's from the end of seq.
    if start_pos >= 1 and seq[start_pos - 1] == 'A' :
        a_count += 1
        if start_pos >= 2 and seq[start_pos - 2] == 'A' :
            a_count += 1

    # (window bases inspected so far must be fewer than, minimum A count)
    window_limits = ((4, 3), (7, 4), (8, 5), (9, 6))

    for i in range(0, 10) :
        if seq[start_pos + i] == 'A' :
            a_count += 1

        for limit, min_a in window_limits :
            if i < limit and a_count >= min_a :
                return True
    return False

In [2]:
# Load the DNA library and index it by barcode.
dna_file = pd.read_csv('apa_sym_prx_dna_hamming_20160825.csv',sep=',')

dna_barcode_list = list(dna_file.barcode)
dna_sequence_list = list(dna_file.sequence)

# barcode -> barcode (identity lookup of known barcodes)
dna_barcode_map = {barcode: barcode for barcode in dna_barcode_list}
# barcode -> library member sequence
dna_sequence_map = dict(zip(dna_barcode_list, dna_sequence_list))
# barcode -> row index in the library file; a repeated barcode keeps its last row
dna_identity_map = {barcode: row for row, barcode in enumerate(dna_barcode_list)}


In [5]:
# Build the UMI lookup table: every 6-mer over A/C/G/T/N gets a dense integer id.

# Left empty: per-UMI Bloom filters are not built in this version of the cell.
umi_blooms = {}

bases = 'ACGTN'

# Grow all prefixes by one base per pass. The last base varies fastest, so the
# ids come out in the same order as six nested loops over `bases`.
umi_list = ['']
for position in range(6):
    umi_list = [prefix + base for prefix in umi_list for base in bases]

mer6_map = {umi: umi_id for umi_id, umi in enumerate(umi_list)}

print(len(umi_blooms))
print(len(mer6_map))

0
15625


In [7]:
# Count UMI-deduplicated RNA reads per library member and per cleavage region,
# then write one row per library member with at least one counted read to CSV.
# Relies on dna_sequence_map, dna_identity_map, mer6_map and is_misprimed
# having been defined by earlier cells.

count = 0

# Read counts keyed by the library member's DNA sequence.
mapped_count = {}
mapped_proximal1_count = {}
mapped_proximal2_count = {}
mapped_distal_count = {}

# library member index -> {UMI id: True} for every UMI already counted for it.
mapped_umi_map = {}

total_mapped_count = 0
total_valid_aligns = 0
total_unique_umi_count = 0
total_mapped_proximal1_count = 0
total_mapped_proximal2_count = 0
total_mapped_distal_count = 0

# Inclusive [first, last] polyA positions that define each region.
proximal1_limits = [56, 70]
proximal2_limits = [140, 154]
distal_limits = [304, 324]

# Number of lines consumed after the first one (includes a malformed line that stops the loop).
i = 0

# The context manager closes the file even if a line raises
# (unknown barcode, unknown UMI, unparsable number).
with open('apa_sym_prx_mapped_rna_20160916.csv', 'r') as rna_file :
    print('Mapping RNA reads to DNA members.')

    # Skip the first line (assumed to be the CSV header; TODO confirm).
    rna_file.readline()

    for line in rna_file :
        i += 1

        lineparts = line.split(',')
        if len(lineparts) != 5 :
            # Stop at the first line that does not have exactly five fields.
            break

        barcode = lineparts[0]
        umi = lineparts[1]
        polya_pos = int(lineparts[3])
        align_score = float(lineparts[4])

        seq = dna_sequence_map[barcode]

        # Register the member on first sight, even if this read is filtered out below.
        if seq not in mapped_count:
            mapped_count[seq] = 0
            mapped_proximal1_count[seq] = 0
            mapped_proximal2_count[seq] = 0
            mapped_distal_count[seq] = 0

        if align_score >= 60 and not is_misprimed(seq, polya_pos + 1) :
            total_valid_aligns += 1

            member_id = dna_identity_map[barcode]
            umi_id = mer6_map[umi]
            seen_umis = mapped_umi_map.setdefault(member_id, {})

            # Exact-match UMI deduplication: each (member, UMI) pair counts once.
            if umi_id not in seen_umis :
                total_unique_umi_count += 1
                seen_umis[umi_id] = True

                if proximal1_limits[0] <= polya_pos <= proximal1_limits[1] :
                    total_mapped_proximal1_count += 1

                    mapped_proximal1_count[seq] += 1
                    mapped_count[seq] += 1
                elif proximal2_limits[0] <= polya_pos <= proximal2_limits[1] :
                    total_mapped_proximal2_count += 1

                    mapped_proximal2_count[seq] += 1
                    mapped_count[seq] += 1
                elif distal_limits[0] <= polya_pos <= distal_limits[1] :
                    total_mapped_distal_count += 1

                    mapped_distal_count[seq] += 1
                    mapped_count[seq] += 1

        # Progress report every million processed lines.
        if count % 1000000 == 0:
            print(count)
            print(str(total_valid_aligns) + ' valid read align count')
            print(str(total_unique_umi_count) + ' unique umi count')
            print(str(len(mapped_count)) + ' mapped unique library members')
            print(str(total_mapped_proximal1_count) + ' mapped proximal 1 RNA reads')
            print(str(total_mapped_proximal2_count) + ' mapped proximal 2 RNA reads')
            print(str(total_mapped_distal_count) + ' mapped distal RNA reads')
        count += 1

print('COMPLETE')
print(str(len(mapped_count)) + ' mapped unique library members')
print(str(total_mapped_proximal1_count) + ' mapped proximal 1 RNA reads')
print(str(total_mapped_proximal2_count) + ' mapped proximal 2 RNA reads')
print(str(total_mapped_distal_count) + ' mapped distal RNA reads')

# Keep only library members with at least one counted read.
mapped_seq_list = []
mapped_proximal1_list = []
mapped_proximal2_list = []
mapped_distal_list = []
mapped_count_list = []

for seq in mapped_count :
    if mapped_count[seq] > 0 :
        mapped_seq_list.append(seq)
        mapped_proximal1_list.append(mapped_proximal1_count[seq])
        mapped_proximal2_list.append(mapped_proximal2_count[seq])
        mapped_distal_list.append(mapped_distal_count[seq])
        mapped_count_list.append(mapped_count[seq])

print(str(len(mapped_seq_list)) + ' kept library members')

df = pd.DataFrame({'sequence'  : mapped_seq_list,
                   'proximal1_count'  : mapped_proximal1_list,
                   'proximal2_count'  : mapped_proximal2_list,
                   'distal_count' : mapped_distal_list,
                   'total_count' : mapped_count_list})

df = df.sort_values('total_count')

print(len(df))

new_columns = ['sequence', 'proximal1_count', 'proximal2_count', 'distal_count', 'total_count']
df.to_csv('apa_sym_prx_library_20160916.csv', sep=',', header=True, columns=new_columns, index=False)

Mapping RNA reads to DNA members.
0
0 valid read align count
0 unique umi count
1 mapped unique library members
0 mapped proximal 1 RNA reads
0 mapped proximal 2 RNA reads
0 mapped distal RNA reads
1000000
79615 valid read align count
79615 unique umi count
432420 mapped unique library members
11369 mapped proximal 1 RNA reads
3084 mapped proximal 2 RNA reads
51527 mapped distal RNA reads
2000000
174854 valid read align count
174854 unique umi count
628170 mapped unique library members
32025 mapped proximal 1 RNA reads
8153 mapped proximal 2 RNA reads
95766 mapped distal RNA reads
3000000
297591 valid read align count
297591 unique umi count
783576 mapped unique library members
52844 mapped proximal 1 RNA reads
13686 mapped proximal 2 RNA reads
163366 mapped distal RNA reads
4000000
426214 valid read align count
426214 unique umi count
911083 mapped unique library members
73762 mapped proximal 1 RNA reads
19443 mapped proximal 2 RNA reads
236235 mapped distal RNA reads
5000000
558834 v