# Map barcodes to enhancers and save the result as a DataFrame

In [None]:
import pickle
import pandas as pd
import numpy as np
import math

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

# Load ordered enhancers

In [None]:
def read_tsv(fn,pc,header,breakBool=False,sep='\t',pc_list=False):
    '''Yield the rows of a delimited text file as lists of string fields.

    Every line is stripped of surrounding whitespace and split on ``sep``.

    Parameters
    ----------
    fn : path of the file to read.
    pc : "print columns" flag. When truthy, the first line read is printed
        instead of being yielded.
    header : when truthy and ``pc`` is not equal to True, the first line of
        the file is skipped without being printed or yielded.
    breakBool : when truthy (and ``pc`` is truthy), stop right after printing
        the first line, so nothing is yielded.
    sep : field separator, tab by default.
    pc_list : selects how the first line is printed when ``pc`` is truthy.
        False prints one "index name" pair per line followed by a blank line;
        anything else prints one comma-separated line in which '-' and ' '
        are replaced by '_'.

    Yields
    ------
    list of str : the fields of each remaining line.
    '''
    with open(fn,'r') as f:

        # If printing columns, keep the header so it can be printed below.
        # next(f,None) instead of next(f): on an empty file a bare next()
        # raises StopIteration inside this generator, which Python reports
        # to the caller as a RuntimeError.
        if header and not pc==True:
            next(f,None)

        for line_no,line in enumerate(f):
            fields=line.strip().split(sep)

            # Only the first line read is ever printed
            if pc and line_no==0:
                if pc_list==False:
                    for col_no,col in enumerate(fields):
                        print(col_no,col)
                    print()
                else:
                    print(', '.join([col.replace('-','_').replace(' ','_') for col in fields]))
                if breakBool: break
                continue

            yield fields


In [None]:
#################################
# Load enhancer info
#################################

# Build the enhancer-sequence -> enhancer-ID lookup from the ordered-variants TSV
fn='all_variants_mattoe_joe_ordered_20220127_with_controls_Gata4.tsv'

# Each row is (name, sequence); the first and last 25 characters of the
# sequence are dropped before it is used as the key
En2EnId={
    seq[25:-25]:name
    for name,seq in read_tsv(fn,pc=False,header=True)
}

# Persist the lookup for later use
with open('En2Name.pydict.pickle','wb') as f:
    pickle.dump(En2EnId,f,protocol=pickle.HIGHEST_PROTOCOL)
        


# Load the barcode-to-enhancer dict

In [None]:
##################################
# Read in enhancer barcode dict
##################################

# Enhancer-barcode dictionary (unique BCs only), unpickled from
# '<in_dict_name>.pydict.pickle'
# Alternative input: in_dict_name='k_neighbors_1.filt_ubc2en.x=5.CEQTL3-50L2-BC3'
in_dict_name='filt_ubc2en.x=2.CEQTL3-50L2-BC3'
in_dict=in_dict_name+'.pydict.pickle'
with open(in_dict,'rb') as f:
    Bc2En=pickle.load(f)


In [None]:
# Compare the number of 25-character barcode prefixes with the number of distinct ones
bcSet=[]
for bc in Bc2En:
    bcSet.append(bc[:25])
print(len(bcSet),len(set(bcSet)))

In [None]:
def revcomp(dna):
    '''Return the reverse complement of a DNA sequence.

    Handles the symbols A, C, G, T, N and W; any other character
    raises KeyError.
    '''
    pairs={'A':'T','T':'A','G':'C','C':'G', 'N':'N','W':'W'}
    # complement every base in reading order, then flip the whole sequence
    complemented=[pairs[base] for base in dna]
    complemented.reverse()
    return ''.join(complemented)


In [None]:
# For barcodes that collide after trimming, collect the set of enhancers
# they map to, to see whether colliding barcodes share the same enhancer
Bc2EnSet={}
for fullBc,en in Bc2En.items():
    # reverse complement first (needed because of the way the dict is analyzed),
    # then keep the first 25 characters
    trimmedBc=revcomp(fullBc)[:25]
    Bc2EnSet.setdefault(trimmedBc,set()).add(en)

len(Bc2EnSet)

In [None]:
# Histogram: number of distinct enhancers per trimmed barcode -> number of barcodes
len2count={}
for enSet in Bc2EnSet.values():
    nEnhancers=len(enSet)
    len2count[nEnhancers]=len2count.get(nEnhancers,0)+1

len2count

In [None]:
# Re-key the dict by the first 25 characters of each reverse-complemented barcode.
# Barcodes that collide after trimming collapse into one entry (the last
# enhancer seen wins), so compare the two printed lengths to check for conflicts.
print('length before trim:',len(Bc2En))

trimmedBc2En={}
for fullBc,en in Bc2En.items():
    trimmedBc2En[revcomp(fullBc)[:25]]=en
Bc2En=trimmedBc2En

print('length after trim:',len(Bc2En))

In [None]:
# Freeze the barcode order (dict insertion order) used for the DataFrame rows
bc_in_order=list(Bc2En)

# Choose barcode samples

In [None]:
# #################################
# # Choose samples
# #################################

# NOTE: the `!ls` syntax below is IPython shell capture, not plain Python;
# this cell only runs inside IPython/Jupyter.
sampleDir='./'
# NOTE(review): everything after the first `!` appears to be handed to the shell
# as a single command, so `+` and the second `!ls` are presumably passed to `ls`
# as extra arguments rather than concatenating two listings -- confirm.
samplePickleFnList  = !ls {sampleDir}/seq1/*.Bc2ReadCount.pickle + !ls {sampleDir}/seq2/*.Bc2ReadCount.pickle
# Drop the first two captured lines -- TODO confirm what they are (possibly
# `ls` error messages for the `+` and `!ls` arguments).
samplePickleFnList=samplePickleFnList[2:]
# Display the resulting list
samplePickleFnList

In [None]:
##############################
# Instantiate dataframe pydict
##############################

# Column name -> list of values; start with the barcode/enhancer columns
# (an 'EN_LIB_NAME' column is currently disabled)
Col2Values={col:[] for col in ('BC','EN','EN_ID')}

In [None]:
# Read-count pickle filename -> column-name prefix for that sample
Sample2ColName={
    './139_DNA_S161_L001_R1_001.Bc2ReadCount.pickle': 'DNA_139-1',
    './139_RNA_S159_L001_R1_001.Bc2ReadCount.pickle': 'RNA_139-1',
    './109-2-DNA_S6_L001_R1_001.Bc2ReadCount.pickle': 'DNA_109-2',
    './109-2-RNA_S1_L001_R1_001.Bc2ReadCount.pickle': 'RNA_109-2',
    './109-3-DNA_S7_L001_R1_001.Bc2ReadCount.pickle': 'DNA_109-3',
    './109-3-RNA_S2_L001_R1_001.Bc2ReadCount.pickle': 'RNA_109-3',
    './109-4-DNA_S8_L001_R1_001.Bc2ReadCount.pickle': 'DNA_109-4',
    './109-4-RNA_S3_L001_R1_001.Bc2ReadCount.pickle': 'RNA_109-4',
    './139-2-DNA_S9_L001_R1_001.Bc2ReadCount.pickle': 'DNA_139-2',
    './139-2-RNA_S4_L001_R1_001.Bc2ReadCount.pickle': 'RNA_139-2',
    './139-3-DNA_S10_L001_R1_001.Bc2ReadCount.pickle': 'DNA_139-3',
    './139-3-RNA_S5_L001_R1_001.Bc2ReadCount.pickle': 'RNA_139-3',
}

# Reserve an empty RPM and RC column (in that order) for every sample
for colName in Sample2ColName.values():
    for suffix in ('_RPM','_RC'):
        Col2Values[colName+suffix]=[]

list(Col2Values)

In [None]:
# Number of enhancer sequences in the sequence -> ID lookup
len(En2EnId)

In [None]:
#########################
# Add data from Bc/En
#########################

# One row per barcode, in bc_in_order order: barcode, enhancer, enhancer ID
# (an EN_LIB_NAME column filled from Enhancer2LibName is currently disabled)
for barcode in bc_in_order:
    enhancer=Bc2En[barcode]
    for column,value in (('BC',barcode),('EN',enhancer)):
        Col2Values[column].append(value)
    Col2Values['EN_ID'].append(En2EnId[enhancer])

In [None]:
#########################
# Add data from RPM/RC
#########################

# column-name prefix -> (total reads in the sample, pickle filename)
ColName2ReadcountsAndFilename={}

for sampleFn,sampleName in Sample2ColName.items():

    print(sampleName)

    # both columns of this sample start from scratch
    colRc =sampleName+'_RC'
    colRpm=sampleName+'_RPM'
    for col in (colRc,colRpm):
        Col2Values[col]=[]

    # barcode -> read count for this sample
    with open(sampleFn,'rb') as f:
        bc2rc=pickle.load(f)
    totalReads=sum(bc2rc.values())

    ColName2ReadcountsAndFilename[sampleName]=(totalReads,sampleFn)

    for barcode in Col2Values['BC']:

        # look the barcode up as-is, then as its reverse complement; 0 if absent
        if barcode in bc2rc:
            rc=bc2rc[barcode]
        else:
            rc=bc2rc.get(revcomp(barcode),0)

        # raw read count
        Col2Values[colRc].append(rc)

        # reads per million of the sample's total reads
        Col2Values[colRpm].append(1e6*rc/totalReads)
    



In [None]:
##################################
# Create DF from data
##################################

# Print each column's length so they can be checked for equality
# before the DataFrame is built
for col,values in Col2Values.items():
    print(col,len(values))
   

In [None]:
 
# Build the DataFrame (one column per Col2Values key) and preview the first rows
df=pd.DataFrame(data=Col2Values)
df.head(n=5)

In [None]:
# Drop the name for the column dict now that df has been built from it
del Col2Values

In [None]:
# Keep only rows whose barcode is exactly 25 characters long;
# the row count is printed before and after the filter
print(len(df))
is25bp=df.BC.str.len()==25
df=df.loc[is25bp,:]
print(len(df))

In [None]:
def percent(number,rounding_digit=1):
    '''Format a fraction as a percentage string.

    number: fraction to convert (e.g. 0.25 for 25%).
    rounding_digit: number of decimals to round to; 0 gives a whole-number
        percentage with no decimal point.

    Returns the rounded percentage followed by '%', e.g. '25.0%' or '25%'.
    '''
    if rounding_digit==0:
        # round() rather than int(): int() truncates, so float error such as
        # 100*0.29 == 28.999999999999996 would come out as '28%'
        return str(round(100*number))+'%'
    else:
        return str(round(100*number,rounding_digit))+'%'

In [None]:
# Per sample: reads assigned to the retained barcodes, and the share of the
# sample's total reads that they represent
cols=[
    'DNA_109-2', 'RNA_109-2',
    'DNA_109-3', 'RNA_109-3',
    'DNA_109-4', 'RNA_109-4',
    'DNA_139-1', 'RNA_139-1',
    'DNA_139-2', 'RNA_139-2',
    'DNA_139-3', 'RNA_139-3',
]
for c in cols:
    reads=df[c+'_RC'].sum()
    readsTotal=ColName2ReadcountsAndFilename[c][0]
    print(c,f'{reads:,} ({percent(reads/readsTotal)})')

In [None]:
# Tab-separated summary per sample: name, total reads, source pickle
for sampleName,(readcounts,fn) in ColName2ReadcountsAndFilename.items():
    print(sampleName,f'{readcounts:,}',fn,sep='\t')

In [None]:
# Save the table; the output name embeds the input dictionary name
outPickleFn='1.Bc2En2RPM.DictBcsOnly.'+in_dict_name+'.pd.pickle'
df.to_pickle(outPickleFn)