In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import scipy.stats as stats
import sigfig as sf
import seaborn as sns

# Do not truncate DataFrame displays: show all columns and all rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
def normalize_minmax_list(l):
    '''Min-max scale an iterable of numbers to the range [0, 1].

    Entries whose string form is 'nan', 'inf' or '-inf' are not used to
    determine the minimum / maximum and are returned as np.nan.

    Parameters
    ----------
    l : iterable of numbers (may contain NaN / +inf / -inf)

    Returns
    -------
    list
        One value per input entry, in input order:
        (value-MIN)/(MAX-MIN) for real values, np.nan for disallowed entries.
        When there are no real values, or all real values are equal
        (MAX==MIN, so the scale is undefined), every entry is np.nan.
    '''

    disallowedEntries={'nan','inf','-inf'}

    # Materialize once so single-pass iterables (e.g. generators) are supported
    values=list(l)
    isReal=[str(v) not in disallowedEntries for v in values]
    realValues=[v for v,ok in zip(values,isReal) if ok]

    # No real values: min()/max() would raise on an empty sequence
    if not realValues:
        return [np.nan]*len(values)

    MIN=min(realValues)
    MAX=max(realValues)
    span=MAX-MIN

    # Constant input: avoid ZeroDivisionError (Python numbers) or 0/0 warnings (numpy scalars)
    if span==0:
        return [np.nan]*len(values)

    return [(v-MIN)/span if ok else np.nan for v,ok in zip(values,isReal)]

# Quick demonstration. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
normalize_minmax_list([np.inf,-np.inf,0,1,2,3,np.nan])

# Load barcode counts

In [None]:
# Name of the barcode dictionary; it is embedded in the input file name here
# and in the output file names written later in this notebook
dict_name='filt_ubc2en.x=2.CEQTL3-50L2-BC3'
fn=f'1.Bc2En2RPM.DictBcsOnly.{dict_name}.pd.pickle'

# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
# Later cells read the columns EN, EN_ID, RNA_<rep>_RPM and DNA_<rep>_RPM from this table.
bcDF=pd.read_pickle(fn)
bcDF.head()

# Calculate barcode activity RNA/DNA

In [None]:
# Add barcode number within enhancers:
# BC_NUM is the 0-based position of each barcode among the rows sharing its EN value,
# counted in the current row order of bcDF.
barcodeNum=[]
en2count={}   # EN value -> number of barcodes seen so far

# Iterate the column values directly instead of looking every row up with bcDF.at[i,'EN']:
# a single pass without per-row label lookups, and it still works when the index
# contains duplicate labels (where .at would not return a scalar).
for en in bcDF['EN']:
    seen=en2count.get(en,0)
    barcodeNum.append(seen)
    en2count[en]=seen+1

bcDF['BC_NUM']=barcodeNum

In [None]:
# Preview the first 20 rows ordered by EN, then by BC_NUM within each EN
bcDF.sort_values(["EN",'BC_NUM']).head(20)

In [None]:
# Per-replicate barcode activity. For every replicate this adds:
#   RATIO_<rep>_<unit>             RNA / DNA
#   RATIO_<rep>_<unit>_LOG2        log2 of the ratio
#   RATIO_<rep>_<unit>_LOG2_NORM   min-max scaled log2 ratio (normalize_minmax_list)
#   <each of the above>_TUPLE      (BC_NUM, RNA, DNA, value), so every value keeps its counts
for unit in ['RPM']:
    for rep in ['109-2','109-3','109-4','139-1','139-2','139-3']:
        rnaCol  =f'RNA_{rep}_{unit}'
        dnaCol  =f'DNA_{rep}_{unit}'
        ratioCol=f'RATIO_{rep}_{unit}'

        bcDF[ratioCol]                = bcDF[rnaCol] / bcDF[dnaCol]
        # Vectorized log2 over the whole column (same values as an element-wise apply)
        bcDF[f'{ratioCol}_LOG2']      = np.log2(bcDF[ratioCol])
        bcDF[f'{ratioCol}_LOG2_NORM'] = normalize_minmax_list(bcDF[f'{ratioCol}_LOG2'])

        # Build the tuples by zipping columns rather than with a row-wise DataFrame.apply:
        # far fewer Python-level operations, and BC_NUM is not upcast by row-Series construction.
        for suffix in ['','_LOG2','_LOG2_NORM']:
            valueCol=f'{ratioCol}{suffix}'
            bcDF[f'{valueCol}_TUPLE'] = list(zip(bcDF['BC_NUM'],bcDF[rnaCol],bcDF[dnaCol],bcDF[valueCol]))

bcDF.head()

In [None]:

# Save the barcode-level table (bcDF) to a pickle whose name embeds dict_name
bcDF.to_pickle(f'1b.Bc2En2RPM.DictBcsOnly.ActivityMeasurements.{dict_name}.pd.pickle')

# Assess reproducibility of RNA and DNA

In [None]:
def is_point_within_two_lines(x,y,p0,p1):
    '''Tell whether the point (x, y) lies between two curves evaluated at x.

    p0 and p1 are callables of x; p0(x) acts as the lower bound and p1(x) as
    the upper bound, both inclusive.

    Returns True when p0(x) <= y <= p1(x), otherwise False.
    '''
    lower=p0(x)
    upper=p1(x)

    inside=(y>=lower) and (y<=upper)
    return bool(inside)
    
    
# is_point_within_two_lines(250,500,p0,p1)

In [None]:
# Pairs of RNA replicate columns to compare (109-* against 109-*, 139-* against 139-*)
comparisonList=[
    ('RNA_109-2_RPM','RNA_109-3_RPM'),
    ('RNA_109-2_RPM','RNA_109-4_RPM'),
    ('RNA_109-3_RPM','RNA_109-4_RPM'),
    
    ('RNA_139-1_RPM','RNA_139-2_RPM'),
    ('RNA_139-1_RPM','RNA_139-3_RPM'),
    ('RNA_139-2_RPM','RNA_139-3_RPM'),
]

# One scatter plot per pair, titled with the Spearman and Pearson correlations
for x,y in comparisonList:

    X=bcDF[x].to_numpy(dtype=float)
    Y=bcDF[y].to_numpy(dtype=float)

    # Keep only barcodes that are finite in BOTH replicates.
    # (Comparing the float values against the strings 'nan'/'inf'/'-inf' never matches,
    #  so NaN/inf would otherwise reach the correlation functions.)
    finite=np.isfinite(X) & np.isfinite(Y)
    X=X[finite]
    Y=Y[finite]

    pr,pp=stats.pearsonr(X,Y)
    sr,sp=stats.spearmanr(X,Y)

    fig,ax=plt.subplots(1,figsize=(5,5),dpi=150)
    ax.scatter(X,Y,s=.7,color='red')
    ax.set_title(f'RNA Repro\nSpear. R={sf.round(sr,3)}\nPears. R={sf.round(pr,3)}')
    ax.set_xlabel(x)
    ax.set_ylabel(y)

# Calculate enhancer activity

## Paired RNA/DNA measurements

In [None]:

# Start the enhancer-level table from the barcode-level table (all rows, all columns)
enDF=bcDF.loc[:,:]

In [None]:
# Preview the first rows of enDF
enDF.head()

In [None]:
def list_minus_na(l):
    '''Return a new list with the entries of l whose string form is not 'nan'.

    The order of the remaining entries is preserved; an empty list is
    returned when nothing remains.
    '''
    kept=[]
    for item in l:
        if str(item)=='nan':
            continue
        kept.append(item)
    return kept

In [None]:
# Collapse the barcode-level table to one row per (EN, EN_ID).
# Every other column becomes the list of that group's per-barcode values
# (entries whose string form is 'nan' are dropped by list_minus_na) and gets a _LIST suffix.
AggregateKeys=['EN','EN_ID']

# Aggregate from bcDF rather than from enDF itself so this cell can be re-run safely:
# re-aggregating an already aggregated enDF would nest the lists and append _LIST twice.
AggregateFxns = { col : list_minus_na for col in bcDF.columns if col not in AggregateKeys}
enDF=bcDF.groupby(AggregateKeys).agg(AggregateFxns).reset_index()
enDF.columns=[col if col in AggregateKeys else f'{col}_LIST' for col in enDF.columns ]
enDF.head()

In [None]:

# Save the enhancer-level table (enDF).
# NOTE(review): the file name says "FilterByMinDnaRpm", but no RPM filter has been applied
# to enDF at this point in the notebook -- confirm the name is intended.
enDF.to_pickle(f'2.En2Activity.FilterByMinDnaRpm.{dict_name}.pd.pickle')


In [None]:
# Show all column names of enDF
enDF.columns.tolist()

## Enhancer Reproducibility

In [None]:
# Minimum RNA / DNA RPM a barcode needs in a sample to be kept for that sample
rnaMinRpm=75
dnaMinRpm=0

# Per-sample columns; each cell holds a list of (bc num, rna, dna, ratio) tuples
sampleList=[
 'RATIO_109-2_RPM_TUPLE_LIST',
 'RATIO_109-3_RPM_TUPLE_LIST',
 'RATIO_109-4_RPM_TUPLE_LIST',
 'RATIO_139-1_RPM_TUPLE_LIST',
 'RATIO_139-2_RPM_TUPLE_LIST',
 'RATIO_139-3_RPM_TUPLE_LIST']

def _keep_tuples_above_min_rpm(tupList):
    '''Return the tuples of tupList whose rna (tup[1]) and dna (tup[2]) meet the minimums.'''
    kept=[]
    for tup in tupList:
        # tup[0]=bc num, tup[1]=rna, tup[2]=dna, tup[3]=ratio
        if tup[1]>=rnaMinRpm and tup[2]>=dnaMinRpm:
            kept.append(tup)
    return kept

# Write the filtered lists to <sample>_FILT=MINRPM, one new column per sample
for sample in sampleList:

    newCol=sample+'_FILT=MINRPM'
    enDF[newCol]=enDF[sample].apply(_keep_tuples_above_min_rpm)

    print(sample)



In [None]:
def _summarize_ratios(tupList,reducer):
    '''Reduce the ratios (t[3]) of a list of barcode tuples with reducer (np.median / np.mean).

    Returns np.nan when the list is empty: the same value numpy would give,
    but without the empty-slice RuntimeWarning.
    '''
    #          t[3]=ratio
    ratios=[t[3] for t in tupList]
    if not ratios:
        return np.nan
    return reducer(ratios)

# For every sample, summarize each enhancer's filtered barcodes by median and mean ratio
for si in sampleList:
    
    si+='_FILT=MINRPM'
    
    medianCol=si+'_BCMEDIAN'
    meanCol  =si+'_BCMEAN'
    
    enDF[medianCol]=enDF[si].apply(_summarize_ratios,args=(np.median,))
    enDF[meanCol]  =enDF[si].apply(_summarize_ratios,args=(np.mean,))
    
    print(si,medianCol)

In [None]:
# List the enDF columns whose name contains 'MEAN'
[c for c in enDF.columns if 'MEAN' in c]

In [None]:
# Pairs of filtered per-sample columns to compare; the aggregate suffix
# (_BCMEAN / _BCMEDIAN) is appended inside the loop below.
enComparisonList=[
 ('RATIO_109-2_RPM_TUPLE_LIST_FILT=MINRPM','RATIO_109-3_RPM_TUPLE_LIST_FILT=MINRPM'),
 ('RATIO_109-2_RPM_TUPLE_LIST_FILT=MINRPM','RATIO_109-4_RPM_TUPLE_LIST_FILT=MINRPM'),
 ('RATIO_109-3_RPM_TUPLE_LIST_FILT=MINRPM','RATIO_109-4_RPM_TUPLE_LIST_FILT=MINRPM'),
 
 ('RATIO_139-1_RPM_TUPLE_LIST_FILT=MINRPM', 'RATIO_139-2_RPM_TUPLE_LIST_FILT=MINRPM'),
 ('RATIO_139-1_RPM_TUPLE_LIST_FILT=MINRPM', 'RATIO_139-3_RPM_TUPLE_LIST_FILT=MINRPM'),
 ('RATIO_139-2_RPM_TUPLE_LIST_FILT=MINRPM', 'RATIO_139-3_RPM_TUPLE_LIST_FILT=MINRPM')]

# Correlations collected per aggregate (median / mean) and statistic (s=Spearman, p=Pearson)
s_medianCorrs=[]
p_medianCorrs=[]
s_meanCorrs  =[]
p_meanCorrs  =[]
for aggfunc in ['_BCMEAN','_BCMEDIAN']:
    for x,y in enComparisonList:

        x+=aggfunc
        y+=aggfunc

        X=enDF[x].to_numpy(dtype=float)
        Y=enDF[y].to_numpy(dtype=float)

        # Keep only enhancers with a finite value in BOTH samples
        finite=np.isfinite(X) & np.isfinite(Y)
        X=X[finite]
        Y=Y[finite]

        pr,pp=stats.pearsonr(X,Y)
        sr,sp=stats.spearmanr(X,Y)

        if aggfunc=='_BCMEDIAN':
            s_medianCorrs.append(sr)
            p_medianCorrs.append(pr)
        elif aggfunc=='_BCMEAN':
            s_meanCorrs  .append(sr)
            p_meanCorrs  .append(pr)

        fig,ax=plt.subplots(1,figsize=(5,5),dpi=150)
        ax.scatter(X,Y,s=.7,color='black')

        # Short sample labels for the title, e.g. '109-2'
        xtitle=x.split('RATIO_')[-1].split('_RPM_TUPLE')[0]
        ytitle=y.split('RATIO_')[-1].split('_RPM_TUPLE')[0]

        ax.set_title(f'{xtitle} vs {ytitle}\nSpear. R={sf.round(sr,3)}\nPears. R={sf.round(pr,3)}')
        ax.set_xlabel(x)
        ax.set_ylabel(y)
        

In [None]:
# Swarm plot of the correlation values: one swarm per (statistic, aggregate) combination.
# Labels and value lists are kept side by side so their order cannot drift apart.
labeledCorrs=[
    ('Spearman\nMedian',s_medianCorrs),
    ('Pearson\nMedian', p_medianCorrs),
    ('Spearman\nMean',  s_meanCorrs),
    ('Pearson\nMean',   p_meanCorrs),
]
data=[corrs for _label,corrs in labeledCorrs]

fig,ax=plt.subplots(1,figsize=(5,5),dpi=150)

sns.swarmplot(data=data,ax=ax)

ax.set_xticklabels([label for label,_corrs in labeledCorrs])
ax.set_ylim(0,1)
ax.set_ylabel('R correlation')