In [None]:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import js # github commit 32863a0 https://github.com/jsolvason/js date May 19 2023
from matplotlib_venn import venn3


import scipy.stats as stats

import matplotlib.ticker as mtick

import sigfig as sf

# Raise pandas' display limits so wide/long frames are shown in full.
for _option,_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

In [None]:
# Load the pickled activity table whose file name is derived from `dictname`.
# NOTE: unpickling can execute arbitrary code -- only read pickles from a trusted source.
dictname='filt_ubc2en.x=2.CEQTL3-50L2-BC3'
activityPicklePath='2.En2Activity.FilterByMinDnaRpm.'+dictname+'.pd.pickle'
pairedActDF=pd.read_pickle(activityPicklePath)
pairedActDF.head()

# Assess reproducibility of enhancer measurements

In [None]:
# Filter parameters shared by the cells below.
rnaMin=75   # minimum RNA value (tuple position 1) a barcode must reach
dnaMin=0    # minimum DNA value (tuple position 2) a barcode must reach

# One tuple-list column per sample: four '109' samples followed by three '139' samples.
sampleList=[
    f'RATIO_{library}-{replicate}_RPM_TUPLE_LIST'
    for library,n_replicates in (('109',4),('139',3))
    for replicate in range(1,n_replicates+1)
]

# Half-width of the uniform horizontal jitter added to plotted points.
x_jitter_adj=.3



# One figure per enhancer whose EN_ID contains 'HG9': a jittered scatter of the
# barcode-level ratios, one x position per sample, marker area scaled by RNA.
rnaPointSizeAdj=.1  # marker-size scale; loop-invariant, so set once

for idx in pairedActDF.index:

    enid=pairedActDF.at[idx,'EN_ID']

    if 'HG9' not in enid: continue

    # plot measurements
    fig,ax=plt.subplots(1,figsize=(13,5),dpi=150)

    for x_pos,sample in enumerate(sampleList):

        # Tuples are indexed as (0=barcode, 1=rna, 2=dna, 3=ratio).
        # Keep barcodes with the same inclusive `>=rnaMin` rule that builds the
        # *_FILT=MINRPM columns, so the plot shows what the later analysis keeps.
        r1ListTuples=[t for t in pairedActDF.at[idx,sample] if t[1]>=rnaMin]

        r1Y  =[t[3] for t in r1ListTuples]
        r1Rna=[t[1]*rnaPointSizeAdj for t in r1ListTuples]

        # Spread the points horizontally around the sample's x position
        x_jitter=np.random.uniform(low=-x_jitter_adj,high=x_jitter_adj,size=len(r1Y))
        x=[x_pos+xi for xi in x_jitter]

        ax.scatter(x,r1Y,s=r1Rna)

    ax.set_title(enid)
    # Pin the ticks to the sample positions before labelling them; labelling the
    # auto-generated ticks does not guarantee that a label sits under its column.
    ax.set_xticks(list(range(len(sampleList))))
    ax.set_xticklabels([si.split('-')[0] for si in sampleList])





# MERGE by average and FILTER by min RNA and DNA

In [None]:
def _keep_min_rpm(tupList):
    """Return the tuples of `tupList` whose rna and dna values reach rnaMin / dnaMin."""
    kept=[]
    for tup in tupList:
        # tup[0]=bc num, tup[1]=rna, tup[2]=dna, tup[3]=ratio
        if tup[1]>=rnaMin and tup[2]>=dnaMin:
            kept.append(tup)
    return kept


# Add one '<sample>_FILT=MINRPM' column per sample holding only the passing tuples.
for sample in sampleList:

    newCol=f'{sample}_FILT=MINRPM'
    pairedActDF[newCol]=pairedActDF[sample].apply(_keep_min_rpm)

    print(sample)

In [None]:
# Combine all measurements using BCCM.
# Filtered column names look like: RATIO_109-1_RPM_TUPLE_LIST_FILT=MINRPM

sampeListFilt=[
    f'RATIO_{library}-{replicate}_RPM_TUPLE_LIST_FILT=MINRPM'
    for library,n_replicates in (('109',4),('139',3))
    for replicate in range(1,n_replicates+1)
]



# For every row, concatenate the filtered tuples of all samples (in sampeListFilt
# order) into one list, then store those lists as the combined BCCM column.
bccmRatios=[
    [tup for filteredCol in sampeListFilt for tup in pairedActDF.at[idx,filteredCol]]
    for idx in pairedActDF.index
]

pairedActDF['RATIO_BCCM_RPM_TUPLE_LIST_FILT=MINRPM']=bccmRatios
        

In [None]:
# Persist the table (now carrying the filtered and combined columns); the file name
# records the RNA/DNA minimums that were applied.
# NOTE(review): the name hard-codes 'x=5' while the input was loaded with a dictname
# containing 'x=2' -- confirm which one is intended.
filteredPicklePath=f'3.En2Activity.FilterByMinRna={rnaMin}_Dna={dnaMin}.WithCombinedMeasuremnts.filt_ubc2en.x=5.CEQTL3-50L2-BC3.pd'
pairedActDF.to_pickle(filteredPicklePath)

# Begin Plotting

## Prep data for plotting

In [None]:
def _zero_pad_en_id(enid):
    """Zero-pad the part of the ID after its first two characters to two digits.

    IDs without a '-' are returned unchanged; otherwise the ID is split into
    '<enhancer>-<allele>' and only the enhancer part is padded (e.g. 1 -> 01).
    """
    if '-' not in enid:
        return enid
    en,allele=enid.split('-')
    prefix,num=en[:2],en[2:]
    return f'{prefix}{num.zfill(2)}-{allele}'


newEnIdList=[_zero_pad_en_id(enid) for enid in pairedActDF.EN_ID]

# Index the table by the padded ID
pairedActDF['EN_ID_00']=newEnIdList
pairedActDF=pairedActDF.set_index('EN_ID_00')
pairedActDF.head()

In [None]:
# Comparisons to run: comparison name -> [reference ID, alternate ID]
En2Comparison={
    'HG9opt':   ['HG9','HG9opt'],
    'HG9optSNV':['HG9','HG9optSNV'],
}

sortedEnList=sorted(En2Comparison.keys())
sortedEnList

## T test

In [None]:
# t-test name -> the equal-variance flag it stands for, and the reverse lookup
ttest2vartrue=dict(standard=True)
vartrue2ttest={equalVar:name for name,equalVar in ttest2vartrue.items()}
ttest_method='standard'
pointsize='rna'


In [None]:
# Permutation t-test of reference vs. alternate ratios for every comparison in
# sortedEnList, run on the combined column selected by `rep`. Results go both into
# a nested lookup dict and into the long-format table `ttestDF`.
cols=['en','ttest-type','alternative','t','p']
c2v={ci:[] for ci in cols}

alternative='less'   # H1: mean(reference) < mean(alternate); reference is passed first
varSame=True         # forwarded to ttest_ind as equal_var
rep='BCCM'
dist=10000           # number of permutations
distList=[dist]      # previously used below without being defined (NameError on a fresh kernel)

Rep2En2Dist2VarSame2Ttest={rep:{}}
Rep2En2Wilcox={rep:{}}   # key was misspelled 'BCCOM'

ratioCol=f'RATIO_{rep}_RPM_TUPLE_LIST_FILT=MINRPM'

for en in sortedEnList:

    refid,altid=En2Comparison[en]

    # Pre-create an empty slot for every (permutation count, equal-variance) combination.
    # Distinct comprehension names keep this from reading like a rebind of dist/varSame.
    if en not in Rep2En2Dist2VarSame2Ttest[rep]:
        Rep2En2Dist2VarSame2Ttest[rep][en]={d:{v:tuple() for v in [True,False]} for d in distList}

    # tup[3] is the ratio of each barcode measurement
    refList=[tup[3] for tup in pairedActDF.at[refid,ratioCol]]
    altList=[tup[3] for tup in pairedActDF.at[altid,ratioCol]]

    # Drop inf / -inf / nan ratios before testing
    refList=[i for i in refList if np.isfinite(i)]
    altList=[i for i in altList if np.isfinite(i)]

    t,p=stats.ttest_ind(refList,altList,
                                equal_var=varSame,
                                permutations=dist,
                                nan_policy='omit',
                                alternative=alternative)

    c2v['en'].append(en)
    c2v['t'].append(t)
    c2v['p'].append(p)
    c2v['ttest-type'].append((dist,vartrue2ttest[varSame],rep))
    c2v['alternative'].append(alternative)

    Rep2En2Dist2VarSame2Ttest[rep][en][dist][varSame]=(t,p)


ttestDF=pd.DataFrame(c2v)
ttestDF.head()


## Assess results

### Assess testing ACROSS bio reps

In [None]:
# Library = first two characters of the comparison name; count comparisons per library.
ttestDF['library']=ttestDF['en'].str[:2]
ttestDF['library'].value_counts()


In [None]:
# Summary plot of the combined (BCCM) ratios for HG9 and its two variants.
# Exactly which plot is drawn is controlled by the boolean switches below.

boxplots=False
violin=False

barplots=True
barplots_norm_hg9_to_1=True   # scale the bars so the HG9 mean equals 1

np.random.seed(0)

alt='two-sided'

colors=['lightgrey','lightcoral','red']

refid,altid,altid2=['HG9','HG9optSNV','HG9opt']

fig,ax=plt.subplots(1,figsize=(5,5),dpi=300)

# Collect the ratio (tuple position 3) of every filtered measurement:
# violinData holds one list per enhancer, allY the pooled values.
allY=[]
violinData=[]
for rep_xi, rep in [(4,'BCCM')]:#(0,'109'),(2,'139'),

    for idx_xi,idx in [(0,refid),(1,altid),(2,altid2)]:

        r1ListTuples=pairedActDF.at[idx,f'RATIO_{rep}_RPM_TUPLE_LIST_FILT=MINRPM']
        r1Y=[t[3] for t in r1ListTuples]

        allY+=r1Y
        violinData.append(r1Y)


if boxplots:
    sns.boxplot(data=violinData,palette=colors,width=.6,fliersize=0,ax=ax)

if barplots:
    barX=list(range(len(violinData)))
    barMeans=[np.mean(di) for di in violinData]
    barSems=[stats.sem(di) for di in violinData]

    if barplots_norm_hg9_to_1:
        # Divide the means AND their standard errors by the HG9 mean; leaving the
        # errors unscaled would draw them on a different scale than the bars.
        hg9=np.mean(violinData[0])
        barMeans=[m/hg9 for m in barMeans]
        barSems=[s/hg9 for s in barSems]

    ax.bar(x=barX,height=barMeans,color=colors)
    ax.errorbar(x=barX,y=barMeans,yerr=barSems,ls='none',color='black',capsize=10,)

if violin:
    sns.violinplot(data=violinData,inner=None,color='white',width=.7,scale='area',cut=0,ax=ax)
    sns.swarmplot(data=violinData,palette=colors*4,size=2,ax=ax)


ax.set_xticks([0,1,2])
ax.set_xticklabels([refid,altid,altid2])
ax.spines['top']  .set_visible(False)
ax.spines['right'].set_visible(False)

# Fixed y window and ticks for the figure
ax.set_ylim(.95,1.45)
ax.set_yticks([1,1.1,1.2,1.3,1.4])


plt.show()




In [None]:
# Correlation between repeats, based on each repeat's mean ratio per genotype.
# Uses refid / altid / altid2, violinData and barplots_norm_hg9_to_1 from the plotting cell.
# NOTE(review): ratios are not screened for inf/nan here (the t-test cell does screen
# them) -- confirm the filtered columns cannot contain non-finite ratios.

# Normalise by the HG9 mean when the bar plot is normalised that way, else leave as is
hg9=np.mean(violinData[0]) if barplots_norm_hg9_to_1 else 1

genotypeIds=[refid,altid,altid2]

# One array per repeat, holding the mean normalised ratio of each genotype
ratios_for_each_repeat=[
    np.array([
        np.mean(np.array([t[3]/hg9 for t in pairedActDF.at[genotypeId,repeat_name]]))
        for genotypeId in genotypeIds
    ])
    for repeat_name in sampeListFilt
]
print(ratios_for_each_repeat)

# Rows are repeats, so this is the repeat-by-repeat correlation matrix
correlation_table=np.corrcoef(ratios_for_each_repeat)
print(correlation_table)
np.savetxt("data/corrcoeff_mat_avg_enhancer.txt", correlation_table, fmt='%.5f')