In [1]:
%reset
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import glob
import scipy.stats
import math
import statsmodels.stats.multitest as multest
import statsmodels.sandbox.stats.multicomp as mulcomp

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


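### Compute TCP (Therapeutically Correlated Pair) scores with the "Z observation" formula

For each TF pair, the Pearson correlations in the `effect == 1` samples ($r_E$) and in the `effect == 0` samples ($r_{IE}$) are Fisher z-transformed, $z = \tfrac{1}{2}\ln\frac{1+r}{1-r}$, and compared as $Z_{obs} = (z_E - z_{IE}) / \sqrt{\tfrac{1}{n_E-3} + \tfrac{1}{n_{IE}-3}}$.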

In [2]:
def get_z(r):
    """Return the Fisher z-transform of a correlation coefficient.

    Computes 0.5 * (ln(1 + r) - ln(1 - r)).

    Parameters
    ----------
    r : float
        Correlation coefficient. Values at or beyond +/-1 are clamped to
        +/-0.9999 so the logarithms stay finite; this also covers values that
        overshoot the [-1, 1] range by floating-point round-off, which would
        otherwise raise ``ValueError: math domain error``.

    Returns
    -------
    float
        The transformed value. NaN input yields NaN.
    """
    # `>=` / `<=` (rather than `==`) so that e.g. 1.0000000000000002 is
    # clamped too; NaN fails both comparisons and falls through unchanged.
    if r>=1:
        r=0.9999
    elif r<=-1:
        r=-0.9999
    return (math.log(1+r)-math.log(1-r))*0.5

def get_corr_unstack(df1,col_name):
    """Return the pairwise Pearson correlations of df1's columns in long form.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Table whose columns are correlated against each other.
    col_name : str
        Name given to the single value column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per ordered column pair (two-level index with unnamed
        levels) and one column, `col_name`, holding the correlation.
    """
    # square correlation matrix -> Series indexed by (column, row) pairs
    pairwise=df1.corr(method='pearson').unstack()
    long_df=pairwise.to_frame(name=col_name)
    # make sure neither index level carries a name
    long_df.index=long_df.index.set_names([None, None])
    return long_df
    
# Collect the per-cell-line TF-activity tables. Sorting makes the column order
# of the merged output independent of the OS-specific order glob returns.
file_names=sorted(glob.glob('result/TF_activity_viper/*_v.txt'))
if not file_names:
    raise FileNotFoundError("no input file matches 'result/TF_activity_viper/*_v.txt'")

corr_merged_df = pd.DataFrame()
for file_name in file_names:
    # the cell-line label is the '_'-separated token right before 'v.txt'
    cell=file_name.split("_")[-2]
    print('#####',cell)

    anal_data=pd.read_table(file_name, sep='\t',engine='python')

    # Split the samples on the 'effect' flag and drop the flag column itself
    # (presumably 1 = effective, 0 = ineffective -- confirm against the input).
    tf_cols=[col for col in anal_data.columns if col!='effect']
    dataE=anal_data.loc[anal_data['effect']==1, tf_cols]
    dataIE=anal_data.loc[anal_data['effect']==0, tf_cols]

    # The variance term 1/(n-3) below is only defined for n > 3; fail with a
    # clear message instead of a ZeroDivisionError or silently NaN scores.
    if len(dataE)<=3 or len(dataIE)<=3:
        raise ValueError('{}: need more than 3 samples per group, got {} (effect==1) and {} (effect==0)'.format(
            cell, len(dataE), len(dataIE)))

    corrE=get_corr_unstack(dataE, 'corrE')
    corrIE=get_corr_unstack(dataIE, 'corrIE')

    corr_df=pd.concat([corrE,corrIE], axis=1)

    # Fisher z-transform of both correlations
    corr_df['corrEZ']=corr_df['corrE'].map(get_z)
    corr_df['corrIEZ']=corr_df['corrIE'].map(get_z)

    # Zobs = (zE - zIE) / sqrt(1/(nE-3) + 1/(nIE-3)), with a two-sided normal p-value
    denorm_of_zobs=np.sqrt(1/(len(dataE)-3)+1/(len(dataIE)-3))
    corr_df['Zobs(TCPS)']=(corr_df['corrEZ']-corr_df['corrIEZ'])/denorm_of_zobs
    corr_df['p_value']=scipy.stats.norm.sf(corr_df['Zobs(TCPS)'].abs().to_numpy())*2

    # Benjamini-Hochberg correction on the defined p-values only: a single NaN
    # (e.g. a TF with zero variance in one group) would otherwise propagate
    # through the step-up procedure and turn every FDR of this cell line into NaN.
    testable=corr_df['p_value'].notna()
    corr_df['FDR']=np.nan
    if testable.any():
        corr_df.loc[testable,'FDR']=multest.fdrcorrection(corr_df.loc[testable,'p_value'].to_numpy())[1]
    #corr_df['FDR'] = mulcomp.multipletests(corr_df['p_value'], method='bonferroni')[1]
    corr_df['negLog10_FDR']=-np.log10(corr_df['FDR'])

    ## represent data with short format
    for col in ['corrE','corrIE','corrEZ','corrIEZ','Zobs(TCPS)','negLog10_FDR']:
        corr_df[col] = corr_df[col].map('{:.5f}'.format)
    for col in ['p_value','FDR']:
        corr_df[col] = corr_df[col].map('{:.5e}'.format)

    ## change columns with cell name
    corr_df.columns=corr_df.columns+'_'+cell
    # sort=True requests a sorted (TF1, TF2) row index. The later cells keep the
    # first-seen orientation of every pair and then look pairs up as sorted
    # tuples, so this ordering must not be dropped.
    corr_merged_df=pd.concat([corr_merged_df, corr_df], axis=1, sort=True)

# turn the two unnamed index levels into the leading TF1 / TF2 columns
corr_merged_df=corr_merged_df.reset_index()
corr_merged_df.columns=['TF1','TF2']+list(corr_merged_df.columns[2:])

corr_merged_df.to_csv('result/TCP_score_viper/TCPS_merged_with_duplicates.txt',index=False, sep='\t')

##### A375
##### HT29


### Remove duplicated TF pairs (self-pairs and mirrored pairs) from the TCP scores

In [3]:
# Stream the merged score table and keep a single row per unordered TF pair:
# self-pairs (TF1 == TF2) are dropped and, of the two mirrored rows
# (A, B) / (B, A), only the first one encountered is written.
tfPair=set()   # mirrored keys of pairs already written and still awaiting their twin

line_cnt=0
# Both files are managed by `with`, so they are closed even if a line fails to
# parse. The input is opened first: a missing input no longer leaves behind a
# freshly truncated, empty output file.
with open('result/TCP_score_viper/TCPS_merged_with_duplicates.txt','r') as dup_file, \
     open('result/TCP_score_viper/TCPS_merged.txt','w') as dupDel:
    for line in dup_file:
        line_cnt+=1
        if line_cnt==1:
            # header row is copied verbatim
            _=dupDel.write(line)
            continue

        tf1,tf2=line.split('\t')[:2]

        if tf1==tf2:
            continue

        if (tf1,tf2) not in tfPair:
            _=dupDel.write(line)
            # remember the mirrored key so the twin row is recognised later
            tfPair.add((tf2,tf1))
        else:
            # twin of an already written row: skip it and free the entry
            tfPair.remove((tf1,tf2))

### Identify the significant TCPs at a given FDR threshold

In [4]:
# FDR threshold below which a TF pair counts as significant
TCPS_FDR=0.05

TCPS=pd.read_table('result/TCP_score_viper/TCPS_merged.txt', sep='\t',engine='python')
TCPS.set_index(['TF1','TF2'], inplace=True)

# per-cell-line sets of significant pairs, split by the sign of Zobs
sig_pos_by_cell=[]
sig_neg_by_cell=[]

# One output line per category: label, number of pairs, comma-separated
# 'TF1|TF2' keys. `with` guarantees the file is closed even if a column is missing.
with open('result/TCP_score_viper/sig_TCP.txt','w') as sig_TCP:
    for cell in ['A375','HT29']:
        is_sig=TCPS['FDR_{}'.format(cell)]<TCPS_FDR
        zobs=TCPS['Zobs(TCPS)_{}'.format(cell)]

        # each pair is keyed as 'TF1|TF2' with the two names in sorted order
        sig_pos_TCP=['|'.join(sorted(TF_pair)) for TF_pair in TCPS.loc[is_sig&(zobs>=0)].index]
        sig_neg_TCP=['|'.join(sorted(TF_pair)) for TF_pair in TCPS.loc[is_sig&(zobs<0)].index]

        _=sig_TCP.write('{}(pos)\t{}\t{}\n'.format(cell, len(sig_pos_TCP), ','.join(sig_pos_TCP)))
        _=sig_TCP.write('{}(neg)\t{}\t{}\n'.format(cell, len(sig_neg_TCP), ','.join(sig_neg_TCP)))

        sig_pos_by_cell.append(set(sig_pos_TCP))
        sig_neg_by_cell.append(set(sig_neg_TCP))

    # Pairs significant with the same sign in every cell line. A set
    # intersection replaces the quadratic list.count(...) == 2 scan and is not
    # tied to the number of cell lines; sorting makes the output deterministic.
    sig_pos_TCP_both=sorted(set.intersection(*sig_pos_by_cell))
    sig_neg_TCP_both=sorted(set.intersection(*sig_neg_by_cell))

    _=sig_TCP.write('both(pos)\t{}\t{}\n'.format(len(sig_pos_TCP_both), ','.join(sig_pos_TCP_both)))
    _=sig_TCP.write('both(neg)\t{}\t{}\n'.format(len(sig_neg_TCP_both), ','.join(sig_neg_TCP_both)))

### Keep only the TCPs that are significant in both cell lines

In [5]:
def _parse_tcp_list(field):
    """Turn a comma-separated 'TF1|TF2' field into a list of (TF1, TF2) tuples.

    An empty category is written as an empty field, which pandas reads back as
    NaN; that case (and an empty string) yields an empty list instead of
    failing on `.split`.
    """
    if not isinstance(field, str) or field=='':
        return []
    return [tuple(TCP.split('|')) for TCP in field.split(',')]

# sig_TCP.txt has no header row (label, count, pair list on every line), so
# header=None keeps its first record from being consumed as column names.
sig_TCP=pd.read_table('result/TCP_score_viper/sig_TCP.txt', sep='\t', index_col=0, header=None)
# after the label index, position 0 is the count and position 1 the pair list
pos_TCP=_parse_tcp_list(sig_TCP.loc['both(pos)'].iloc[1])
neg_TCP=_parse_tcp_list(sig_TCP.loc['both(neg)'].iloc[1])

TCP_all_data=pd.read_table('result/TCP_score_viper/TCPS_merged.txt', index_col=[0,1], sep='\t')
selected_TCP=pos_TCP+neg_TCP
# explicit copy so the column added below never writes into TCP_all_data
if selected_TCP:
    TCP_flt_data=TCP_all_data.loc[selected_TCP].copy()
else:
    TCP_flt_data=TCP_all_data.iloc[0:0].copy()

## calculate the sum of the both FDR and sort by the FDR sum
TCP_flt_data['negLog10_FDR_SUM']=TCP_flt_data['negLog10_FDR_A375']+TCP_flt_data['negLog10_FDR_HT29']
TCP_flt_data=TCP_flt_data.sort_values(by=['negLog10_FDR_SUM'], ascending=False)
# format only after sorting, so the sort is numeric rather than lexical
TCP_flt_data['negLog10_FDR_SUM']=TCP_flt_data['negLog10_FDR_SUM'].map(lambda x: '{:.3f}'.format(x))

## save the result file
TCP_flt_data=TCP_flt_data.reset_index()
TCP_flt_data.to_csv('result/TCP_score_viper/TCPS_merged_for_only_significant.txt',sep='\t', index=False)