In [4]:
%reset
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import glob
import os

import numpy as np
import pandas as pd
import scipy.stats
import statsmodels.sandbox.stats.multicomp as mulcomp
import statsmodels.stats.multitest as multest

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


### Compute TT (Therapeutic TF) scores by applying Welch's t-test to TF activities in the effective vs. ineffective groups

In [5]:
def perform_ttest(file_name):
    """Compute per-TF TT (Therapeutic TF) scores for one cell line.

    The input is a tab-separated table containing an ``effect`` column
    (1 = effective group, 0 = ineffective group) and one column per TF
    activity. For every TF column, the activities of the two groups are
    compared with Welch's unequal-variance t-test.

    Parameters
    ----------
    file_name : str
        Path to the TF-activity table. The cell-line name is taken from the
        second-to-last ``_``-separated token of the file's base name
        (e.g. ``..._A375_v.txt`` -> ``A375``).

    Returns
    -------
    TT_score : pandas.DataFrame
        One row per TF, sorted by descending ``TTS``, with the columns
        ``TFAs Ave.(E)``, ``TFAs Ave.(IE)``, ``TTS``, ``p_val``, ``FDR`` and
        ``negLog10_FDR``.
    cell : str
        Cell-line name parsed from ``file_name``.

    Raises
    ------
    ValueError
        If the base name of ``file_name`` contains no ``_`` separator, so no
        cell-line token can be extracted.

    Notes
    -----
    The ``FDR`` column keeps its name because downstream code reads it, but it
    is computed with ``method='bonferroni'``, i.e. it holds Bonferroni-adjusted
    p-values.
    """
    # Parse the cell name from the base name only, so that underscores in the
    # directory part of the path (e.g. "TF_activity_viper") cannot leak into it.
    name_parts = os.path.basename(file_name).split('_')
    if len(name_parts) < 2:
        raise ValueError(
            "cannot parse a cell name from {!r}: expected '<...>_<cell>_<suffix>'".format(file_name)
        )
    cell = name_parts[-2]
    print('#### ', cell)

    anal_data = pd.read_table(file_name, sep='\t', engine='python')

    # The group masks do not depend on the TF column, so build them once.
    is_effective = anal_data['effect'] == 1
    is_ineffective = anal_data['effect'] == 0

    TT_score = {}
    for col in anal_data.columns:
        if col == 'effect':
            continue

        pos_set = anal_data.loc[is_effective, col]
        neg_set = anal_data.loc[is_ineffective, col]

        # equal_var=False -> Welch's t-test (no equal-variance assumption)
        TTS, p_val = scipy.stats.ttest_ind(pos_set, neg_set, equal_var=False)

        TT_score[col] = [pos_set.mean(), neg_set.mean(), TTS, p_val]

    # Label the four statistics while building the frame, then put TFs on the rows.
    TT_score = pd.DataFrame(
        TT_score, index=['TFAs Ave.(E)', 'TFAs Ave.(IE)', 'TTS', 'p_val']
    ).T
    TT_score = TT_score.sort_values(by='TTS', ascending=False)
    # [1] selects the adjusted p-values from the multipletests result tuple.
    TT_score['FDR'] = mulcomp.multipletests(TT_score['p_val'], method='bonferroni')[1]
    TT_score['negLog10_FDR'] = -np.log10(TT_score['FDR'])

    return TT_score, cell
    
    
print('## viper')
# glob returns files in a platform-dependent order; sorting makes the processing
# order (and therefore the column order of the merged table) reproducible.
file_names = sorted(glob.glob('result/TF_activity_viper/*v.txt'))
if not file_names:
    raise FileNotFoundError("no input files match 'result/TF_activity_viper/*v.txt'")

formatted_tables = []  ## per-cell formatted tables, merged once after the loop (only for viper)
tts_by_cell = {}       ## full-precision TTS per cell, used for the TTS sum

for file_name in file_names:
    TT_score, cell = perform_ttest(file_name)
    TT_score.index.name = 'TF'
    TT_score.to_csv('result/TT_score_viper/TTS_{}.txt'.format(cell), sep='\t')

    ## keep the unrounded TTS so that the sum is not computed from rounded strings
    tts_by_cell[cell] = TT_score['TTS'].copy()

    ## represent data with short format
    for col in ['TFAs Ave.(E)', 'TFAs Ave.(IE)', 'TTS', 'negLog10_FDR']:
        TT_score[col] = TT_score[col].map('{:.3f}'.format)

    for col in ['p_val', 'FDR']:
        TT_score[col] = TT_score[col].map('{:.3e}'.format)

    ## change columns with cell name
    TT_score.columns = TT_score.columns + '_' + cell
    formatted_tables.append(TT_score)

## merge the per-cell tables side by side on the TF index (single concat, not one per iteration)
TT_score_merged = pd.concat(formatted_tables, axis=1, sort=True)
TT_score_merged.index.name = 'TF'

## calculate the sum of both TTS (from the full-precision values) and sort by the TTS sum;
## both the addition and the column assignment align on the TF index
TT_score_merged['TTS_SUM'] = tts_by_cell['A375'] + tts_by_cell['HT29']
TT_score_merged = TT_score_merged.sort_values(by=['TTS_SUM'], ascending=False)
TT_score_merged['TTS_SUM'] = TT_score_merged['TTS_SUM'].map('{:.3f}'.format)

## save the results
TT_score_merged.to_csv('result/TT_score_viper/TTS_merged.txt', sep='\t')

## viper
####  A375
####  HT29


### Identify significant TTs (positive and negative) in each cell line, and in both, at a fixed FDR cutoff

In [6]:
## cutoff applied to the 'FDR' column of each per-cell TT-score table
TTS_FDR = 1.0e-05

pos_TT_by_cell = {}
neg_TT_by_cell = {}

## the context manager closes the file even if reading or writing fails part-way
with open('result/TT_score_viper/sig_TT.txt', 'w') as TT_result:
    for cell in ['A375', 'HT29']:
        TTS = pd.read_table('result/TT_score_viper/TTS_{}.txt'.format(cell), sep='\t', index_col=0, engine='python')

        significant = TTS['FDR'] < TTS_FDR
        ## sorted() gives a reproducible TF order in the output (set order varies between runs)
        pos_TT = sorted(set(TTS.loc[significant & (TTS['TTS'] > 0)].index))
        neg_TT = sorted(set(TTS.loc[significant & (TTS['TTS'] < 0)].index))

        ## "_ =" keeps the return value of write() from being echoed as cell output
        _ = TT_result.write('{}(pos)\t{}\t{}\n'.format(cell, len(pos_TT), ','.join(pos_TT)))
        _ = TT_result.write('{}(neg)\t{}\t{}\n'.format(cell, len(neg_TT), ','.join(neg_TT)))

        pos_TT_by_cell[cell] = set(pos_TT)
        neg_TT_by_cell[cell] = set(neg_TT)

    ## TFs that are significant, with the same sign, in every cell line
    pos_TT_both = sorted(set.intersection(*pos_TT_by_cell.values()))
    neg_TT_both = sorted(set.intersection(*neg_TT_by_cell.values()))

    _ = TT_result.write('both(pos)\t{}\t{}\n'.format(len(pos_TT_both), ','.join(pos_TT_both)))
    _ = TT_result.write('both(neg)\t{}\t{}\n'.format(len(neg_TT_both), ','.join(neg_TT_both)))