In [1]:
# Notebook display setup.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS: 95%-wide `.container`, and 16pt Consolas with 140% line height
# for the <pre> elements inside `.input_area` and `.output_area`.
display(HTML("<style>.container { width:95% !important; }</style>"))
display(HTML("<style>.input_area pre {font-family: Consolas; font-size: 16pt; line-height: 140%;}</style>"))
display(HTML("<style>.output_area pre {font-family: Consolas; font-size: 16pt; line-height: 140%;}</style>"))

from IPython.core.interactiveshell import InteractiveShell
# "all": show the value of every top-level expression in a cell, not only the
# last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import os
import glob
import datetime
import shutil
from collections import defaultdict, Counter
from copy import deepcopy
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
# Font used for figure text. The file name suggests Malgun Gothic (presumably
# chosen so Korean labels render -- confirm). This path only exists on Windows,
# so guard it instead of letting the whole setup cell fail elsewhere.
FONT_PATH = "c:/Windows/Fonts/malgun.ttf"
if os.path.isfile(FONT_PATH):
    font_name = font_manager.FontProperties(fname=FONT_PATH).get_name()
    plt.rc('font', family=font_name)
else:
    # Keep matplotlib's default font; `font_name` stays defined for later cells.
    font_name = None
    print("Font file not found ({}); using matplotlib's default font.".format(FONT_PATH))

# Compute TF activity from each gene's fold change (FC); the BRD-to-CID mapping (brd2cid) is also performed here

In [2]:
def get_BRD2CID_dic(brd2lsm_path='./data/LINCS/BRD2LSM.txt',
                    lsm2cid_path='./data/LINCS/LSM2Cid.txt'):
    """Build a BRD-ID -> PubChem CID dictionary by chaining two lookup tables.

    Parameters
    ----------
    brd2lsm_path : str, optional
        Tab-separated file with the columns 'SM_Center_Canonical_ID' and
        'SM_LINCS_ID'. Only rows whose center ID starts with 'BRD' are used.
    lsm2cid_path : str, optional
        Tab-separated file with the columns 'SM_LINCS_ID' and
        'SM_PubChem_CID'. Rows with a missing CID are ignored.

    Returns
    -------
    dict
        Maps each 'BRD...' center ID to an integer PubChem CID. BRD IDs whose
        LINCS ID has no CID entry are left out.
    """
    # BRD to LSM mapping
    brd2lsm_df = pd.read_table(brd2lsm_path, index_col='SM_Center_Canonical_ID',
                               sep='\t', engine='python')
    # Cast labels to str first so missing/non-string IDs simply fail the
    # prefix test instead of breaking the boolean mask.
    is_brd = brd2lsm_df.index.astype(str).str.startswith('BRD')
    brd_to_lsm = brd2lsm_df.loc[is_brd, 'SM_LINCS_ID'].to_dict()

    # LSM to CID mapping
    lsm2cid_df = pd.read_table(lsm2cid_path, index_col='SM_LINCS_ID',
                               sep='\t', engine='python')
    # Work on the column itself rather than assigning into a filtered frame,
    # which avoids pandas' SettingWithCopyWarning.
    lsm_to_cid = lsm2cid_df['SM_PubChem_CID'].dropna().astype(int).to_dict()

    return {
        brd_id: lsm_to_cid[lsm_id]
        for brd_id, lsm_id in brd_to_lsm.items()
        if lsm_id in lsm_to_cid
    }

def get_TF_targets_directed_only(file_name, out_file_name, trust_dic, brd2cid=None):
    """Score TF activity per row of a fold-change table and write it to disk.

    For each TF, the fold changes of its target genes that are present in the
    table are summed with sign +1 for 'Activation' entries and -1 for
    'Repression' entries, then divided by the total number of target genes
    used for that TF. Other regulation types are ignored, and TFs without any
    target gene in the table are dropped. A missing fold change makes the
    affected row's score NaN.

    Parameters
    ----------
    file_name : str
        Tab-separated fold-change table; the first column is the row label,
        expected to look like '<compound>|<dose>', and the remaining columns
        are genes.
    out_file_name : str
        Path of the tab-separated output. Its parent directory is created if
        needed. Columns are one per TF followed by 'compound' and 'dose'.
    trust_dic : dict
        Maps (TF, regulation_type) tuples to lists of target gene names.
    brd2cid : dict, optional
        Maps the compound part of each row label to the value stored in the
        'compound' column (unmapped compounds become NaN). When omitted, the
        notebook-level ``BRD2CID`` dictionary is used.

    Returns
    -------
    None
        The result is written to ``out_file_name``.
    """
    if brd2cid is None:
        # Backward compatibility: earlier callers rely on the global mapping.
        brd2cid = BRD2CID

    gene_FC_df = pd.read_table(file_name, sep='\t', engine='python', index_col=0)
    measured_genes = set(gene_FC_df.columns)

    signed_sum_by_TF = {}
    regGeneNum = defaultdict(int)

    for (TF, typ), regGene_list in trust_dic.items():
        if typ not in ('Activation', 'Repression'):
            continue

        # Sorted so the column order (and hence the summation order) is stable.
        common_gene_list = sorted(set(regGene_list) & measured_genes)

        ## drop TFs targeting no genes
        if not common_gene_list:
            continue

        regGeneNum[TF] += len(common_gene_list)

        # Vectorised row sum; skipna=False keeps NaN propagation, matching a
        # plain element-by-element sum over the row.
        each_TF_FC_ds = gene_FC_df[common_gene_list].sum(axis=1, skipna=False)
        if typ == 'Repression':
            each_TF_FC_ds = -each_TF_FC_ds

        if TF in signed_sum_by_TF:
            signed_sum_by_TF[TF] = signed_sum_by_TF[TF] + each_TF_FC_ds
        else:
            signed_sum_by_TF[TF] = each_TF_FC_ds

    if signed_sum_by_TF:
        # Average over the number of target genes that contributed to each TF.
        TF_activity_df = pd.DataFrame(
            {TF: total / regGeneNum[TF] for TF, total in signed_sum_by_TF.items()}
        )
    else:
        # No TF matched: still emit one row per input row.
        TF_activity_df = pd.DataFrame(index=gene_FC_df.index)

    # Split each row label at the first '|'. Reading the index directly avoids
    # depending on the column name that reset_index() would generate.
    compounds, doses = [], []
    for label in TF_activity_df.index:
        compound, sep, dose = str(label).partition('|')
        compounds.append(compound)
        doses.append(dose if sep else None)

    TF_activity_df = TF_activity_df.reset_index(drop=True)
    TF_activity_df['compound'] = pd.Series(compounds, dtype=object).map(brd2cid)
    TF_activity_df['dose'] = doses

    out_dir = os.path.dirname(out_file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    TF_activity_df.to_csv(out_file_name, sep='\t', index=False)

# main
# BRD -> PubChem CID lookup; get_TF_targets_directed_only reads this global.
BRD2CID=get_BRD2CID_dic()

# TF/target table: the file has no header row and only its first three columns
# are read, then named TF, target and type.
trust_df=pd.read_table('data/TRRUST2_TF/trrust2_TF_target.tsv', sep='\t',engine='python', header=None, usecols=[0,1,2])
trust_df.columns=['TF','target','type']
# {(TF, type): [target, ...]}
trust_dic=trust_df.groupby(['TF','type'])['target'].apply(list).to_dict()

# Make sure the destination exists so the first write cannot fail on a fresh checkout.
out_dir='result/TF_activity/comp_dose'
os.makedirs(out_dir, exist_ok=True)

# Sorted for a reproducible processing order (glob gives no ordering guarantee).
file_names=sorted(glob.glob('result/foldchange/*_all.txt'))
for file_name in file_names:
    # Parse the bare file name so directory components never leak into `cell`.
    base_name=os.path.splitext(os.path.basename(file_name))[0]
    if 'JURKAT' in base_name:
        continue
    # The last two '_'-separated tokens of the file name are the cell line and the mode.
    cell, mode = base_name.split('_')[-2:]
    print(cell,mode)
    out_file_name = '{}/TFA_CompDose_{}_{}.txt'.format(out_dir,cell,mode)
    get_TF_targets_directed_only(file_name, out_file_name, trust_dic)

A375 all
A549 all
BT20 all
HS578T all
HT29 all
LNCAP all
MCF7 all
MDAMB231 all
PC3 all
