In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: with "all", IPython echoes the value of every
# top-level expression statement in a cell rather than only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import os
import glob

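### - attach IC50 info to each fold-change (FC) profile induced by a compound treatment
### - select the cells to analyze using two criteria:
###     (1) more than 400 profiles with a known IC50, and
###     (2) more than 15% of those profiles labelled as effective (dose >= IC50)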

In [2]:
def get_BRD2CID_dic():
    """Build a mapping from BRD compound IDs to PubChem CIDs.

    Two tab-separated tables are read from ``./data/LINCS``:

    * ``BRD2LSM.txt`` -- columns ``SM_Center_Canonical_ID`` and
      ``SM_LINCS_ID``; only rows whose center ID starts with ``'BRD'``
      are used.
    * ``LSM2Cid.txt`` -- columns ``SM_LINCS_ID`` and ``SM_PubChem_CID``;
      rows without a CID are ignored and CIDs are converted to ``int``.

    Returns:
        dict: ``{BRD id: PubChem CID (int)}`` for every BRD id whose LINCS
        id has a known CID.  BRD ids without a resolvable CID are omitted.
    """
    # BRD to LSM mapping
    brd2lsm_df = pd.read_table(
        './data/LINCS/BRD2LSM.txt',
        index_col='SM_Center_Canonical_ID', sep='\t', engine='python',
    )
    # na=False: a missing center ID would otherwise put NaN into the mask,
    # which pandas refuses to use for boolean indexing.
    is_brd = np.asarray(brd2lsm_df.index.str.startswith('BRD', na=False), dtype=bool)
    brd2lsm = brd2lsm_df.loc[is_brd, 'SM_LINCS_ID'].to_dict()

    # LSM to CID mapping (chained on a fresh Series, so no write into a
    # filtered slice of the original frame)
    lsm2cid_df = pd.read_table(
        './data/LINCS/LSM2Cid.txt',
        index_col='SM_LINCS_ID', sep='\t', engine='python',
    )
    lsm2cid = lsm2cid_df['SM_PubChem_CID'].dropna().astype(int).to_dict()

    # Compose the two mappings, skipping LINCS ids that have no CID.
    return {brd: lsm2cid[lsm] for brd, lsm in brd2lsm.items() if lsm in lsm2cid}

def add_ic50_and_col_norm(file_name, BRD2CID, *, min_samples=400, min_effect_ratio=0.15):
    """Label each fold-change profile of one cell line as effective or not.

    The fold-change table ``file_name`` (tab-separated, first column is a
    ``compound|dose`` label) is joined with the per-cell IC50 values in
    ``./result/GDSC_IC50s_for_common_drugs.txt``.  Profiles whose compound
    has no IC50 for this cell are dropped.  Each remaining profile gets an
    ``effect`` column: 1 when ``dose >= ic50``, otherwise 0.

    The labelled table is written only when the cell passes both selection
    criteria (more than ``min_samples`` profiles and an effective fraction
    above ``min_effect_ratio``).  A ``cell,effective,total,ratio`` summary
    line is always printed.

    NOTE(review): dose and IC50 are compared directly -- confirm that both
    files express them on the same scale/units.
    NOTE(review): despite the name, no column normalization happens here.

    Args:
        file_name: Path of the fold-change file.  The cell name is the
            second-to-last ``'_'``-separated token of this path, and it must
            be a column of the IC50 table.
        BRD2CID: Mapping from BRD compound id to PubChem CID.
        min_samples: Criterion 1 -- the profile count must exceed this.
        min_effect_ratio: Criterion 2 -- the effective fraction must exceed
            this.

    Returns:
        None.
    """
    # fold change file (index: compound|dose)
    FC_df = pd.read_table(file_name, sep='\t', engine='python', index_col=0)

    # Split the labels first and reset the index before attaching the new
    # columns: aligning on the original index fails when the same
    # compound|dose label occurs more than once.
    comp_dose = [str(label).split('|') for label in FC_df.index]
    FC_df = FC_df.reset_index(drop=True)
    FC_df['compound'] = [pair[0] for pair in comp_dose]
    # builtin float: the np.float alias was removed from NumPy
    FC_df['dose'] = np.array([float(pair[1]) for pair in comp_dose], dtype=float)

    # ic50s of the common compounds for a particular cell
    cell = file_name.split('_')[-2]
    ic50_df = pd.read_table(
        './result/GDSC_IC50s_for_common_drugs.txt',
        index_col=0, sep='\t', engine='python',
    )
    ic50_dic = ic50_df[cell].dropna().to_dict()

    # compound -> CID -> ic50; keep only profiles with a known ic50
    FC_df['ic50'] = FC_df['compound'].map(BRD2CID).map(ic50_dic)
    FC_df = FC_df.drop(columns='compound')
    # .copy() so that adding 'effect' below does not write into a view
    FC_df = FC_df[FC_df['ic50'].notna()].copy()

    # get effect info from ic50 and the treated dose
    FC_df['effect'] = np.where(FC_df['dose'] >= FC_df['ic50'], 1, 0)
    FC_df = FC_df.drop(columns=['ic50', 'dose'])

    # cell selection criteria 1 & 2
    tot = len(FC_df)
    eff = int(FC_df['effect'].sum())
    ratio = eff / tot if tot else float('nan')
    if (tot > min_samples) and (ratio > min_effect_ratio):
        out_file_name = file_name.replace("foldchange", "foldchange_with_effect").replace("all", 'effect')
        # The replaced path usually points into a different directory;
        # make sure it exists before writing.
        out_dir = os.path.dirname(out_file_name)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        FC_df.to_csv(out_file_name, sep='\t', index=False)

    print('{},{},{},{:.3f}'.format(cell, eff, tot, ratio))
        
# Build the BRD -> PubChem CID mapping once and reuse it for every cell line.
BRD2CID = get_BRD2CID_dic()

# glob.glob() returns paths in arbitrary, filesystem-dependent order; sort
# them so the per-cell summary lines are printed in a reproducible order.
file_names = sorted(glob.glob('result/foldchange/*.txt'))
for file_name in file_names:
    add_ic50_and_col_norm(file_name, BRD2CID)

A375,119,456,0.261
A549,55,210,0.262
BT20,30,228,0.132
HS578T,37,221,0.167
HT29,78,456,0.171
JURKAT,4,31,0.129
LNCAP,33,167,0.198
MCF7,48,450,0.107
MDAMB231,32,228,0.140
PC3,49,402,0.122
