In [2]:
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Inject CSS into the notebook page: widen the main container and set the
# font used for code-input and text-output areas.
display(HTML("<style>.container { width:95% !important; }</style>"))
display(HTML("<style>.input_area pre {font-family: Consolas; font-size: 14pt; line-height: 140%;}</style>"))
display(HTML("<style>.output_area pre {font-family: Consolas; font-size: 14pt; line-height: 140%;}</style>"))

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
# Later cells rely on this (e.g. a bare `df.head()` in the middle of a cell).
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import os
import glob
import datetime
import shutil
from collections import defaultdict, Counter
from copy import deepcopy
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
# Use the font shipped at this Windows path for all matplotlib text when it exists.
# On machines without the file, keep matplotlib's default font instead of failing
# the whole import cell with FileNotFoundError.
font_path = "c:/Windows/Fonts/malgun.ttf"
if os.path.exists(font_path):
    font_name = font_manager.FontProperties(fname=font_path).get_name()
    plt.rc('font', family=font_name)
else:
    font_name = None  # font not installed; matplotlib defaults stay in effect

# compounds in LINCS

In [3]:
# LINCS cell-line identifier -> name of the same cell line as written in GDSC.
LINCS2GDSC_cell = {
    'A375': 'A375',
    'A549': 'A549',
    'BT20': 'BT-20',
    'HELA.311': 'HeLa',
    'HS578T': 'Hs-578-T',
    'HT29': 'HT-29',
    'JURKAT': 'Jurkat',
    'LNCAP': 'LNCaP-Clone-FGC',
    'MCF7': 'MCF7',
    'MDAMB231': 'MDA-MB-231',
    'PC3': 'PC-3',
    'YAPC.311': 'YAPC',
}

# Reverse lookup: GDSC cell-line name -> LINCS identifier.
GDSC2LINCS_cell = {
    gdsc_name: lincs_name for lincs_name, gdsc_name in LINCS2GDSC_cell.items()
}
    
    
## Load the per-key compound lists stored in the LINCS compound-info file
## (every row of the file is returned; callers filter by cell line / time point).
def getCellCompDic_LINCS(path='data/LINCS/LINCS_cpdInfo.txt'):
    """Read a two-column, tab-separated file into ``{key: [compound, ...]}``.

    Each non-blank line must contain exactly two tab-separated fields: a key
    (callers in this notebook split it on ``'|'`` into cell and hour) and a
    ``'|'``-separated list of compound identifiers.

    Parameters
    ----------
    path : str, optional
        Location of the input file. Defaults to the path used by this notebook.

    Returns
    -------
    dict
        Maps the first field of each line to the list of identifiers obtained
        by splitting the second field on ``'|'``. If a key occurs more than
        once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly two tab-separated fields.
    """
    cellComp_dic = {}
    # `with` guarantees the handle is closed even when a malformed line raises.
    with open(path) as in_file:
        for line in in_file:
            line = line.rstrip('\r\n')
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue
            cell, comp = line.split('\t')
            cellComp_dic[cell] = comp.split('|')
    return cellComp_dic

def getLnsPCDic(lsm2cid_path='data/LINCS/LSM2Cid.txt',
                brd2lsm_path='data/LINCS/BRD2LSM.txt'):
    """Build BRD-id <-> PubChem-id lookup tables by joining two LINCS files on the LSM id.

    Parameters
    ----------
    lsm2cid_path : str, optional
        Tab-separated file with a header line and six columns per row; the
        first two columns are read as (LSM id, PubChem id).
    brd2lsm_path : str, optional
        Tab-separated file with a header line and two columns per row,
        read as (LSM id, BRD id).

    Returns
    -------
    (dict, dict)
        ``brd2pc`` maps BRD id -> PubChem id (string) and ``pc2brd`` maps
        PubChem id (string) -> BRD id. Only LSM ids present in both files
        contribute. When an id occurs on several rows, the last row wins.

    Notes
    -----
    Rows with an unexpected number of fields, rows whose PubChem id is empty
    after stripping whitespace, and rows whose BRD id does not start with
    ``'BRD'`` are skipped.
    """
    # lsm to pc
    lsm2pc = {}
    pc2lsm = {}
    with open(lsm2cid_path) as in_file:
        next(in_file, None)  # skip the header line
        for line in in_file:
            fields = line.rstrip('\n').split('\t')
            if len(fields) != 6:
                continue

            lsm = fields[0].strip()
            pc = fields[1].strip()
            # Strip before testing so a whitespace-only id is treated as missing
            # instead of being stored under the key ''.
            if pc == '':
                continue

            lsm2pc[lsm] = pc
            pc2lsm[pc] = lsm

    # brd to pc
    brd2pc = {}
    lsm2brd = {}
    with open(brd2lsm_path) as in_file:
        next(in_file, None)  # skip the header line
        for line in in_file:
            fields = line.rstrip('\n').split('\t')
            # Same policy as the first file: ignore rows of the wrong width
            # (this also covers blank lines).
            if len(fields) != 2:
                continue

            lsm = fields[0].strip()
            brd = fields[1].strip()
            if not brd.startswith('BRD'):
                continue

            lsm2brd[lsm] = brd
            if lsm in lsm2pc:
                brd2pc[brd] = lsm2pc[lsm]

    # Invert through the LSM id: PubChem id -> LSM id -> BRD id.
    pc2brd = {pc: lsm2brd[lsm] for pc, lsm in pc2lsm.items() if lsm in lsm2brd}

    return brd2pc, pc2brd

# Load the LINCS compound lists and the BRD <-> PubChem lookup tables.
lnsCellComp_dic = getCellCompDic_LINCS()
brd2pc, pc2brd = getLnsPCDic()

# For every cell line that has a GDSC counterpart, gather the unique PubChem CIDs
# (as ints) of the compounds listed under the hour value '24'.
LINCS_24h_comp_list = {}
for cell_hr, brd_ids in lnsCellComp_dic.items():
    cell_name, hour = cell_hr.split("|")
    if cell_name in LINCS2GDSC_cell and hour == '24':
        # Compounds without a known PubChem id are dropped.
        cid_set = {int(brd2pc[brd]) for brd in brd_ids if brd in brd2pc}
        LINCS_24h_comp_list[cell_name] = list(cid_set)

# Summary per cell line: name, number of CIDs, number of distinct CIDs, first five CIDs.
for cell_name, cids in LINCS_24h_comp_list.items():
    print(cell_name, len(cids), len(set(cids)), cids[:5])

A375 264 264 [5374464, 16048642, 2051, 9826308, 9952773]
A549 264 264 [5374464, 16048642, 2051, 9826308, 9952773]
BT20 107 107 [5374464, 16048642, 46843906, 9826308, 73707530]
HS578T 107 107 [5374464, 16048642, 46843906, 9826308, 73707530]
HT29 264 264 [5374464, 16048642, 2051, 9826308, 9952773]
JURKAT 60 60 [5374464, 2051, 24788740, 9868037, 156422]
LNCAP 58 58 [5374464, 46843906, 52914946, 9826308, 25226117]
MCF7 264 264 [5374464, 16048642, 2051, 9826308, 9952773]
MDAMB231 107 107 [5374464, 16048642, 46843906, 9826308, 73707530]
PC3 264 264 [5374464, 16048642, 2051, 9826308, 9952773]


# IC50 of compounds common to LINCS and GDSC

In [4]:
def is_common_compounds(GDSC_cid_list, LINCS_cid_list):
    """Find the first compound id in ``GDSC_cid_list`` that also occurs in ``LINCS_cid_list``.

    Parameters
    ----------
    GDSC_cid_list : iterable
        Candidate ids, scanned in order. Elements must be hashable.
    LINCS_cid_list : iterable
        Ids to match against. Elements must be hashable.

    Returns
    -------
    (bool, object)
        ``(True, cid)`` for the first element of ``GDSC_cid_list`` found in
        ``LINCS_cid_list``; ``(False, None)`` when the two share no element.
    """
    # One set build replaces the nested scan: O(n + m) instead of O(n * m),
    # with the same "first match in GDSC order" result.
    LINCS_cid_set = set(LINCS_cid_list)
    for GDSC in GDSC_cid_list:
        if GDSC in LINCS_cid_set:
            return True, GDSC

    return False, None

# GDSC drug name -> list of PubChem ids recorded for that drug.
GDSC2CID_df=pd.read_table("data/GDSC/1_GDSC_Pubchem_compound.txt", sep='\t', engine='python')
GDSC2CID_list_ds=GDSC2CID_df.groupby('GDSC')['Pubchem'].apply(list)
# Build the name lookup once; rebuilding a list from the index for every IC50 row
# made each membership test a linear scan.
GDSC_drug_names = set(GDSC2CID_list_ds.index)


GDSC_IC50_df = pd.read_table('data/GDSC/3_GDSC_IC50.txt', sep='\t')
# Trim surrounding whitespace so drug names can match the keys of GDSC2CID_list_ds.
GDSC_IC50_df['DRUG_NAME']=GDSC_IC50_df['DRUG_NAME'].map(lambda x: x.strip())
GDSC_IC50_df.head()

# all_ic50_dic[LINCS cell][cid] = IC50 (exp of LN_IC50, rounded to 4 decimals) for
# every GDSC drug that shares a PubChem id with the LINCS compounds of that cell line.
all_ic50_dic={}
for LINCS_cell in LINCS_24h_comp_list.keys():
    GDSC_cell=LINCS2GDSC_cell[LINCS_cell]

    GDSC_IC50_sub_df=GDSC_IC50_df.loc[GDSC_IC50_df['CELL_LINE_NAME']==GDSC_cell]

    ic50_dic={}
    for row in GDSC_IC50_sub_df.itertuples():
        if row.DRUG_NAME not in GDSC_drug_names:
            continue
        flg, cid = is_common_compounds(GDSC2CID_list_ds[row.DRUG_NAME], LINCS_24h_comp_list[LINCS_cell])
        if flg:
            # LN_IC50 is a natural log; if several rows map to the same cid, the last one wins.
            ic50_dic[cid]=round(np.e**float(row.LN_IC50),4)

    all_ic50_dic[LINCS_cell]=ic50_dic

# Rows are PubChem ids, columns are LINCS cell lines; missing combinations become NaN.
all_ic50_df=pd.DataFrame(all_ic50_dic)
out_path = 'result/GDSC_IC50s_for_common_drugs.txt'
# Create the output folder on a fresh checkout so to_csv does not fail.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
all_ic50_df.to_csv(path_or_buf=out_path, sep='\t',index=True)

Unnamed: 0,CELL_LINE_NAME,DRUG_NAME,LN_IC50
0,MC-CAR,Erlotinib,2.453524
1,ES3,Erlotinib,3.376592
2,ES5,Erlotinib,3.614664
3,ES7,Erlotinib,3.223394
4,EW-11,Erlotinib,2.486405
