In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import os
import glob

### common compounds (BRD id) between LINCS and GDSC from 'GDSC_IC50s_for_common_drugs.txt'

In [3]:
# cid to LSM mapping
# Rows without a PubChem CID are dropped first so that the CID column can be cast to int.
# The frame is built in a single chain rather than by assigning into a filtered slice,
# which is the pattern that makes pandas emit SettingWithCopyWarning.
LSM2cid_df = (
    pd.read_table('./data/LINCS/LSM2Cid.txt', sep='\t', engine='python')
    .dropna(subset=['SM_PubChem_CID'])
    .astype({'SM_PubChem_CID': int})
    .set_index('SM_PubChem_CID')
)
cid2LSM_ds = LSM2cid_df['SM_LINCS_ID']
cid2LSM = cid2LSM_ds.to_dict()  # {SM_PubChem_CID (int): SM_LINCS_ID}

# LSM to BRD mapping
LSM2BRD_df = pd.read_table('./data/LINCS/BRD2LSM.txt', index_col='SM_LINCS_ID', sep='\t', engine='python')
LSM2BRD_ds = LSM2BRD_df['SM_Center_Canonical_ID']
# Keep only canonical ids that start with 'BRD'; na=False treats a missing id as
# "does not match" instead of producing a NaN mask that cannot be used for indexing.
LSM2BRD_ds = LSM2BRD_ds[LSM2BRD_ds.str.startswith('BRD', na=False)]
LSM2BRD = LSM2BRD_ds.to_dict()  # {SM_LINCS_ID: SM_Center_Canonical_ID}

def get_LINCS_BRD_list(cell):
    """Return the BRD ids of the compounds that have an IC50 value for `cell`.

    Reads './result/GDSC_IC50s_for_common_drugs.txt', takes the column named
    `cell`, discards rows whose value is missing, and translates every remaining
    index label through the module-level dictionaries `cid2LSM` and then
    `LSM2BRD` (index labels are presumably PubChem CIDs -- confirm against the
    input file).

    Parameters
    ----------
    cell : str
        Column name in the IC50 table.

    Returns
    -------
    list
        Translated ids in table order, followed by the literal 'DMSO'.

    Raises
    ------
    KeyError
        If `cell` is not a column, or an index label / intermediate id is
        missing from `cid2LSM` / `LSM2BRD`.
    """
    ic50_table = pd.read_table('./result/GDSC_IC50s_for_common_drugs.txt', sep='\t',index_col=0, engine='python')
    measured = ic50_table[cell].dropna()
    # index label -> cid2LSM -> LSM2BRD
    brd_ids = [LSM2BRD[cid2LSM[cid]] for cid in measured.index]
    # the vehicle is always appended as the last entry
    return brd_ids + ['DMSO']

# main
# One list of BRD ids (plus 'DMSO') per cell line, keyed by cell-line name.
CELL_LINES = ['A375','A549','BT20','HS578T','HT29','JURKAT','LNCAP','MCF7','MDAMB231','PC3']
BRD_dic = {cell_name: get_LINCS_BRD_list(cell_name) for cell_name in CELL_LINES}

# Summary: cell line, number of entries, and the last four entries.
for key, val_list in BRD_dic.items():
    print(key, len(val_list), val_list[-4:])
    

A375 78 ['BRD-K83189926', 'BRD-K12343256', 'BRD-K09951645', 'DMSO']
A549 36 ['BRD-K83189926', 'BRD-K12343256', 'BRD-K09951645', 'DMSO']
BT20 40 ['BRD-K57080016', 'BRD-K16189898', 'BRD-K12343256', 'DMSO']
HS578T 39 ['BRD-K57080016', 'BRD-K16189898', 'BRD-K12343256', 'DMSO']
HT29 78 ['BRD-K83189926', 'BRD-K12343256', 'BRD-K09951645', 'DMSO']
JURKAT 7 ['BRD-K99616396', 'BRD-K74065929', 'BRD-K49865102', 'DMSO']
LNCAP 30 ['BRD-K88573743', 'BRD-K90382497', 'BRD-K19687926', 'DMSO']
MCF7 77 ['BRD-K83189926', 'BRD-K12343256', 'BRD-K09951645', 'DMSO']
MDAMB231 40 ['BRD-K57080016', 'BRD-K16189898', 'BRD-K12343256', 'DMSO']
PC3 69 ['BRD-K83189926', 'BRD-K12343256', 'BRD-K09951645', 'DMSO']


### Metadata (lines 3–15) of the LINCS Level 3 GCT file

In [4]:
# main
# Read only the top of the GCT file: the first two lines are skipped, line 3 is
# used as the column names and lines 4-15 become the rows of `lns_df`.
# The `with` block guarantees the handle is closed even if reading fails.
data_dList = []
with open('E:/LINCS_data/Broad_LINCS_Level3_INF_mlr12k_n113012x22268_2015-12-31.gct') as lns_file:
    for lineCnt, line in enumerate(lns_file, start=1):
        if lineCnt <= 2:
            continue
        if lineCnt == 16:
            # stop before the remaining rows so the rest of the file is never read
            break

        line_list = line.replace('\n','').replace('\r','').split('\t')
        data_dList.append(line_list)

# first collected line -> column names; the 'id' column becomes the row index
lns_df = pd.DataFrame(data_dList[1:],columns=data_dList[0]).set_index('id')
lns_df.iloc[:,1000:1010]

Unnamed: 0_level_0,A375-trt_cp-22526,A375-trt_cp-22527,A375-trt_cp-22528,A375-trt_cp-22529,A375-trt_cp-22530,A375-trt_cp-22531,A375-trt_cp-22532,A375-trt_cp-22533,A375-trt_cp-22534,A375-trt_cp-22535
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
cl_center_specific_id,A375,A375,A375,A375,A375,A375,A375,A375,A375,A375
rn_target_gene_id,-666,-666,-666,-666,-666,-666,-666,-666,-666,-666
sm_center_compound_id,BRD-K19220233-001-04-9,BRD-K19220233-001-04-9,BRD-K19220233-001-04-9,BRD-K19220233-001-04-9,BRD-K19220233-001-04-9,BRD-K19220233-001-04-9,BRD-K46056750-001-04-7,BRD-K46056750-001-04-7,BRD-K46056750-001-04-7,BRD-K46056750-001-04-7
sm_dose,10.0,3.33,1.11,0.37,0.12,0.04,10.0,3.33,1.11,0.37
sm_dose_unit,um,um,um,um,um,um,um,um,um,um
sm_lincs_id,BRD-K19220233,BRD-K19220233,BRD-K19220233,BRD-K19220233,BRD-K19220233,BRD-K19220233,BRD-K46056750,BRD-K46056750,BRD-K46056750,BRD-K46056750
sm_name,JNK-9L,JNK-9L,JNK-9L,JNK-9L,JNK-9L,JNK-9L,AZD-7762,AZD-7762,AZD-7762,AZD-7762
sm_pert_type,trt_cp,trt_cp,trt_cp,trt_cp,trt_cp,trt_cp,trt_cp,trt_cp,trt_cp,trt_cp
sm_time,24,24,24,24,24,24,24,24,24,24
sm_time_unit,h,h,h,h,h,h,h,h,h,h


### LINCS experiment IDs of the vehicle (DMSO) and the compounds shared with GDSC, for each cell line

In [6]:
# main
# Select, per cell line, the experiment ids (columns of `lns_df`) that
#   1. belong to that cell line        (row 'cl_center_specific_id'),
#   2. have sm_time equal to '24'      (row 'sm_time'),
#   3. use a compound in BRD_dic[cell] (row 'sm_lincs_id'; any casing of 'dmso'
#      is treated as 'DMSO').
# The three annotation rows are pulled out once and compared as whole Series,
# instead of doing scalar .loc lookups for every column of every cell line.
# Column order is preserved.
cell_row = lns_df.loc['cl_center_specific_id']
is_24h = lns_df.loc['sm_time'] == '24'
comp_row = lns_df.loc['sm_lincs_id']
comp_row = comp_row.where(comp_row.str.upper() != 'DMSO', 'DMSO')

lns_compExp_dic = {}
for cell in BRD_dic.keys():
    keep = (cell_row == cell) & is_24h & comp_row.isin(BRD_dic[cell])
    lns_compExp_dic[cell] = keep.index[keep.to_numpy()].tolist()

# Summary: cell line, number of selected experiments, and the last three ids.
for key, val_list in lns_compExp_dic.items():
    print(key, len(val_list), val_list[-3:])

A375 1831 ['A375-trt_poscon-68908', 'A375-trt_poscon-69279', 'A375-trt_poscon-69655']
A549 1763 ['A549-trt_poscon-71114', 'A549-trt_poscon-71486', 'A549-trt_poscon-71859']
BT20 935 ['BT20-trt_cp-3605', 'BT20-trt_cp-3606', 'BT20-trt_cp-3607']
HS578T 807 ['HS578T-trt_cp-9516', 'HS578T-trt_cp-9517', 'HS578T-trt_cp-9518']
HT29 1833 ['HT29-trt_poscon-75606', 'HT29-trt_poscon-75984', 'HT29-trt_poscon-76354']
JURKAT 106 ['JURKAT-trt_cp-95188', 'JURKAT-trt_poscon-94504', 'JURKAT-trt_poscon-94865']
LNCAP 548 ['LNCAP-trt_cp-37017', 'LNCAP-trt_cp-37018', 'LNCAP-trt_cp-37019']
MCF7 2082 ['MCF7-trt_poscon-76728', 'MCF7-trt_poscon-77092', 'MCF7-trt_poscon-77471']
MDAMB231 1181 ['MDAMB231-trt_cp-44318', 'MDAMB231-trt_cp-44319', 'MDAMB231-trt_cp-44320']
PC3 1642 ['PC3-trt_cp-78978', 'PC3-trt_cp-78979', 'PC3-trt_cp-78980']


### expression profiles of the collected LINCS experiments IDs

In [7]:
def getColIndex(line_list, colName_list):
    """Return the position in `line_list` of every name in `colName_list`.

    Parameters
    ----------
    line_list : list
        Header fields (hashable values, e.g. strings from a split line).
    colName_list : list
        Names to locate, in the order the positions should be returned.

    Returns
    -------
    list of int
        One position per requested name. If a name occurs more than once in
        `line_list`, the position of its first occurrence is used.

    Raises
    ------
    ValueError
        If a requested name is not present; the message names the first
        missing column.
    """
    # Build the name -> first position table once, so each lookup is O(1)
    # instead of a linear scan of the header per requested column.
    first_pos = {}
    for pos, name in enumerate(line_list):
        first_pos.setdefault(name, pos)

    colIndex_list = []
    for colName in colName_list:
        if colName not in first_pos:
            raise ValueError('Column name does not exist: {!r}'.format(colName))
        colIndex_list.append(first_pos[colName])

    return colIndex_list

def getCols(fileName, colNameLine, colName_list):
    """Extract selected columns from a tab-separated text file.

    Parameters
    ----------
    fileName : str
        Path of the file to read.
    colNameLine : int
        1-based number of the line that holds the column names. Lines before
        it are skipped.
    colName_list : list of str
        Column names to extract, in output order.

    Returns
    -------
    list of list of str
        One list per line from the header line onward (the header line itself
        is the first entry), holding only the requested columns. Empty if the
        file has fewer than `colNameLine` lines.

    Raises
    ------
    ValueError
        Propagated from `getColIndex` when a requested name is not in the header.
    IndexError
        If a line has fewer fields than a selected column position.

    Notes
    -----
    Prints a progress message every 10000 lines. The file is opened in a
    `with` block so it is closed even when one of the errors above is raised.
    """
    col_dList = []
    colIndex_list = None  # set once the header line has been seen

    with open(fileName) as in_file:
        for lineCnt, line in enumerate(in_file, start=1):
            line_list = line.replace('\n','').split('\t')

            if lineCnt == colNameLine:
                colIndex_list = getColIndex(line_list, colName_list)

            # header line and everything after it are collected
            if colIndex_list is not None:
                col_dList.append([line_list[i] for i in colIndex_list])

            if lineCnt%10000==0:
                print("{} completed".format(lineCnt))

    return col_dList

# main
# For every cell line, pull the gene annotation columns plus the selected
# experiment columns out of the Level 3 GCT file (header on line 3) and write
# them to result/LINCS_expression/LINCS_expression_<cell>.txt.
# NOTE: getCols re-reads the whole GCT file once per cell line.
os.makedirs('result/LINCS_expression', exist_ok=True)  # do not assume the folder exists

for cell, lns_compExp_list in lns_compExp_dic.items():
    print("##", cell)
    dList = getCols('E:/LINCS_data/Broad_LINCS_Level3_INF_mlr12k_n113012x22268_2015-12-31.gct',3,['id','pr_gene_id','pr_gene_symbol'] + lns_compExp_list)

    # `with` closes the file even if a write fails; writelines returns None, so
    # nothing is echoed as cell output under ast_node_interactivity = "all".
    with open('result/LINCS_expression/LINCS_expression_{}.txt'.format(cell),'w') as lnsFil_file:
        lnsFil_file.writelines('\t'.join(sList)+'\n' for sList in dList)

## A375
10000 completed
20000 completed
## A549
10000 completed
20000 completed
## BT20
10000 completed
20000 completed
## HS578T
10000 completed
20000 completed
## HT29
10000 completed
20000 completed
## JURKAT
10000 completed
20000 completed
## LNCAP
10000 completed
20000 completed
## MCF7
10000 completed
20000 completed
## MDAMB231
10000 completed
20000 completed
## PC3
10000 completed
20000 completed


### Fold change for each compound at each dose

In [8]:
def quantileNormalize(df_input):
    """Quantile-normalize the columns of a DataFrame.

    Each column is sorted independently; the mean across columns of the k-th
    smallest values becomes the reference value for rank k. Every entry is then
    replaced by the reference value of its rank within its own column. Tied
    entries all receive the reference value of the lowest rank they occupy
    (left-most position found by np.searchsorted).

    Parameters
    ----------
    df_input : pandas.DataFrame
        Numeric frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `df_input` with the same index and columns and normalized values.
    """
    normalized = df_input.copy()

    # Row k of this frame holds the k-th smallest value of every column.
    ordered_cols = pd.DataFrame({name: sorted(normalized[name]) for name in normalized})
    reference = ordered_cols.mean(axis = 1).tolist()

    # Replace every value by the reference value of its (lowest) rank.
    for name in normalized:
        column = normalized[name]
        positions = np.searchsorted(np.sort(column), column)
        normalized[name] = [reference[pos] for pos in positions]
    return normalized

## main
# For every cell line: read its expression file, quantile-normalize each plate
# separately, average probes per gene symbol, average replicates per
# (compound, dose), subtract the plate's DMSO profile, then average the resulting
# fold changes over all plates and write them to result/foldchange/FC_<cell>_all.txt.
os.makedirs('result/foldchange', exist_ok=True)  # do not assume the folder exists

for cell in lns_compExp_dic.keys():
    in_file='result/LINCS_expression/LINCS_expression_{}.txt'.format(cell)
    print(cell)

    # read expression data in LINCS per cell (everything as str; the expression
    # rows are converted to float per plate below)
    # NOTE: this rebinds `lns_df`, which an earlier cell uses for the metadata-only
    # frame; re-run that cell before re-running the experiment-selection cell.
    lns_df = pd.read_table(in_file, sep='\t', index_col = 'id', dtype = 'str')

    # get distinct plate names in order of first appearance (deterministic, unlike
    # iterating a set); '-666' is the value carried by the non-experiment columns
    plate_row = lns_df.loc['det_plate']
    pt_list = [pt for pt in plate_row.unique() if pt != '-666']

    # gene symbol of every expression row: rows 0-11 are treated as annotation rows
    # and column 1 is taken as the gene symbol (grouped on as 'pr_gene_symbol' below).
    # Identical for every plate, so it is sliced once per cell line.
    lnsG_df = lns_df.iloc[12:,[1]]

    # per-plate results are collected here and concatenated once after the loop
    # (growing a DataFrame with pd.concat inside the loop copies it every time)
    fold_change_list = []
    for pt in pt_list:
        # get expression and head data of this plate
        lnsDatHead_df = lns_df.loc[:,plate_row==pt]

        # get only head data
        lnsHead_df = lnsDatHead_df.loc[['sm_lincs_id','sm_dose']]

        # get only expression data, converted to numeric
        lnsDat_df = lnsDatHead_df.iloc[12:].astype(float)

        # quantile normalization within the plate
        lnsDat_df = quantileNormalize(lnsDat_df)

        # attach gene info
        lnsGdat_df = pd.concat([lnsG_df, lnsDat_df], axis=1)

        # get expression average per gene; the '-666' group is discarded
        gene_expr_df = lnsGdat_df.groupby('pr_gene_symbol').mean().drop('-666')

        # experiments as rows, with compound id and dose next to the gene columns
        gene_expr_T_df = pd.concat([lnsHead_df.T, gene_expr_df.T],axis=1)

        # get mean expression per each combination of compound and dose
        ave_gene_expr_df=gene_expr_T_df.groupby(['sm_lincs_id','sm_dose']).mean()

        # log2(compound expression) - log2(vehicle expression)
        # -> log2 is NOT applied because the provided expression data are already
        #    log2 scale (http://www.lincscloud.org/how-the-data-were-prepared/)
        vehicle_ds = ave_gene_expr_df.loc[('DMSO','-666')]
        fold_change_df = (ave_gene_expr_df - vehicle_ds).drop(('DMSO','-666'),axis=0)
        fold_change_list.append(fold_change_df)

    # an empty frame is kept as the fallback when there are no plates
    all_fold_change_df = pd.concat(fold_change_list) if fold_change_list else pd.DataFrame()

    if cell=='A375':
        # bare expression: rendered as cell output (the notebook sets
        # ast_node_interactivity = "all")
        all_fold_change_df.head()

    all_fold_change_df.reset_index(inplace=True)

    # final fold change profiles that were obtained by averaging in terms of compound and dose across all plates
    all_mean_fold_change_df=all_fold_change_df.groupby(['sm_lincs_id','sm_dose']).mean()

    all_mean_fold_change_df.reset_index(inplace=True)

    # row label '<sm_lincs_id>|<sm_dose>' replaces the two key columns
    all_mean_fold_change_df['sm_info']=all_mean_fold_change_df['sm_lincs_id']+'|'+all_mean_fold_change_df['sm_dose']
    all_mean_fold_change_df.drop(['sm_lincs_id','sm_dose'], axis=1, inplace=True)

    all_mean_fold_change_df.set_index('sm_info',inplace=True)
    all_mean_fold_change_df.index.name=None

    all_mean_fold_change_df.to_csv('result/foldchange/FC_{}_all.txt'.format(cell),sep='\t')

A375


Unnamed: 0_level_0,Unnamed: 1_level_0,A1CF,A2M,A4GALT,A4GNT,AAAS,AACS,AADAC,AAGAB,AAK1,AAMP,...,ZSCAN5A,ZSWIM1,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYX,ZZEF1,ZZZ3
sm_lincs_id,sm_dose,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BRD-A60245366,0.04,0.256851,2.448213,0.126862,0.290282,0.328587,0.246914,0.651502,-0.235247,0.172556,-0.094543,...,0.355457,-0.217624,0.272229,-0.404283,0.237238,0.118999,0.383996,0.444382,-0.212638,0.3656
BRD-A60245366,0.12,0.031122,-0.088582,0.097001,-0.034754,0.127819,1.047743,0.177853,0.149662,0.139014,0.219648,...,-0.070354,-0.219774,0.219788,0.389826,-0.830792,-0.132041,0.279104,0.342082,0.145058,0.048947
BRD-A60245366,0.37,-0.923273,0.851084,0.148982,-0.204872,0.304941,0.941665,-0.683736,-0.366955,0.19614,-0.016389,...,0.958951,-0.322851,0.134843,0.474036,-0.324154,0.128582,0.626927,-0.33801,-0.150725,0.316267
BRD-A60245366,1.11,-0.829112,0.558355,-0.135148,0.16876,-0.161189,-0.223362,-0.353083,0.275557,0.182784,0.002068,...,0.241552,-0.285687,0.106145,-0.089916,-0.390092,0.386865,-0.029518,0.273482,0.384518,0.563967
BRD-A60245366,3.33,-0.187211,1.816786,0.839205,0.701664,0.275922,-0.115141,-0.298756,0.478654,0.426614,0.661217,...,-1.139485,-0.361136,-0.504903,-2.314603,-0.121616,0.094482,0.090648,0.317982,0.487675,-0.246463


A549
BT20
HS578T
HT29
JURKAT
LNCAP
MCF7
MDAMB231
PC3
