# 01 Data preparation

Process the xlsx files from the data folder into suitable model inputs and generate the other input files.

In [3]:
import pandas as pd
import numpy as np
import os
import pickle

In [4]:
# Input location: the annotation workbooks (`info_dir`) live in the same folder
# as the raw data files (`data_dir`). This folder is not created here.
data_dir = info_dir = "/Users/lidiayung/Downloads/DATA"

# Output location for the files written by this notebook; created on demand.
out_dir = "/Users/lidiayung/Downloads/01_outputs_2020"
os.makedirs(out_dir, exist_ok=True)

## Modules

Load data about modules and drugs.

In [5]:
### DATA
### Module names are whitespace-stripped FIRST and de-duplicated AFTERWARDS, so
### names that differ only by surrounding whitespace collapse into one entry.

modules_df = pd.read_excel(
    os.path.join(info_dir, "ALL_DATA_2020_Jing.xlsx"), sheet_name = "modules", index_col = 0)

# display(modules_df)

# module names exactly as they appear in the sheet index
raw_module_names = modules_df.index.tolist()
print(len(raw_module_names), ' - Size after reading')

# remove whitespaces in modules' names
stripped_module_names = [name.strip() for name in raw_module_names]

# remove duplicates, keeping the order of first appearance in the sheet
selected_modules = list(dict.fromkeys(stripped_module_names))
print(len(selected_modules), ' - Size after remove duplicates')

print('Selected modules list: ', len(selected_modules), selected_modules)


11  - Size after reading
11  - Size after remove duplicates
Selected modules list:  11 ['CDK1', 'CDK2', 'CDK4_6', 'EGFR', 'PI3K', 'FGFR', 'mTOR', 'TOP2A', 'p53', 'Estrogen', 'Androgen']


In [7]:
### DATA
### remove whitespaces in names (modules, drugs), remove duplicates
### check the dimensions of the indicator IC50 1 uM = 1000 nM
### copy-paste as values, numbers, no formulas

IC50_df = pd.read_excel(
    os.path.join(info_dir, "ALL_DATA_2020_Jing.xlsx"), sheet_name = "IC50s")
# Drop the unnamed spill-over columns; errors="ignore" keeps the cell working
# when the sheet does not contain them.
IC50_df = IC50_df.drop(columns=['Unnamed: 3','Unnamed: 4'], errors="ignore")

print(len(IC50_df.index), ' - Size after reading')
# display(IC50_df)

# rename
IC50_df = IC50_df.rename(columns = {"IC50, uM": "IC50"})

# manually correcting value IC50, Example  for IOX2 -> 30 nM
#IC50_df.loc[IC50_df.index == 'IOX2', IC50_df.columns == 'IC50'] = 30/1000

# Strip surrounding whitespace from module and drug names BEFORE filtering:
# `selected_modules` holds stripped names, so an unstripped "EGFR " would
# otherwise be silently dropped by the isin() filter below. Non-string cells
# (e.g. empty/NaN) are passed through unchanged.
for name_col in ("Module", "Drug"):
    IC50_df[name_col] = IC50_df[name_col].map(
        lambda value: value.strip() if isinstance(value, str) else value)

# remove non-selected modules and rows whose module name is empty
IC50_df = IC50_df[IC50_df.Module.isin(selected_modules)]

print(len(IC50_df.index), ' - Size after remove modules')
# display(IC50_df)

# remove duplicates
# Considering certain columns is optional.
# Indexes, including time indexes are ignored.
IC50_df = IC50_df.drop_duplicates()

print(len(IC50_df.index), ' - Size after remove duplicates')
display(IC50_df)

37  - Size after reading
37  - Size after remove modules
37  - Size after remove duplicates


Unnamed: 0,Drug,Module,IC50
0,flufenamic-acid,Androgen,3.0
1,nandrolone,Androgen,9.0
2,oxandrolone,Androgen,190.3
3,testosterone-enanthate,Androgen,200.0
4,testosterone-propionate,Androgen,124.0
5,JNJ-7706621,CDK1,0.027
6,PHA-793887,CDK1,0.18
7,roscovitine,CDK2,2.0
8,alvocidib,CDK4_6,0.12
9,Palbociclib,CDK4_6,0.045


In [9]:
# Modules that actually occur in the IC50 table, in order of first appearance.
modules = IC50_df.Module.unique().tolist()
n_modules = len(modules)

print('IC50_df  modules list: ', n_modules, modules)
print()
print('Selected modules list: ', len(selected_modules), selected_modules)

### CHECK
# Only the two list lengths are compared here, not their contents.
print()
print('CHECK: ', len(selected_modules),'=?', n_modules)


IC50_df  modules list:  11 ['Androgen', 'CDK1', 'CDK2', 'CDK4_6', 'EGFR', 'Estrogen', 'FGFR', 'mTOR', 'p53', 'PI3K', 'TOP2A']

Selected modules list:  11 ['CDK1', 'CDK2', 'CDK4_6', 'EGFR', 'PI3K', 'FGFR', 'mTOR', 'TOP2A', 'p53', 'Estrogen', 'Androgen']

CHECK:  11 =? 11


In [22]:
# Build the list of drug names used throughout the notebook.
raw_drugs = IC50_df.Drug.tolist()
print(len(raw_drugs), ' - Size after reading')

# remove duplicates
unique_drugs = IC50_df.Drug.unique().tolist()
print(len(unique_drugs), ' - Size after remove duplicates')

# remove whitespaces in drugs' names (necessary for some)
stripped_drugs = [name.strip() for name in unique_drugs]

# remove duplicates after remove whitespaces
# dict.fromkeys keeps the order of first appearance. list(set(...)) would give
# an order that changes between kernel sessions (string hashing is randomised),
# and this list is saved into the metadata pickle.
drugs = list(dict.fromkeys(stripped_drugs))
print(len(drugs), ' - Size after remove duplicates without whitespaces')

print('Drugs list: ', len(drugs), drugs)

n_drugs = len(drugs)

37  - Size after reading
37  - Size after remove duplicates
37  - Size after remove duplicates without whitespaces
Drugs list:  37 ['RITA', 'daunorubicin', 'estradiol-cypionate', 'Nutlin-3', 'testosterone-enanthate', 'idarubicin', 'epirubicin', 'HLI373', 'gefitinib', 'vandetanib', 'raloxifene', 'ponatinib', 'sorafenib', 'GDC-0349', 'testosterone-propionate', 'masitinib', 'LY-294002', 'KU-0063794', 'mitoxantrone', 'flufenamic-acid', 'lapatinib', 'taselisib', 'afatinib', 'AMG-232', 'oxandrolone', 'alvocidib', 'dienestrol', 'PHA-793887', 'SAR405838', 'roscovitine', 'PI-103', 'nandrolone', 'NVP-BEZ235', 'JNJ-7706621', 'Palbociclib', 'erlotinib', 'AS-605240']


## L1000 meta data

Get sig_id for selected drugs.

In [19]:
# Load the BC3C signature metadata; the first CSV column becomes the index.
bc3c_sig_info_path = os.path.join(data_dir, "sig_info_2020_BC3C.csv")
sig_info_df = pd.read_csv(bc3c_sig_info_path, sep=',', index_col=0)

In [20]:
# Show the signature metadata as loaded, before any drug filtering.
sig_info_df

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASG002_BC3C_24H:A03,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A03,1,ctl_vehicle,DMSO,DMSO_No_target,0.0,0 uM,0.00
ASG002_BC3C_24H:A04,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A04,1,ctl_vehicle,DMSO,DMSO_No_target,0.0,0 uM,0.00
ASG002_BC3C_24H:A05,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A05,1,ctl_vehicle,DMSO,DMSO_No_target,0.0,0 uM,0.00
ASG002_BC3C_24H:A06,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A06,1,ctl_vehicle,DMSO,DMSO_No_target,0.0,0 uM,0.00
ASG002_BC3C_24H:J13,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:J13,1,ctl_vehicle,DMSO,DMSO_No_target,0.0,0 uM,0.00
...,...,...,...,...,...,...,...,...,...,...,...
MOAR012_BC3C_24H:P20,BC3C,MOAR012,24 h,MOAR012_BC3C_24H_X1_B36:P20,1,trt_cp,BAY-61-3606,,0.0,3.33 uM,3.33
MOAR012_BC3C_24H:P21,BC3C,MOAR012,24 h,MOAR012_BC3C_24H_X1_B36:P21,1,trt_cp,BAY-61-3606,,0.0,1.11 uM,1.11
MOAR012_BC3C_24H:P22,BC3C,MOAR012,24 h,MOAR012_BC3C_24H_X1_B36:P22,1,trt_cp,ethaverine,,0.0,10 uM,10.00
MOAR012_BC3C_24H:P23,BC3C,MOAR012,24 h,MOAR012_BC3C_24H_X1_B36:P23,1,trt_cp,ethaverine,,0.0,3.33 uM,3.33


# Alternative metadata source: the tab-separated siginfo_beta table, indexed by
# its column at position 25 and restricted to rows whose `cell_mfc_name`
# contains 'THP1'.
siginfo_beta_path = os.path.join(data_dir, "siginfo_beta.txt")
sig_info_df = pd.read_csv(siginfo_beta_path, sep='\t', index_col=25)
#sig_info_df_HUVEC = sig_info_df.loc[sig_info_df['cell_mfc_name'].str.contains('HUVEC')]
is_thp1 = sig_info_df['cell_mfc_name'].str.contains('THP1')
sig_info_df = sig_info_df.loc[is_thp1]

#LFC_PLCg = pd.read_csv(os.path.join(data_dir, "PLCg_Data_log_2020.csv"),index_col=0)
#sig_info_df_PLCg = sig_info_df.loc[LFC_PLCg.index]
#sig_info_df = pd.concat([sig_info_df_HUVEC,sig_info_df_PLCg],axis=0)

display(sig_info_df)

In [23]:
# now filtering so only the required drugs are present
# The column holding the compound name depends on which metadata table was
# loaded: the BC3C table shown in this notebook calls it `pert_drug`, the
# siginfo_beta table calls it `cmap_name`. Use whichever one is present.
drug_col = next(
    (col for col in ("pert_drug", "cmap_name") if col in sig_info_df.columns), None)
if drug_col is None:
    raise KeyError("sig_info_df has neither a 'pert_drug' nor a 'cmap_name' column")
sig_info_df = sig_info_df.loc[sig_info_df[drug_col].isin(drugs)]

# here's what we have now
display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASG002_BC3C_24H:A10,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A10,1,trt_cp,taselisib,PIK3CA,1.0,10 uM,10.00
ASG002_BC3C_24H:A11,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A11,1,trt_cp,taselisib,PIK3CA,1.0,1.11 uM,1.11
ASG002_BC3C_24H:A19,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A19,1,trt_cp,AS-605240,PIK3CG,1.0,10 uM,10.00
ASG002_BC3C_24H:A20,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A20,1,trt_cp,AS-605240,PIK3CG,1.0,1.11 uM,1.11
ASG002_BC3C_24H:A21,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A21,1,trt_cp,AS-605240,PIK3CG,1.0,0.12 uM,0.12
...,...,...,...,...,...,...,...,...,...,...,...
MOAR011_BC3C_24H:C02,BC3C,MOAR011,24 h,MOAR011_BC3C_24H_X1_B36:C02,1,trt_cp,oxandrolone,AR,1.0,3.33 uM,3.33
MOAR011_BC3C_24H:C03,BC3C,MOAR011,24 h,MOAR011_BC3C_24H_X1_B36:C03,1,trt_cp,oxandrolone,AR,1.0,1.11 uM,1.11
MOAR011_BC3C_24H:C10,BC3C,MOAR011,24 h,MOAR011_BC3C_24H_X1_B36:C10,1,trt_cp,testosterone-enanthate,AR,1.0,10 uM,10.00
MOAR011_BC3C_24H:C11,BC3C,MOAR011,24 h,MOAR011_BC3C_24H_X1_B36:C11,1,trt_cp,testosterone-enanthate,AR,1.0,3.33 uM,3.33


In [24]:
# Unique experiment (signature) ids that remain after the drug filter.
exp_ids = sig_info_df.index.unique().tolist()
n_experiments = len(exp_ids)

print('Experiments ids list: ', n_experiments)

Experiments ids list:  98


Confirm data by checking the drugs of interest against the filtered L1000 meta data.

In [25]:
# Report how many drugs of interest were collected from the IC50 table.
print(f"Number of drugs of interest:\t{len(drugs)}")
# The per-drug count in the L1000 metadata is currently disabled.
# NOTE(review): it refers to a "drugs" column, which is not among the metadata
# columns displayed in this notebook -- confirm the column name before re-enabling.
#print(f'Number of drugs in L1000 data:\t{len(sig_info_df.value_counts("drugs"))}')

#sig_info_df.value_counts("drugs")

Number of drugs of interest:	37


## L1000 data

In [11]:
# Load the expression table (rows: experiments, columns: genes); the first CSV
# column becomes the index.
lfc_path = os.path.join(data_dir, "level3_norm_thp1.csv")
LFC_df = pd.read_csv(lfc_path, index_col=0)
display(LFC_df)

Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
AICHI001_THP1_24H:A03,-0.051469,0.068087,1.619786,0.197440,-0.092832,0.153559,-0.038607,-0.127486,-0.262796,-0.001191,...,0.145774,-0.699832,0.197302,-0.197423,0.170148,0.018956,-0.160991,0.160636,0.434180,-0.089075
AICHI001_THP1_24H:A04,0.246263,-0.002630,0.004236,0.201474,0.218100,0.132609,0.067376,0.083881,-0.212229,-0.285673,...,0.063340,0.223952,-0.145247,0.004727,0.238414,0.387924,0.082741,0.227103,0.365113,0.413791
AICHI001_THP1_24H:A05,0.151863,0.053370,-0.194680,0.324075,0.011184,0.095559,0.120693,0.076748,0.304621,-0.146906,...,0.121440,0.088602,-0.174197,-0.245941,-0.143568,0.195724,-0.084459,-0.009331,0.376179,-0.376908
AICHI001_THP1_24H:A06,0.395531,0.381554,-0.213914,0.484775,0.246634,0.167292,0.041659,0.219648,-0.042629,-0.450957,...,-0.114177,0.187351,0.178503,-0.025890,0.529898,0.159107,-0.012609,0.073436,0.267413,-0.367042
AICHI001_THP1_24H:A07,-1.160836,0.706071,-0.137480,0.157474,-0.599216,1.100126,-0.208540,0.081448,-0.165980,-0.827257,...,0.821839,0.503701,0.498320,0.071528,-0.117135,-0.043744,0.207525,0.075369,-0.168370,-0.665593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
REP.B028_THP1_24H:P23,-0.096095,0.140660,0.049450,0.002725,0.037976,0.128351,-0.053827,0.008464,0.231246,0.220580,...,0.007838,-0.461756,0.024425,0.164083,0.326541,-0.561545,-0.125503,0.228947,0.002581,0.125381
REP.B028_THP1_24H:P24,-0.007245,0.056210,0.040750,-0.164925,0.172701,-0.147549,-0.077528,0.158815,-0.563929,-0.173469,...,0.084538,0.323094,-0.170676,-0.308566,0.312516,-0.304895,-0.201453,0.060222,0.202931,-0.032320
ZTO.XPR001_THP1_408H:CMAP-000:-666,-0.076504,0.196899,0.026406,0.037738,0.163917,0.059711,-0.011489,-0.003306,-0.133241,-0.112347,...,0.170960,0.041612,0.040709,-0.096339,0.093352,-0.014189,-0.209928,0.029231,-0.114146,0.081037
ZTO.XPR001_THP1_408H:GC1:-666,0.037214,-0.062505,-0.061845,-0.149995,-0.120290,-0.078459,0.031846,0.302690,0.182437,0.132301,...,-0.028359,-0.019649,-0.099940,0.055280,-0.037169,-0.005694,0.295223,-0.083138,0.369763,-0.115631


In [12]:
# Keep only the experiments selected from the metadata and arrange them in the
# same order as in `exp_ids`.
# Label-based selection builds a new frame directly. This avoids writing a
# temporary "sort_col" helper column onto a filtered slice, which triggers
# SettingWithCopyWarning and could clash with a data column of the same name.
available_ids = set(LFC_df.index)
ordered_ids = [exp_id for exp_id in exp_ids if exp_id in available_ids]
LFC_df = LFC_df.loc[ordered_ids]

#LFC_df = pd.concat([LFC_df,LFC_PLCg],axis=0)

# transpose -> rows: genes, columns: experiments
LFC_df = LFC_df.T

display(LFC_df)

Unnamed: 0,CPC006_THP1_6H:C04,CPC006_THP1_6H:G06,CPC006_THP1_6H:K09,CPC006_THP1_6H:M21,EMU001_THP1_6H:B19,EMU001_THP1_3H:A22,EMU001_THP1_24H:B19,EMU001_THP1_6H:L19,EMU001_THP1_6H:L24,EMU001_THP1_24H:L24,...,AICHI001_THP1_4H:E19,AICHI001_THP1_4H:M15,AICHI002_THP1_24H:M09,AICHI001_THP1_24H:E22,AICHI001_THP1_24H:E23,AICHI001_THP1_4H:O14,AICHI001_THP1_24H:M18,AICHI001_THP1_4H:M13,AICHI001_THP1_4H:E24,AICHI001_THP1_4H:O17
AARS,-0.387213,-0.196513,0.390337,0.711488,-1.631878,-3.350361,0.227556,-1.951610,-2.608110,-0.094777,...,-1.022569,-1.559520,-3.993821,-0.240303,-0.219036,-2.087727,-0.396419,-0.756336,-1.233303,-1.700752
ABCB6,-0.326253,0.598648,0.330848,-0.324052,-1.532262,-1.169512,0.192888,-1.123496,-1.262929,-0.037912,...,-0.220063,-0.325463,-0.933889,-0.119696,-0.131829,-0.400963,-0.030896,-0.315596,-0.227971,-0.142863
ABCC5,-0.372873,-0.817998,-0.027023,1.977602,-0.509829,0.294921,-0.151395,0.176938,0.006621,1.474204,...,-0.033072,0.268361,3.964201,-0.108097,-0.084881,2.111461,-0.166647,0.194003,0.266411,0.218737
ABCF1,0.940842,-0.074857,0.891892,-0.028908,-1.682907,-2.556273,-0.051639,-1.539789,-1.756840,-0.179273,...,-0.479926,-0.972592,4.273963,-0.482709,0.043341,-0.244234,0.019807,-0.841084,-0.907142,-0.913893
ABCF3,-0.306452,0.088499,1.271148,-0.294476,-2.347471,-2.476405,0.325646,-1.496271,-2.029871,0.351997,...,-0.424225,-0.588116,-3.544892,-0.119166,0.394718,0.139276,0.180400,-0.487541,-0.821358,-0.838208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF395,2.670836,0.505236,-0.189213,2.495137,-0.965475,-2.726208,0.046509,-1.331926,-1.038859,-0.469308,...,-1.725468,0.093173,3.804340,0.042623,-0.630876,-1.090889,0.073440,-0.177293,-1.857335,-1.503760
ZNF451,1.408164,1.956289,0.094364,1.167063,0.434050,-0.673208,0.227517,0.620250,0.582883,0.515117,...,1.529267,0.564175,1.061953,-0.114176,-0.200108,0.543221,0.011675,0.509025,0.338492,0.360874
ZNF586,0.332435,0.718335,-8.212865,-1.286765,0.744662,2.191138,-0.044121,0.957162,1.875146,-0.279687,...,1.483469,0.871011,-0.005867,-0.129514,-0.117347,2.198319,-0.129881,0.874019,0.706469,1.449803
ZNF589,1.713703,-0.861997,-0.674397,1.298077,-2.484406,-2.410790,-0.058223,-1.923722,-2.212456,-0.109756,...,-2.818812,-1.760613,3.719834,-0.183787,-0.618920,-2.541187,-0.848538,-1.866837,-2.258170,-2.182595


In [13]:
# now we need to leave only siginfo data present in LFC df
# (rows are selected, and ordered, by the experiment columns of LFC_df)
sig_info_df = sig_info_df.loc[LFC_df.columns]
exp_ids = sig_info_df.index.unique().tolist()
# Derive the count from `exp_ids` itself: the matrices built later are shaped
# with `n_experiments` and labelled with `exp_ids`, so the two must always agree.
n_experiments = len(exp_ids)
print(n_experiments)
sig_info_df


263


Unnamed: 0,bead_batch,nearest_dose,pert_dose,pert_dose_unit,pert_idose,pert_itime,pert_time,pert_time_unit,cell_mfc_name,pert_mfc_id,...,cell_iname,det_wells,det_plates,distil_ids,build_name,project_code,cmap_name,is_exemplar_sig,is_ncs_sig,is_null_sig
CPC006_THP1_6H:C04,f2b4,10.00,10.000000,uM,10 uM,6 h,6.0,h,THP1,BRD-K69932463-001-03-1,...,THP1,C04,CPC006_THP1_6H_X1_F2B4_DUO52HI53LO,CPC006_THP1_6H_X1_F2B4_DUO52HI53LO:C04,,CPC,AZD-8055,0,0.0,0.0
CPC006_THP1_6H:G06,f2b4,10.00,10.000000,uM,10 uM,6 h,6.0,h,THP1,BRD-K67566344-001-01-8,...,THP1,G06,CPC006_THP1_6H_X1_F2B4_DUO52HI53LO|CPC006_THP1...,CPC006_THP1_6H_X1_F2B4_DUO52HI53LO:G06|CPC006_...,,CPC,KU-0063794,1,1.0,0.0
CPC006_THP1_6H:K09,f2b4,10.00,10.000000,uM,10 uM,6 h,6.0,h,THP1,BRD-K51575138-001-01-7,...,THP1,K09,CPC006_THP1_6H_X1_F2B4_DUO52HI53LO,CPC006_THP1_6H_X1_F2B4_DUO52HI53LO:K09,,CPC,TPCA-1,0,0.0,0.0
CPC006_THP1_6H:M21,f2b4,10.00,10.000000,uM,10 uM,6 h,6.0,h,THP1,BRD-A62025033-001-01-8,...,THP1,M21,CPC006_THP1_6H_X1_F2B4_DUO52HI53LO|CPC006_THP1...,CPC006_THP1_6H_X1_F2B4_DUO52HI53LO:M21|CPC006_...,,CPC,temsirolimus,0,1.0,0.0
EMU001_THP1_6H:B19,b39,10.00,10.000000,uM,10 uM,6 h,6.0,h,THP1,BRD-A50998626,...,THP1,B19,EMU001_THP1_6H_X1_B39|EMU001_THP1_6H_X2_B39|EM...,EMU001_THP1_6H_X1_B39:B19|EMU001_THP1_6H_X2_B3...,,EMU,palomid-529,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AICHI001_THP1_4H:O14,b39,2.50,2.500000,uM,2.5 uM,4 h,4.0,h,THP1,BRD-K13566078,...,THP1,O14,AICHI001_THP1_4H_X1_B39|AICHI001_THP1_4H_X3_B39,AICHI001_THP1_4H_X1_B39:O14|AICHI001_THP1_4H_X...,,AICHI,BMS-345541,0,1.0,0.0
AICHI001_THP1_24H:M18,b32,0.01,0.009766,uM,0.01 uM,24 h,24.0,h,THP1,BRD-K05804044,...,THP1,M18,AICHI001_THP1_24H_X1_B32|AICHI001_THP1_24H_X2_...,AICHI001_THP1_24H_X1_B32:M18|AICHI001_THP1_24H...,,AICHI,AZ-628,0,1.0,0.0
AICHI001_THP1_4H:M13,b39,10.00,10.000000,uM,10 uM,4 h,4.0,h,THP1,BRD-K05804044,...,THP1,M13,AICHI001_THP1_4H_X1_B39|AICHI001_THP1_4H_X2_B3...,AICHI001_THP1_4H_X1_B39:M13|AICHI001_THP1_4H_X...,,AICHI,AZ-628,1,1.0,0.0
AICHI001_THP1_4H:E24,b39,0.01,0.009766,uM,0.01 uM,4 h,4.0,h,THP1,BRD-K53581288,...,THP1,E24,AICHI001_THP1_4H_X1_B39|AICHI001_THP1_4H_X2_B3...,AICHI001_THP1_4H_X1_B39:E24|AICHI001_THP1_4H_X...,,AICHI,baricitinib,0,1.0,0.0


In [14]:
# After the transpose, the rows of LFC_df are the genes.
genes = LFC_df.index.tolist()
n_genes = len(genes)

print('Landmark genes list: ', n_genes, genes)

Landmark genes list:  978 ['AARS', 'ABCB6', 'ABCC5', 'ABCF1', 'ABCF3', 'ABHD4', 'ABHD6', 'ABL1', 'ACAA1', 'ACAT2', 'ACBD3', 'ACD', 'ACLY', 'ACOT9', 'ADAM10', 'ADAT1', 'ADGRE5', 'ADGRG1', 'ADH5', 'ADI1', 'ADO', 'ADRB2', 'AGL', 'AKAP8', 'AKAP8L', 'AKR7A2', 'AKT1', 'ALAS1', 'ALDH7A1', 'ALDOA', 'ALDOC', 'AMDHD2', 'ANKRD10', 'ANO10', 'ANXA7', 'APBB2', 'APOE', 'APP', 'APPBP2', 'ARFIP2', 'ARHGAP1', 'ARHGEF12', 'ARHGEF2', 'ARID4B', 'ARID5B', 'ARL4C', 'ARNT2', 'ARPP19', 'ASAH1', 'ASCC3', 'ATF1', 'ATF5', 'ATF6', 'ATG3', 'ATMIN', 'ATP11B', 'ATP1B1', 'ATP2C1', 'ATP6V0B', 'ATP6V1D', 'AURKA', 'AURKB', 'AXIN1', 'B4GAT1', 'BACE2', 'BAD', 'BAG3', 'BAMBI', 'BAX', 'BCL2', 'BCL7B', 'BDH1', 'BECN1', 'BHLHE40', 'BID', 'BIRC2', 'BIRC5', 'BLCAP', 'BLMH', 'BLVRA', 'BMP4', 'BNIP3', 'BNIP3L', 'BPHL', 'BRCA1', 'BTK', 'BUB1B', 'BZW2', 'C2CD2', 'C2CD2L', 'C2CD5', 'C5', 'CAB39', 'CALM3', 'CALU', 'CAMSAP2', 'CANT1', 'CAPN1', 'CARMIL1', 'CASC3', 'CASK', 'CASP10', 'CASP2', 'CASP3', 'CASP7', 'CAST', 'CAT', 'CBLB', 'CBR1

## Inhibitor concentrations, IC50, and perturbation matrices

In [15]:
# All three matrices share the (modules x experiments) layout.
matrix_shape = (n_modules, n_experiments)

# Defaults for module/experiment pairs that the fill loop never touches:
# concentration 0, IC50 1, gamma 0.
inhib_conc_matrix = np.zeros(matrix_shape)
ic50_matrix = np.ones(matrix_shape)
gamma_matrix = np.zeros(matrix_shape)

In [16]:

# Fill the (modules x experiments) matrices: for every drug assigned to a
# module, write its IC50, its gamma and the applied dose into the columns of
# the experiments that used this drug.

# Column position of every experiment, built once instead of calling
# exp_ids.index() (a linear scan) for each experiment of each drug.
exp_position = {exp_id: j for j, exp_id in enumerate(exp_ids)}

for i, module in enumerate(modules):
    module_rows = IC50_df[IC50_df.Module == module]
    for drug in module_rows.Drug.tolist():
        # Look the parameters up within this module only, so a drug listed
        # under several modules yields one (IC50, gamma) pair per module
        # instead of failing the single-value checks below.
        drug_rows = module_rows[module_rows.Drug == drug]
        ic50 = drug_rows.IC50.values
        gamma = drug_rows.Gamma.values
        assert ic50.size == 1, f"expected one IC50 for {drug!r} in {module!r}, found {ic50.size}"
        assert gamma.size == 1, f"expected one gamma for {drug!r} in {module!r}, found {gamma.size}"
        # doses of all experiments performed with this drug, keyed by experiment id
        doses = sig_info_df.loc[sig_info_df.cmap_name == drug, "pert_dose"]
        # each experiment id must carry exactly one dose
        assert doses.index.is_unique, f"duplicated experiment ids for {drug!r}"
        for exp_id, inhib_conc in doses.items():
            j = exp_position[exp_id]
            # insert values in matrices
            inhib_conc_matrix[i, j] = inhib_conc
            ic50_matrix[i, j] = ic50.item()
            gamma_matrix[i, j] = gamma.item()


In [17]:
# transform matrices into pandas dfs for export with row and column names
inhib_conc_df = pd.DataFrame(inhib_conc_matrix, index = modules, columns = exp_ids)
ic50_df = pd.DataFrame(ic50_matrix, index = modules, columns = exp_ids)
gamma_df = pd.DataFrame(gamma_matrix, index = modules, columns = exp_ids)

# create binary perturbation matrix
pert_df = pd.DataFrame(
    np.where(inhib_conc_matrix != 0, 1, 0),
    index = inhib_conc_df.index,
    columns = inhib_conc_df.columns,
)

In [18]:
# Show the four annotated matrices one after another:
# IC50, gamma, inhibitor concentration, binary perturbation.
for annotated_frame in (ic50_df, gamma_df, inhib_conc_df, pert_df):
    display(annotated_frame)

Unnamed: 0,CPC006_THP1_6H:C04,CPC006_THP1_6H:G06,CPC006_THP1_6H:K09,CPC006_THP1_6H:M21,EMU001_THP1_6H:B19,EMU001_THP1_3H:A22,EMU001_THP1_24H:B19,EMU001_THP1_6H:L19,EMU001_THP1_6H:L24,EMU001_THP1_24H:L24,...,AICHI001_THP1_4H:E19,AICHI001_THP1_4H:M15,AICHI002_THP1_24H:M09,AICHI001_THP1_24H:E22,AICHI001_THP1_24H:E23,AICHI001_THP1_4H:O14,AICHI001_THP1_24H:M18,AICHI001_THP1_4H:M13,AICHI001_THP1_4H:E24,AICHI001_THP1_4H:O17
CDK1/2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.004,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDK4/6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
IKK,1.0,1.0,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
TLR4,1.0,1.0,1.0,1.0,1.0,0.002,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mTOR,0.02,0.03,1.0,1.76,0.05,1.0,0.05,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PAK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,3.0,1.0,1.0,1.0,1.0,3.0,3.0,1.0,1.0
JAK/STAT,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.07,0.07,0.07,...,0.05,1.0,1.0,0.05,0.05,1.0,1.0,1.0,0.05,1.0
PLK1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ITK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,CPC006_THP1_6H:C04,CPC006_THP1_6H:G06,CPC006_THP1_6H:K09,CPC006_THP1_6H:M21,EMU001_THP1_6H:B19,EMU001_THP1_3H:A22,EMU001_THP1_24H:B19,EMU001_THP1_6H:L19,EMU001_THP1_6H:L24,EMU001_THP1_24H:L24,...,AICHI001_THP1_4H:E19,AICHI001_THP1_4H:M15,AICHI002_THP1_24H:M09,AICHI001_THP1_24H:E22,AICHI001_THP1_24H:E23,AICHI001_THP1_4H:O14,AICHI001_THP1_24H:M18,AICHI001_THP1_4H:M13,AICHI001_THP1_4H:E24,AICHI001_THP1_4H:O17
CDK1/2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CDK4/6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IKK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TLR4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mTOR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PAK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JAK/STAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PLK1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ITK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,CPC006_THP1_6H:C04,CPC006_THP1_6H:G06,CPC006_THP1_6H:K09,CPC006_THP1_6H:M21,EMU001_THP1_6H:B19,EMU001_THP1_3H:A22,EMU001_THP1_24H:B19,EMU001_THP1_6H:L19,EMU001_THP1_6H:L24,EMU001_THP1_24H:L24,...,AICHI001_THP1_4H:E19,AICHI001_THP1_4H:M15,AICHI002_THP1_24H:M09,AICHI001_THP1_24H:E22,AICHI001_THP1_24H:E23,AICHI001_THP1_4H:O14,AICHI001_THP1_24H:M18,AICHI001_THP1_4H:M13,AICHI001_THP1_4H:E24,AICHI001_THP1_4H:O17
CDK1/2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.625,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CDK4/6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IKK,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TLR4,0.0,0.0,0.0,0.0,0.0,0.15625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mTOR,10.0,10.0,0.0,10.0,10.0,0.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PAK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.625,0.0,0.0,0.0,0.0,0.009766,10.0,0.0,0.0
JAK/STAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.009766,0.009766,...,10.0,0.0,0.0,0.15625,0.039062,0.0,0.0,0.0,0.009766,0.0
PLK1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ITK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,CPC006_THP1_6H:C04,CPC006_THP1_6H:G06,CPC006_THP1_6H:K09,CPC006_THP1_6H:M21,EMU001_THP1_6H:B19,EMU001_THP1_3H:A22,EMU001_THP1_24H:B19,EMU001_THP1_6H:L19,EMU001_THP1_6H:L24,EMU001_THP1_24H:L24,...,AICHI001_THP1_4H:E19,AICHI001_THP1_4H:M15,AICHI002_THP1_24H:M09,AICHI001_THP1_24H:E22,AICHI001_THP1_24H:E23,AICHI001_THP1_4H:O14,AICHI001_THP1_24H:M18,AICHI001_THP1_4H:M13,AICHI001_THP1_4H:E24,AICHI001_THP1_4H:O17
CDK1/2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
CDK4/6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
IKK,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TLR4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mTOR,1,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PAK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERK,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
JAK/STAT,0,0,0,0,0,0,0,1,1,1,...,1,0,0,1,1,0,0,0,1,0
PLK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ITK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Global responses for DPD modules

According to our discussion, $R$ for DPD vectors cannot be calculated with the same formula as for pathway activities. Instead, we assume:

\begin{equation}
R_{DPD,j} = DPD_j = STV_{DPD} \cdot LFC_j
\end{equation}

In [19]:
# load STV data frame
STVs = pd.read_excel(os.path.join(info_dir, "ALL_DATA.xlsx"), sheet_name = "STVs", index_col = 0)
STV_df = pd.DataFrame(np.zeros((len(LFC_df.index),len(STVs.columns))),index=LFC_df.index,columns=STVs.columns)
STV_df.loc[STVs.index] = STVs

display(STV_df)


Unnamed: 0,norm_vec_resist,norm_vec_TB,norm_vec_load,norm_vec_damage,norm_vec_microa
AARS,0.003751,0.000000,0.000000,0.000000,0.000000
ABCB6,0.005285,0.000000,0.000000,0.000000,0.000000
ABCC5,-0.000536,0.012496,0.014916,0.005246,0.018935
ABCF1,-0.004149,0.000000,0.009495,-0.006757,-0.004911
ABCF3,-0.011733,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...
ZNF395,0.010331,0.000000,0.000000,0.000000,0.000000
ZNF451,0.002226,0.000000,0.000000,0.000000,0.000000
ZNF586,-0.011612,0.000000,0.000000,0.000000,0.000000
ZNF589,0.011885,0.000000,0.000000,0.000000,0.000000


In [20]:

# DPD value of every experiment for every STV column: the dot product of the
# experiment's expression profile with that STV column.
# All values come from a single matrix product. The previous element-wise
# `DPD_df.loc[exp_id].iloc[i] = ...` was a chained assignment, which writes into
# a temporary object and leaves DPD_df unchanged when pandas Copy-on-Write is
# active.
dpd_columns = ["DPD_resist","DPD_TB","DPD_load","DPD_damage","DPD_microa"]
assert len(dpd_columns) == len(STV_df.columns), "one DPD column name is needed per STV column"

# LFC_df is (genes x experiments) and STV_df is (genes x STVs) with the same row
# order, so the product of the transposed LFC matrix with STV_df is
# (experiments x STVs).
DPD_df = pd.DataFrame(
    LFC_df.T.to_numpy() @ STV_df.to_numpy(),
    index=LFC_df.columns,
    columns=dpd_columns,
)

display(DPD_df)

Unnamed: 0,DPD_resist,DPD_TB,DPD_load,DPD_damage,DPD_microa
CPC006_THP1_6H:C04,-0.769247,-0.252601,1.196721,-0.373718,1.183066
CPC006_THP1_6H:G06,-1.433162,-0.077167,3.227575,2.459033,3.694395
CPC006_THP1_6H:K09,-1.055396,1.000931,0.972759,1.125405,1.259931
CPC006_THP1_6H:M21,0.476039,-0.190516,0.805359,-1.533741,0.224698
EMU001_THP1_6H:B19,2.809095,-2.082674,-0.349816,-0.944147,1.536528
...,...,...,...,...,...
AICHI001_THP1_4H:O14,0.805766,-0.051584,-0.489611,0.369096,1.447355
AICHI001_THP1_24H:M18,0.139720,-0.207295,0.234453,-0.420837,0.300335
AICHI001_THP1_4H:M13,0.025901,0.052178,0.515420,0.403796,2.089234
AICHI001_THP1_4H:E24,0.485682,0.663713,0.494942,0.622713,2.062756


In [21]:
# transform to R global
R_global_DPD_df = DPD_df.T
display(R_global_DPD_df)

Unnamed: 0,CPC006_THP1_6H:C04,CPC006_THP1_6H:G06,CPC006_THP1_6H:K09,CPC006_THP1_6H:M21,EMU001_THP1_6H:B19,EMU001_THP1_3H:A22,EMU001_THP1_24H:B19,EMU001_THP1_6H:L19,EMU001_THP1_6H:L24,EMU001_THP1_24H:L24,...,AICHI001_THP1_4H:E19,AICHI001_THP1_4H:M15,AICHI002_THP1_24H:M09,AICHI001_THP1_24H:E22,AICHI001_THP1_24H:E23,AICHI001_THP1_4H:O14,AICHI001_THP1_24H:M18,AICHI001_THP1_4H:M13,AICHI001_THP1_4H:E24,AICHI001_THP1_4H:O17
DPD_resist,-0.769247,-1.433162,-1.055396,0.476039,2.809095,3.472484,-0.492566,1.343949,2.049479,-0.53757,...,0.449639,0.447949,-0.343226,-0.106273,0.326936,0.805766,0.13972,0.025901,0.485682,0.884213
DPD_TB,-0.252601,-0.077167,1.000931,-0.190516,-2.082674,-0.780699,-0.232271,-0.399831,-0.961312,-0.356245,...,0.197868,0.04323,-1.079648,-0.138204,-0.046571,-0.051584,-0.207295,0.052178,0.663713,0.011954
DPD_load,1.196721,3.227575,0.972759,0.805359,-0.349816,-0.435418,-0.552939,-0.752625,-0.333279,-0.07186,...,0.2953,0.571335,-0.758256,-0.498665,-0.358668,-0.489611,0.234453,0.51542,0.494942,-0.078714
DPD_damage,-0.373718,2.459033,1.125405,-1.533741,-0.944147,-0.238261,-0.947662,0.617739,0.361276,0.141554,...,-0.130563,0.616341,-2.427922,0.000183,-0.163283,0.369096,-0.420837,0.403796,0.622713,0.134261
DPD_microa,1.183066,3.694395,1.259931,0.224698,1.536528,1.295538,-0.649277,0.631197,1.206122,-0.251971,...,1.756929,2.279128,-1.15324,-0.216475,-0.372471,1.447355,0.300335,2.089234,2.062756,1.446358


## Save outputs

In [22]:
# save metadata as pickle
all_metadata = {
    "modules": modules,
    "n_modules": n_modules,
    "drugs": drugs,
    "n_drugs": n_drugs,
    "exp_ids": exp_ids,
    "n_experiments": n_experiments,
    "genes": genes,
    "n_genes": n_genes,
}

print(all_metadata)

with open(os.path.join(out_dir, "metadata.pickle"), "wb") as f:
    pickle.dump(all_metadata, f, protocol = pickle.HIGHEST_PROTOCOL)

{'modules': ['CDK1/2', 'CDK4/6', 'IKK', 'TLR4', 'mTOR', 'PAK', 'ERK', 'JAK/STAT', 'PLK1', 'ITK'], 'n_modules': 10, 'drugs': ['AZ-628', 'palbociclib', 'BMS-509744', 'PF-03758309', 'KU-0063794', 'CGP-60474', 'TAK-733', 'baricitinib', 'TPCA-1', 'RAF-265', 'AZD-8055', 'palomid-529', 'temsirolimus', 'PD-0325901', 'resatorvid', 'FR-180204', 'BMS-345541', 'dinaciclib', 'tofacitinib', 'volasertib'], 'n_drugs': 20, 'exp_ids': ['CPC006_THP1_6H:C04', 'CPC006_THP1_6H:G06', 'CPC006_THP1_6H:K09', 'CPC006_THP1_6H:M21', 'EMU001_THP1_6H:B19', 'EMU001_THP1_3H:A22', 'EMU001_THP1_24H:B19', 'EMU001_THP1_6H:L19', 'EMU001_THP1_6H:L24', 'EMU001_THP1_24H:L24', 'EMU001_THP1_6H:B24', 'EMU001_THP1_3H:A24', 'EMU001_THP1_24H:A23', 'EMU001_THP1_3H:B19', 'EMU001_THP1_3H:A19', 'EMU001_THP1_3H:B20', 'EMU001_THP1_6H:L23', 'EMU001_THP1_6H:B21', 'EMU001_THP1_3H:L22', 'EMU001_THP1_3H:L24', 'EMU001_THP1_3H:L20', 'EMU001_THP1_6H:A19', 'EMU001_THP1_6H:A21', 'EMU001_THP1_24H:B23', 'EMU001_THP1_24H:A24', 'EMU001_THP1_6H:L22', '

In [23]:
# save doses and perturbation matrix
inhib_conc_df.to_csv(os.path.join(out_dir, "inhib_conc_annotated.csv"))
ic50_df.to_csv(os.path.join(out_dir, "ic50_annotated.csv"))
gamma_df.to_csv(os.path.join(out_dir, "gamma_annotated.csv"))
pert_df.to_csv(os.path.join(out_dir, "pert_annotated.csv"))

In [24]:
# save log fold change L1000 data
LFC_df.to_csv(os.path.join(out_dir, "L1000_LFC_data.csv"))

In [25]:
# save R_global for DPDs
R_global_DPD_df.to_csv(os.path.join(out_dir, "R_global_DPDonly_annotated.csv"))