# 01 Data preparation

Processing xlsx files from data folder into suitable inputs and generate other input files

In [25]:
import pandas as pd
import numpy as np
import os
import pickle

In [26]:
cell_line ='BC3C'
data_dir = f"/home/jing/Phd_project/project_UCD_blca/blca_publication_OUTPUT/blca_publication_OUTPUT_LINCS/00_outputs_2020_{cell_line}/"
info_dir = data_dir
out_dir = f"/home/jing/Phd_project/project_UCD_blca/blca_publication_OUTPUT/blca_publication_OUTPUT_bmra/blca_publication_OUTPUT_bmra_{cell_line}/00_outputs_2020_{cell_line}"

#os.makedirs(info_dir, exist_ok = True)
os.makedirs(out_dir, exist_ok = True)

## Modules

Load data about modules and drugs.

In [27]:
### DATA
### remove whitespaces in names (modules), remove duplicates

modules_df = pd.read_excel(
    os.path.join(info_dir, "ALL_DATA_2020_Jing.xlsx"), sheet_name = "modules", index_col = 0)
# display(modules_df)

selected_modules = modules_df.index.tolist()
print(len(selected_modules), ' - Size after reading')

# remove duplicates
selected_modules = modules_df.index.unique().tolist()
print(len(selected_modules), ' - Size after remove duplicates')

# remove whitespaces in modules' names 
selected_modules = [d.strip() for d in selected_modules]

print('Selected modules list: ', len(selected_modules), selected_modules)


10  - Size after reading
10  - Size after remove duplicates
Selected modules list:  10 ['CDK1', 'CDK2', 'CDK4_6', 'EGFR', 'PI3K', 'FGFR', 'TOP2A', 'p53', 'Estrogen', 'Androgen']


In [28]:
### DATA
### remove whitespaces in names (modules, drugs), remove duplicates
### check the dimensions of the indicator IC50 1 uM = 1000 nM
### copy-paste as values, numbers, no formulas

IC50_df = pd.read_excel(
    os.path.join(info_dir, "ALL_DATA_2020_Jing.xlsx"), sheet_name = "IC50s")
IC50_df.drop(columns=['Unnamed: 3','Unnamed: 4'],inplace=True)

print(len(IC50_df.index), ' - Size after reading')
# display(IC50_df)

# rename
IC50_df = IC50_df.rename(columns = {"IC50, uM": "IC50"})

# manually correcting value IC50, Example  for IOX2 -> 30 nM  
#IC50_df.loc[IC50_df.index == 'IOX2', IC50_df.columns == 'IC50'] = 30/1000

# remove non-selected modules, modules' names with whitespaces or empty
IC50_df = IC50_df[IC50_df.Module.isin(selected_modules)]

print(len(IC50_df.index), ' - Size after remove modules')
# display(IC50_df)

# remove duplicates 
# Considering certain columns is optional. 
# Indexes, including time indexes are ignored.
IC50_df = IC50_df.drop_duplicates()

print(len(IC50_df.index), ' - Size after remove duplicates')
display(IC50_df)

39  - Size after reading
39  - Size after remove modules
39  - Size after remove duplicates


Unnamed: 0,Drug,Module,IC50
0,flufenamic-acid,Androgen,3.0
1,nandrolone,Androgen,9.0
2,oxandrolone,Androgen,190.3
3,testosterone-enanthate,Androgen,200.0
4,testosterone-propionate,Androgen,124.0
5,JNJ-7706621,CDK1,0.027
6,PHA-793887,CDK1,0.18
7,roscovitine,CDK2,2.0
8,alvocidib,CDK4_6,0.12
9,palbociclib,CDK4_6,0.045


In [29]:
modules = IC50_df.Module.unique().tolist()

print('IC50_df  modules list: ', len(modules), modules)
print()
print('Selected modules list: ', len(selected_modules), selected_modules)

### CHECK
print()
print('CHECK: ', len(selected_modules),'=?', len(modules))

n_modules = len(modules)


IC50_df  modules list:  10 ['Androgen', 'CDK1', 'CDK2', 'CDK4_6', 'EGFR', 'Estrogen', 'FGFR', 'PI3K', 'p53', 'TOP2A']

Selected modules list:  10 ['CDK1', 'CDK2', 'CDK4_6', 'EGFR', 'PI3K', 'FGFR', 'TOP2A', 'p53', 'Estrogen', 'Androgen']

CHECK:  10 =? 10


In [30]:
drugs = IC50_df.Drug.tolist()
print(len(drugs), ' - Size after reading')

# remove duplicates
drugs = IC50_df.Drug.unique().tolist()
print(len(drugs), ' - Size after remove duplicates')

# remove whitespaces in drugs' names (necessary for some)
drugs = [d.strip() for d in drugs]

# remove duplicates after remove whitespaces
drugs = list(set(drugs))
print(len(drugs), ' - Size after remove duplicates without whitespaces')

print('Drugs list: ', len(drugs), drugs)

n_drugs = len(drugs)

39  - Size after reading
39  - Size after remove duplicates
39  - Size after remove duplicates without whitespaces
Drugs list:  39 ['NVP-BEZ235', 'daunorubicin', 'nandrolone', 'nutlin-3', 'oxandrolone', 'RITA', 'PHA-793887', 'serdemetan', 'idarubicin', 'raloxifene', 'mitoxantrone', 'KU-0063794', 'LY-294002', 'testosterone-enanthate', 'flufenamic-acid', 'testosterone-propionate', 'SAR405838', 'lapatinib', 'masitinib', 'sorafenib', 'PI-103', 'HLI-373', 'alvocidib', 'gefitinib', 'dienestrol', 'JNJ-7706621', 'roscovitine', 'GDC-0349', 'palbociclib', 'AMG-232', 'erlotinib', 'ponatinib', 'AZD-8055', 'afatinib', 'vandetanib', 'taselisib', 'estradiol-cypionate', 'epirubicin', 'AS-605240']


## L1000 meta data

Get sig_id for selected drugs.

In [31]:
sig_info_df = pd.read_excel(os.path.join(data_dir, f"sig_info_2020_{cell_line}.xlsx"), index_col = 0)

display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASG002_BC3C_24H:A03,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A03,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_BC3C_24H:A04,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A04,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_BC3C_24H:A05,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A05,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_BC3C_24H:A06,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A06,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_BC3C_24H:J13,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:J13,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
...,...,...,...,...,...,...,...,...,...,...,...
MOAR012_BC3C_24H:P20,BC3C,MOAR012,24 h,MOAR012_BC3C_24H_X1_B36:P20,1,trt_cp,BAY-61-3606,,0,3.33 uM,3.33
MOAR012_BC3C_24H:P21,BC3C,MOAR012,24 h,MOAR012_BC3C_24H_X1_B36:P21,1,trt_cp,BAY-61-3606,,0,1.11 uM,1.11
MOAR012_BC3C_24H:P22,BC3C,MOAR012,24 h,MOAR012_BC3C_24H_X1_B36:P22,1,trt_cp,ethaverine,,0,10 uM,10.00
MOAR012_BC3C_24H:P23,BC3C,MOAR012,24 h,MOAR012_BC3C_24H_X1_B36:P23,1,trt_cp,ethaverine,,0,3.33 uM,3.33


In [32]:
# now filtering so only the required drugs are present
sig_info_df = sig_info_df.loc[sig_info_df.pert_drug.isin(drugs)]

# here's what we have now
display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASG002_BC3C_24H:A10,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A10,1,trt_cp,taselisib,PIK3CA,1,10 uM,10.00
ASG002_BC3C_24H:A11,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A11,1,trt_cp,taselisib,PIK3CA,1,1.11 uM,1.11
ASG002_BC3C_24H:A19,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A19,1,trt_cp,AS-605240,PIK3CG,1,10 uM,10.00
ASG002_BC3C_24H:A20,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A20,1,trt_cp,AS-605240,PIK3CG,1,1.11 uM,1.11
ASG002_BC3C_24H:A21,BC3C,ASG002,24 h,ASG002_BC3C_24H_X1_B35:A21,1,trt_cp,AS-605240,PIK3CG,1,0.12 uM,0.12
...,...,...,...,...,...,...,...,...,...,...,...
MOAR011_BC3C_24H:C11,BC3C,MOAR011,24 h,MOAR011_BC3C_24H_X1_B36:C11,1,trt_cp,testosterone-enanthate,AR,1,3.33 uM,3.33
MOAR011_BC3C_24H:F07,BC3C,MOAR011,24 h,MOAR011_BC3C_24H_X1_B36:F07,1,trt_cp,serdemetan,MDM2,1,10 uM,10.00
MOAR011_BC3C_24H:F08,BC3C,MOAR011,24 h,MOAR011_BC3C_24H_X1_B36:F08,1,trt_cp,serdemetan,MDM2,1,3.33 uM,3.33
MOAR011_BC3C_24H:F09,BC3C,MOAR011,24 h,MOAR011_BC3C_24H_X1_B36:F09,1,trt_cp,serdemetan,MDM2,1,1.11 uM,1.11


In [33]:
rows_remove = ['ASG002_BC3C_24H:F04', 'ASG002_BC3C_24H:F05','ASG002_BC3C_24H:L02',
               'ASG002_BC3C_24H:O24','ASG002_BC3C_24H:L07','ASG002_BC3C_24H:L09',
               'MOAR010_BC3C_24H:D02', 'ASG002_BC3C_24H:F01','MOAR010_BC3C_24H:D01',
               'ASG002_BC3C_24H:N19','ASG002_BC3C_24H:N21','ASG002_BC3C_24H:N24',
               'ASG002_BC3C_24H:I19','ASG002_BC3C_24H:I21','ASG002_BC3C_24H:L17',
               'ASG002_BC3C_24H:M23','MOAR008_BC3C_24H:L03','MOAR010_BC3C_24H:L20','MOAR011_BC3C_24H:J10',
               'MOAR008_BC3C_24H:L08','MOAR009_BC3C_24H:C10','MOAR010_BC3C_24H:A13','MOAR010_BC3C_24H:A14','MOAR011_BC3C_24H:F09',
               'ASG002_BC3C_24H:G01','ASG002_BC3C_24H:P20']

Manually remove  few inhibition from data set, since it does differ from the other data points.

inhib_to_filter = "PF-03758309"
dose_to_filter = 10

id_to_filter = sig_info_df[
    np.logical_and(
        sig_info_df.pert_drug == inhib_to_filter,
        sig_info_df.dose_float == dose_to_filter,
    )
].index.values

print(id_to_filter)

sig_info_df = sig_info_df[~sig_info_df.index.isin(id_to_filter)]
display(sig_info_df)

inhib_to_filter = "roscovitine"
dose_to_filter = 3.33

id_to_filter = sig_info_df[
    np.logical_and(
        sig_info_df.pert_drug == inhib_to_filter,
        sig_info_df.dose_float == dose_to_filter,
    )
].index.values

print(id_to_filter)

sig_info_df = sig_info_df[~sig_info_df.index.isin(id_to_filter)]
display(sig_info_df)

In [34]:
sig_info_df.drop(rows_remove,inplace=True)

In [35]:
exp_ids = sig_info_df.index.unique().tolist()
print('Experiments ids list: ', len(exp_ids))

n_experiments = len(exp_ids)

Experiments ids list:  90


Confirm data by checking the drugs of interest against the filtered L1000 meta data.

In [36]:
print(f"Number of drugs of interest:\t{len(drugs)}")
#print(f'Number of drugs in L1000 data:\t{len(sig_info_df.value_counts("drugs"))}')

#sig_info_df.value_counts("drugs")

Number of drugs of interest:	39


## L1000 data

In [37]:
Data_norm_df = pd.read_excel(os.path.join(data_dir, f"Data_norm_2020_{cell_line}.xlsx"), index_col = 0)
display(Data_norm_df)

Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
ASG002_BC3C_24H:A03,-0.191254,-0.055246,0.039596,-0.256266,-0.040419,-0.590523,-0.159396,-0.074319,0.457981,0.409608,...,0.543203,0.494266,-0.011923,-0.225931,0.285054,-0.775246,0.166031,-0.024873,0.238723,0.284204
ASG002_BC3C_24H:A04,-0.265754,-0.317496,0.118696,-0.136665,-0.301569,-0.403023,0.124804,-0.036470,0.311931,0.660457,...,-0.565096,-0.088634,0.122977,-0.047931,0.141804,0.129054,-0.028819,-0.028773,-0.253627,-0.752646
ASG002_BC3C_24H:A05,-0.181954,-0.081597,-0.210304,1.559535,-0.019019,-0.457423,0.071404,0.074080,-0.356119,0.498808,...,0.226104,-0.228034,-0.121023,-0.075331,-0.133146,0.355054,0.022831,-0.084073,0.283123,-0.894896
ASG002_BC3C_24H:A06,0.033446,0.042404,-0.150154,-0.093165,0.053180,-0.053823,0.087704,0.167681,-0.601569,0.383308,...,-0.608596,-0.228835,0.072777,0.082970,-0.570996,2.847754,-0.211670,-0.067273,0.081723,0.338704
ASG002_BC3C_24H:J13,0.204446,0.180704,0.089096,-0.054666,0.053381,0.044877,-0.277396,-0.157419,0.535681,-3.933493,...,-0.318397,0.122265,-0.134323,-0.088931,-0.067996,-0.515847,-0.005069,0.067527,0.002223,0.204904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MOAR012_BC3C_24H:P20,0.647151,0.211700,-0.979200,0.597350,-0.375751,0.388300,0.394524,0.120151,-0.166775,-1.129125,...,-1.598475,-0.552750,0.515151,0.120800,0.082675,0.529700,0.383225,-0.207225,2.268450,-1.248500
MOAR012_BC3C_24H:P21,0.171800,0.046300,-0.145550,-0.295150,0.030849,0.420951,0.222075,0.179800,0.274724,-0.423975,...,-1.650575,0.203600,-0.003250,-0.064800,-0.037675,0.076499,0.201825,0.416875,0.287450,-0.971700
MOAR012_BC3C_24H:P22,0.648700,0.058749,-0.031700,0.408249,-0.753950,0.332200,-0.357525,-0.107650,-0.213575,0.074225,...,0.127625,0.031600,0.103250,-0.249600,0.046375,1.486200,0.440325,0.090075,-0.031650,-0.944300
MOAR012_BC3C_24H:P23,0.090499,-0.469300,-0.611800,0.873550,-0.788450,-0.097199,-0.366575,-0.490600,-0.624675,-0.009275,...,0.054676,-0.596050,0.084600,0.444700,0.431375,-0.921501,0.044926,0.716076,-0.000900,-1.106700


In [38]:
Data_norm_df = Data_norm_df[Data_norm_df.index.isin(exp_ids)]

# arrange experiments in same order as in list
Data_norm_df["sort_col"] = Data_norm_df.index.map({val: i for i, val in enumerate(exp_ids)})
Data_norm_df = Data_norm_df.sort_values("sort_col")
Data_norm_df = Data_norm_df.drop("sort_col", axis = 1)

# transpose
Data_norm_df = Data_norm_df.T

display(Data_norm_df)

Unnamed: 0,ASG002_BC3C_24H:A10,ASG002_BC3C_24H:A11,ASG002_BC3C_24H:A19,ASG002_BC3C_24H:A20,ASG002_BC3C_24H:A21,ASG002_BC3C_24H:B10,ASG002_BC3C_24H:B11,ASG002_BC3C_24H:B12,ASG002_BC3C_24H:B14,ASG002_BC3C_24H:B15,...,MOAR010_BC3C_24H:K09,MOAR010_BC3C_24H:L19,MOAR010_BC3C_24H:L21,MOAR011_BC3C_24H:C01,MOAR011_BC3C_24H:C02,MOAR011_BC3C_24H:C03,MOAR011_BC3C_24H:C10,MOAR011_BC3C_24H:C11,MOAR011_BC3C_24H:F07,MOAR011_BC3C_24H:F08
AARS,-0.496854,0.288446,0.189747,-0.016454,0.080746,0.282346,0.326246,0.244946,0.303046,0.387546,...,-0.217618,-0.074268,-0.004018,0.215100,-0.178100,-0.007000,0.024000,0.007400,0.584300,0.114500
ABCB6,-0.658596,-0.142196,-0.075397,-0.383796,-0.199996,-0.074197,0.108804,0.055204,-0.399196,-0.227496,...,0.202852,-0.003798,0.326502,0.127850,0.081150,-0.026850,-0.152851,0.122550,0.006950,-0.037150
ABCC5,-0.080204,0.231996,-0.329354,-0.225204,0.278446,0.034696,0.396596,0.288746,-0.255704,-0.254154,...,0.011493,0.187043,0.358943,0.197500,0.119350,0.212450,-0.177250,0.138650,-0.238750,0.409550
ABCF1,0.202535,0.602335,0.403335,0.313134,-0.083265,-0.056365,-0.387216,0.104534,0.097235,0.540634,...,-0.458439,1.132361,-0.556689,-0.036825,-0.080325,0.059175,0.307175,0.580075,0.142675,0.077875
ABCF3,-0.520919,-0.192819,0.001032,-0.096419,0.210881,-0.731118,0.095381,0.049931,0.313481,-0.078018,...,0.201432,-0.028568,0.610582,-0.378251,0.003950,0.127300,-0.197401,-0.128351,-0.468050,-0.140650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF395,1.796254,1.773154,0.668354,0.825354,0.350654,0.205954,-0.998346,-0.458247,0.235354,0.230653,...,-0.285181,-1.188080,-1.175880,0.300250,-0.265350,-0.380850,-0.330950,-1.844650,0.257250,-1.828550
ZNF451,-0.244519,0.116732,0.058081,-0.178169,-0.080619,-0.139119,0.064681,-0.237719,-0.083019,-0.272119,...,0.085807,0.107632,0.105732,-0.324500,0.080550,-0.114950,-0.337650,-0.024950,0.080650,0.119950
ZNF586,0.097627,0.061027,-0.337573,0.098427,-0.338173,0.017627,-0.197423,-0.155173,-0.078473,-0.300473,...,0.056085,-0.005365,-0.191564,-0.304650,-0.259400,0.163600,-0.428700,-0.069200,-0.561900,0.202100
ZNF589,0.608573,0.106123,-0.014126,-0.003677,-0.123477,-0.243377,0.044623,0.092123,-0.021477,0.201823,...,-0.200241,-0.027441,-0.072891,-0.315500,-0.013900,-0.035050,1.836400,-0.223300,-0.557600,-0.364600


In [39]:
genes = Data_norm_df.index.tolist()
print('Landmark genes list: ', len(genes), genes)

n_genes = len(genes)

Landmark genes list:  978 ['AARS', 'ABCB6', 'ABCC5', 'ABCF1', 'ABCF3', 'ABHD4', 'ABHD6', 'ABL1', 'ACAA1', 'ACAT2', 'ACBD3', 'ACD', 'ACLY', 'ACOT9', 'ADAM10', 'ADAT1', 'ADGRE5', 'ADGRG1', 'ADH5', 'ADI1', 'ADO', 'ADRB2', 'AGL', 'AKAP8', 'AKAP8L', 'AKR7A2', 'AKT1', 'ALAS1', 'ALDH7A1', 'ALDOA', 'ALDOC', 'AMDHD2', 'ANKRD10', 'ANO10', 'ANXA7', 'APBB2', 'APOE', 'APP', 'APPBP2', 'ARFIP2', 'ARHGAP1', 'ARHGEF12', 'ARHGEF2', 'ARID4B', 'ARID5B', 'ARL4C', 'ARNT2', 'ARPP19', 'ASAH1', 'ASCC3', 'ATF1', 'ATF5', 'ATF6', 'ATG3', 'ATMIN', 'ATP11B', 'ATP1B1', 'ATP2C1', 'ATP6V0B', 'ATP6V1D', 'AURKA', 'AURKB', 'AXIN1', 'B4GAT1', 'BACE2', 'BAD', 'BAG3', 'BAMBI', 'BAX', 'BCL2', 'BCL7B', 'BDH1', 'BECN1', 'BHLHE40', 'BID', 'BIRC2', 'BIRC5', 'BLCAP', 'BLMH', 'BLVRA', 'BMP4', 'BNIP3', 'BNIP3L', 'BPHL', 'BRCA1', 'BTK', 'BUB1B', 'BZW2', 'C2CD2', 'C2CD2L', 'C2CD5', 'C5', 'CAB39', 'CALM3', 'CALU', 'CAMSAP2', 'CANT1', 'CAPN1', 'CARMIL1', 'CASC3', 'CASK', 'CASP10', 'CASP2', 'CASP3', 'CASP7', 'CAST', 'CAT', 'CBLB', 'CBR1

## Inhibitor concentrations, IC50, and perturbation matrices

In [40]:
inhib_conc_matrix = np.zeros((n_modules, n_experiments))
ic50_matrix = np.ones((n_modules, n_experiments))
gamma_matrix = np.zeros((n_modules, n_experiments))

In [41]:

for i, module in enumerate(modules):
    drugs_for_module = IC50_df.Drug[IC50_df.Module == module].tolist()
    for drug in drugs_for_module:
        # get IC50 for this drug
        ic50 = IC50_df.IC50[IC50_df.Drug == drug].values
#       gamma = IC50_df.Gamma[IC50_df.Drug == drug].values
        print(drug, ic50)
        assert ic50.size == 1
#       assert gamma.size == 1
        # get experiments with this drug
        exp_with_drug = sig_info_df.index[sig_info_df.pert_drug == drug].tolist()
        print(exp_with_drug) 
        for exp_id in exp_with_drug:
            j = exp_ids.index(exp_id)
            print(j)
            # extract inhibitor concentration
            inhib_conc = sig_info_df.dose_float[sig_info_df.index == exp_id].values
            assert inhib_conc.size == 1
            # insert values in matrices
            inhib_conc_matrix[i, j] = inhib_conc.item()
            ic50_matrix[i, j] = ic50.item()
#           gamma_matrix[i, j] = gamma.item()


flufenamic-acid [3.]
['MOAR008_BC3C_24H:L01', 'MOAR008_BC3C_24H:L02']
67
68
nandrolone [9.]
[]
oxandrolone [190.3]
['MOAR011_BC3C_24H:C01', 'MOAR011_BC3C_24H:C02', 'MOAR011_BC3C_24H:C03']
83
84
85
testosterone-enanthate [200.]
['MOAR011_BC3C_24H:C10', 'MOAR011_BC3C_24H:C11']
86
87
testosterone-propionate [124.]
['MOAR010_BC3C_24H:L19', 'MOAR010_BC3C_24H:L21']
81
82
JNJ-7706621 [0.027]
['ASG002_BC3C_24H:N13', 'ASG002_BC3C_24H:N14', 'ASG002_BC3C_24H:N15']
49
50
51
PHA-793887 [0.18]
['ASG002_BC3C_24H:L01', 'ASG002_BC3C_24H:L03']
42
43
roscovitine [2.]
['ASG002_BC3C_24H:E22', 'ASG002_BC3C_24H:E23', 'ASG002_BC3C_24H:E24']
19
20
21
alvocidib [0.12]
['ASG002_BC3C_24H:F06']
24
palbociclib [0.045]
['ASG002_BC3C_24H:P16', 'ASG002_BC3C_24H:P17', 'ASG002_BC3C_24H:P18']
63
64
65
afatinib [0.03]
['ASG002_BC3C_24H:N22', 'ASG002_BC3C_24H:N23']
53
54
erlotinib [0.006]
['ASG002_BC3C_24H:H16', 'ASG002_BC3C_24H:H17', 'ASG002_BC3C_24H:H18']
33
34
35
gefitinib [0.171]
['ASG002_BC3C_24H:F16', 'ASG002_BC3C_24

In [42]:
# transform matrices into pandas dfs for export with row and column names
inhib_conc_df = pd.DataFrame(inhib_conc_matrix, index = modules, columns = exp_ids)
ic50_df = pd.DataFrame(ic50_matrix, index = modules, columns = exp_ids)
# gamma_df = pd.DataFrame(gamma_matrix, index = modules, columns = exp_ids)

# create binary perturbation matrix
pert_df = pd.DataFrame(
    np.where(inhib_conc_matrix != 0, 1, 0),
    index = inhib_conc_df.index,
    columns = inhib_conc_df.columns,
)

In [43]:
display(ic50_df)
# display(gamma_df)
display(inhib_conc_df)
display(pert_df)

Unnamed: 0,ASG002_BC3C_24H:A10,ASG002_BC3C_24H:A11,ASG002_BC3C_24H:A19,ASG002_BC3C_24H:A20,ASG002_BC3C_24H:A21,ASG002_BC3C_24H:B10,ASG002_BC3C_24H:B11,ASG002_BC3C_24H:B12,ASG002_BC3C_24H:B14,ASG002_BC3C_24H:B15,...,MOAR010_BC3C_24H:K09,MOAR010_BC3C_24H:L19,MOAR010_BC3C_24H:L21,MOAR011_BC3C_24H:C01,MOAR011_BC3C_24H:C02,MOAR011_BC3C_24H:C03,MOAR011_BC3C_24H:C10,MOAR011_BC3C_24H:C11,MOAR011_BC3C_24H:F07,MOAR011_BC3C_24H:F08
Androgen,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,124.0,124.0,190.3,190.3,190.3,200.0,200.0,1.0,1.0
CDK1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDK2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDK4_6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
EGFR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Estrogen,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FGFR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PI3K,0.00262,0.00262,0.1595,0.1595,0.1595,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
p53,1.0,1.0,1.0,1.0,1.0,0.0018,0.0018,0.0018,9.75,9.75,...,0.0264,1.0,1.0,1.0,1.0,1.0,1.0,1.0,62.15,62.15
TOP2A,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,ASG002_BC3C_24H:A10,ASG002_BC3C_24H:A11,ASG002_BC3C_24H:A19,ASG002_BC3C_24H:A20,ASG002_BC3C_24H:A21,ASG002_BC3C_24H:B10,ASG002_BC3C_24H:B11,ASG002_BC3C_24H:B12,ASG002_BC3C_24H:B14,ASG002_BC3C_24H:B15,...,MOAR010_BC3C_24H:K09,MOAR010_BC3C_24H:L19,MOAR010_BC3C_24H:L21,MOAR011_BC3C_24H:C01,MOAR011_BC3C_24H:C02,MOAR011_BC3C_24H:C03,MOAR011_BC3C_24H:C10,MOAR011_BC3C_24H:C11,MOAR011_BC3C_24H:F07,MOAR011_BC3C_24H:F08
Androgen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,1.11,10.0,3.33,1.11,10.0,3.33,0.0,0.0
CDK1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CDK2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CDK4_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EGFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Estrogen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FGFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PI3K,10.0,1.11,10.0,1.11,0.12,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p53,0.0,0.0,0.0,0.0,0.0,10.0,1.11,0.12,1.11,0.08,...,1.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,3.33
TOP2A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,ASG002_BC3C_24H:A10,ASG002_BC3C_24H:A11,ASG002_BC3C_24H:A19,ASG002_BC3C_24H:A20,ASG002_BC3C_24H:A21,ASG002_BC3C_24H:B10,ASG002_BC3C_24H:B11,ASG002_BC3C_24H:B12,ASG002_BC3C_24H:B14,ASG002_BC3C_24H:B15,...,MOAR010_BC3C_24H:K09,MOAR010_BC3C_24H:L19,MOAR010_BC3C_24H:L21,MOAR011_BC3C_24H:C01,MOAR011_BC3C_24H:C02,MOAR011_BC3C_24H:C03,MOAR011_BC3C_24H:C10,MOAR011_BC3C_24H:C11,MOAR011_BC3C_24H:F07,MOAR011_BC3C_24H:F08
Androgen,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,0,0
CDK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDK2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDK4_6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
EGFR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Estrogen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FGFR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PI3K,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p53,0,0,0,0,0,1,1,1,1,1,...,1,0,0,0,0,0,0,0,1,1
TOP2A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Global responses for DPD modules

According to our discussion, $R$ for DPD vectors can not be calculated with the same formula as for pathway activities. Instead we are assuming:

\begin{equation}
R_{DPD},j = DPD = STV_{DPD} \cdot Data_j
\end{equation}

In [44]:
# load STV data frame
STVs = pd.read_excel(os.path.join(info_dir, "ALL_DATA_2020_Jing.xlsx"), sheet_name = "STV", index_col = 0)
STV_df = pd.DataFrame(np.zeros((len(Data_norm_df.index), 3)), index = Data_norm_df.index, columns = STVs.columns)
STV_df.loc[STVs.index] = STVs

display(STV_df)

Unnamed: 0,blca_invasiveness,blca_oncogenesis,blca_survival
AARS,0.016052,0.078487,0.000000
ABCB6,0.001612,0.027125,-2.890931
ABCC5,-0.016742,0.002121,0.000000
ABCF1,-0.038990,-0.028959,-0.496186
ABCF3,0.023288,0.036671,0.000000
...,...,...,...
ZNF395,0.000000,0.000000,0.000000
ZNF451,0.011450,0.013403,0.000000
ZNF586,0.000000,0.000000,0.119103
ZNF589,0.000000,0.000000,0.000000


In [45]:
# create empty DPD data frame
DPD_df = pd.DataFrame(
    np.zeros((len(Data_norm_df.columns), len(STV_df.columns))),
    index = Data_norm_df.columns,
    columns = STV_df.columns,
)

# populate
for exp_id in DPD_df.index:
    for state in STV_df.columns:
        DPD_df.loc[exp_id, state] = np.dot(Data_norm_df.T.loc[exp_id], STV_df.loc[:, state])

display(DPD_df)

Unnamed: 0,blca_invasiveness,blca_oncogenesis,blca_survival
ASG002_BC3C_24H:A10,0.651175,-0.610650,-17.377163
ASG002_BC3C_24H:A11,-0.037418,-0.936049,-5.764659
ASG002_BC3C_24H:A19,0.483620,-0.048496,9.872480
ASG002_BC3C_24H:A20,0.405943,-0.647673,-0.584775
ASG002_BC3C_24H:A21,0.466195,0.031650,-6.200738
...,...,...,...
MOAR011_BC3C_24H:C03,0.320511,-0.068887,5.703725
MOAR011_BC3C_24H:C10,0.401778,-0.875167,9.754974
MOAR011_BC3C_24H:C11,0.426267,-0.261326,-4.383434
MOAR011_BC3C_24H:F07,1.080049,0.259917,6.540675


In [46]:
# transform to R global
R_global_DPD_df = DPD_df.T
display(R_global_DPD_df)

Unnamed: 0,ASG002_BC3C_24H:A10,ASG002_BC3C_24H:A11,ASG002_BC3C_24H:A19,ASG002_BC3C_24H:A20,ASG002_BC3C_24H:A21,ASG002_BC3C_24H:B10,ASG002_BC3C_24H:B11,ASG002_BC3C_24H:B12,ASG002_BC3C_24H:B14,ASG002_BC3C_24H:B15,...,MOAR010_BC3C_24H:K09,MOAR010_BC3C_24H:L19,MOAR010_BC3C_24H:L21,MOAR011_BC3C_24H:C01,MOAR011_BC3C_24H:C02,MOAR011_BC3C_24H:C03,MOAR011_BC3C_24H:C10,MOAR011_BC3C_24H:C11,MOAR011_BC3C_24H:F07,MOAR011_BC3C_24H:F08
blca_invasiveness,0.651175,-0.037418,0.48362,0.405943,0.466195,-0.079592,0.665998,0.83643,0.403368,0.550261,...,-0.308377,-0.257099,-0.853594,0.294934,0.513808,0.320511,0.401778,0.426267,1.080049,0.424453
blca_oncogenesis,-0.61065,-0.936049,-0.048496,-0.647673,0.03165,0.185978,0.911126,0.222657,-0.079387,-0.459741,...,-0.517466,0.295985,0.785934,0.039863,0.106842,-0.068887,-0.875167,-0.261326,0.259917,0.769415
blca_survival,-17.377163,-5.764659,9.87248,-0.584775,-6.200738,1.836483,-3.060844,-22.856785,-3.5814,1.041249,...,-8.942985,7.883618,11.324876,7.548515,1.06482,5.703725,9.754974,-4.383434,6.540675,12.596146


## Save outputs

In [47]:
# save metadata as pickle
all_metadata = {
    "modules": modules,
    "n_modules": n_modules,
    "drugs": drugs,
    "n_drugs": n_drugs,
    "exp_ids": exp_ids,
    "n_experiments": n_experiments,
    "genes": genes,
    "n_genes": n_genes,
}

print(all_metadata)

with open(os.path.join(out_dir, "metadata.pickle"), "wb") as f:
    pickle.dump(all_metadata, f, protocol = pickle.HIGHEST_PROTOCOL)

{'modules': ['Androgen', 'CDK1', 'CDK2', 'CDK4_6', 'EGFR', 'Estrogen', 'FGFR', 'PI3K', 'p53', 'TOP2A'], 'n_modules': 10, 'drugs': ['NVP-BEZ235', 'daunorubicin', 'nandrolone', 'nutlin-3', 'oxandrolone', 'RITA', 'PHA-793887', 'serdemetan', 'idarubicin', 'raloxifene', 'mitoxantrone', 'KU-0063794', 'LY-294002', 'testosterone-enanthate', 'flufenamic-acid', 'testosterone-propionate', 'SAR405838', 'lapatinib', 'masitinib', 'sorafenib', 'PI-103', 'HLI-373', 'alvocidib', 'gefitinib', 'dienestrol', 'JNJ-7706621', 'roscovitine', 'GDC-0349', 'palbociclib', 'AMG-232', 'erlotinib', 'ponatinib', 'AZD-8055', 'afatinib', 'vandetanib', 'taselisib', 'estradiol-cypionate', 'epirubicin', 'AS-605240'], 'n_drugs': 39, 'exp_ids': ['ASG002_BC3C_24H:A10', 'ASG002_BC3C_24H:A11', 'ASG002_BC3C_24H:A19', 'ASG002_BC3C_24H:A20', 'ASG002_BC3C_24H:A21', 'ASG002_BC3C_24H:B10', 'ASG002_BC3C_24H:B11', 'ASG002_BC3C_24H:B12', 'ASG002_BC3C_24H:B14', 'ASG002_BC3C_24H:B15', 'ASG002_BC3C_24H:C13', 'ASG002_BC3C_24H:C14', 'ASG002

In [48]:
# save doses and perturbation matrix
inhib_conc_df.to_csv(os.path.join(out_dir, "inhib_conc_annotated.csv"))
ic50_df.to_csv(os.path.join(out_dir, "ic50_annotated.csv"))
# gamma_df.to_csv(os.path.join(out_dir, "gamma_annotated.csv"))
pert_df.to_csv(os.path.join(out_dir, "pert_annotated.csv"))

In [49]:
# save log fold change L1000 data
Data_norm_df.to_csv(os.path.join(out_dir, "L1000_Data_norm_data.csv"))

In [50]:
# save R_global for DPDs
R_global_DPD_df.to_csv(os.path.join(out_dir, "R_global_DPDonly_annotated.csv"))