# 01 Data preparation

Processing xlsx files from data folder into suitable inputs and generate other input files

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

In [2]:


data_dir = '/home/jing/Phd_project/project_GBM/gbm_Scripts/gbm_Scripts_bmra_u251mg/00_outputs_2020_U251MG/'
info_dir = data_dir


out_dir = "01_outputs_2020"

os.makedirs(out_dir, exist_ok = True)


## Modules

Load data about modules and drugs.

In [3]:
### DATA
### remove whitespaces in names (modules), remove duplicates

modules_df = pd.read_excel(os.path.join(data_dir,'ALL_DATA_2020_Jing_u251mg.xlsx'), sheet_name = "modules", index_col = 0)
# display(modules_df)

selected_modules = modules_df.index.tolist()
print(len(selected_modules), ' - Size after reading')

# remove duplicates
selected_modules = modules_df.index.unique().tolist()
print(len(selected_modules), ' - Size after remove duplicates')

# remove whitespaces in modules' names 
selected_modules = [d.strip() for d in selected_modules]

print('Selected modules list: ', len(selected_modules), selected_modules)


10  - Size after reading
10  - Size after remove duplicates
Selected modules list:  10 ['CDK2', 'CDK4_6', 'p53', 'EGFR', 'Aurora', 'Estrogen', 'PDGFR', 'Hypoxia', 'ERK', 'PI3K']


In [4]:
### DATA
### remove whitespaces in names (modules, drugs), remove duplicates
### check the dimensions of the indicator IC50 1 uM = 1000 nM
### copy-paste as values, numbers, no formulas

IC50_df = pd.read_excel(os.path.join(data_dir, "ALL_DATA_2020_Jing_u251mg.xlsx"), sheet_name = "IC50s")
IC50_df.drop(columns=['Notes','Removed'],inplace=True)

print(len(IC50_df.index), ' - Size after reading')
# display(IC50_df)

# rename
IC50_df = IC50_df.rename(columns = {"IC50, uM": "IC50"})

# manually correcting value IC50, Example  for IOX2 -> 30 nM  
#IC50_df.loc[IC50_df.index == 'IOX2', IC50_df.columns == 'IC50'] = 30/1000

# remove non-selected modules, modules' names with whitespaces or empty
IC50_df = IC50_df[IC50_df.Module.isin(selected_modules)]

print(len(IC50_df.index), ' - Size after remove modules')
# display(IC50_df)

# remove duplicates 
# Considering certain columns is optional. 
# Indexes, including time indexes are ignored.
IC50_df = IC50_df.drop_duplicates()

print(len(IC50_df.index), ' - Size after remove duplicates')
display(IC50_df)

26  - Size after reading
26  - Size after remove modules
26  - Size after remove duplicates


Unnamed: 0,Drug,Module,IC50,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,roscovitine,CDK2,2.0,,,
1,palbociclib,CDK4_6,0.045,CDK4_6,0.12,
2,LY-294002,PI3K,1.36,PI3K,0.1595,
3,NVP-BEZ235,PI3K,1.71,PI3K,3.0,
4,taselisib,PI3K,0.00262,PI3K,0.09,30nm
5,AZ-628,ERK,3.0,ERK,0.015,martin's
6,dabrafenib,ERK,0.1,ERK,0.00408,0.92 1.8
7,selumetinib,ERK,0.643,EGFR,1.5,500 nm
8,vemurafenib,ERK,0.1185,Hypoxia,0.0021,"0,7nm"
9,dienestrol,Estrogen,200.0,Hypoxia,13.2,4.4 μM


In [5]:
modules = IC50_df.Module.unique().tolist()

print('IC50_df  modules list: ', len(modules), modules)
print()
print('Selected modules list: ', len(selected_modules), selected_modules)

### CHECK
print()
print('CHECK: ', len(selected_modules),'=?', len(modules))

n_modules = len(modules)


IC50_df  modules list:  10 ['CDK2', 'CDK4_6', 'PI3K', 'ERK', 'Estrogen', 'EGFR', 'Hypoxia', 'PDGFR', 'Aurora', 'p53']

Selected modules list:  10 ['CDK2', 'CDK4_6', 'p53', 'EGFR', 'Aurora', 'Estrogen', 'PDGFR', 'Hypoxia', 'ERK', 'PI3K']

CHECK:  10 =? 10


In [6]:
drugs = IC50_df.Drug.tolist()
print(len(drugs), ' - Size after reading')

# remove duplicates
drugs = IC50_df.Drug.unique().tolist()
print(len(drugs), ' - Size after remove duplicates')

# remove whitespaces in drugs' names (necessary for some)
drugs = [d.strip() for d in drugs]

# remove duplicates after remove whitespaces
drugs = list(set(drugs))
print(len(drugs), ' - Size after remove duplicates without whitespaces')

print('Drugs list: ', len(drugs), drugs)

n_drugs = len(drugs)

26  - Size after reading
26  - Size after remove duplicates
26  - Size after remove duplicates without whitespaces
Drugs list:  26 ['vorinostat', 'raloxifene', 'dienestrol', 'erlotinib', 'selumetinib', 'taselisib', 'masitinib', 'nutlin-3', 'NVP-BEZ235', 'panobinostat', 'entinostat', 'roscovitine', 'tandutinib', 'vemurafenib', 'LY-294002', 'dabrafenib', 'tozasertib', 'estradiol-cypionate', 'palbociclib', 'imatinib', 'AMG-232', 'AZ-628', 'gefitinib', 'barasertib-HQPA', 'afatinib', 'RITA']


## L1000 meta data

Get sig_id for selected drugs.

In [7]:
sig_info_df = pd.read_excel(os.path.join(data_dir, "sig_info_2020_U251MG.xlsx"), index_col = 0)

display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASG002_U251MG_24H:A03,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:A03,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_U251MG_24H:A04,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:A04,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_U251MG_24H:A05,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:A05,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_U251MG_24H:J13,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:J13,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_U251MG_24H:J14,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:J14,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
...,...,...,...,...,...,...,...,...,...,...,...
ASG002_U251MG_24H:P20,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:P20,1,trt_cp,idarubicin,TOP2A,1,1.11 uM,1.11
ASG002_U251MG_24H:P21,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:P21,1,trt_cp,idarubicin,TOP2A,1,0.12 uM,0.12
ASG002_U251MG_24H:P22,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:P22,1,trt_cp,digitoxin,ATP1A1,1,10 uM,10.00
ASG002_U251MG_24H:P23,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:P23,1,trt_cp,digitoxin,ATP1A1,1,1.11 uM,1.11


In [8]:
# now filtering so only the required drugs are present
sig_info_df = sig_info_df.loc[sig_info_df.pert_drug.isin(drugs)]

# here's what we have now
display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASG002_U251MG_24H:A10,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:A10,1,trt_cp,taselisib,PIK3CA,1,10 uM,10.00
ASG002_U251MG_24H:A11,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:A11,1,trt_cp,taselisib,PIK3CA,1,1.11 uM,1.11
ASG002_U251MG_24H:A12,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:A12,1,trt_cp,taselisib,PIK3CA,1,0.12 uM,0.12
ASG002_U251MG_24H:B10,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:B10,1,trt_cp,AMG-232,MDM2,1,10 uM,10.00
ASG002_U251MG_24H:B11,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:B11,1,trt_cp,AMG-232,MDM2,1,1.11 uM,1.11
...,...,...,...,...,...,...,...,...,...,...,...
ASG002_U251MG_24H:O11,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:O11,1,trt_cp,raloxifene,"ESR1, ESR2",2,1.11 uM,1.11
ASG002_U251MG_24H:O12,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:O12,1,trt_cp,raloxifene,"ESR1, ESR2",2,0.12 uM,0.12
ASG002_U251MG_24H:P16,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:P16,1,trt_cp,palbociclib,"CDK4, CDK6",2,10 uM,10.00
ASG002_U251MG_24H:P17,U251MG,ASG002,24 h,ASG002_U251MG_24H_X1_B35:P17,1,trt_cp,palbociclib,"CDK4, CDK6",2,1.11 uM,1.11


Manually remove  few inhibition from data set, since it does differ from the other data points.

inhib_to_filter = "PF-03758309"
dose_to_filter = 10

id_to_filter = sig_info_df[
    np.logical_and(
        sig_info_df.pert_drug == inhib_to_filter,
        sig_info_df.dose_float == dose_to_filter,
    )
].index.values

print(id_to_filter)

sig_info_df = sig_info_df[~sig_info_df.index.isin(id_to_filter)]
display(sig_info_df)

inhib_to_filter = "roscovitine"
dose_to_filter = 3.33

id_to_filter = sig_info_df[
    np.logical_and(
        sig_info_df.pert_drug == inhib_to_filter,
        sig_info_df.dose_float == dose_to_filter,
    )
].index.values

print(id_to_filter)

sig_info_df = sig_info_df[~sig_info_df.index.isin(id_to_filter)]
display(sig_info_df)

In [9]:
exp_ids = sig_info_df.index.unique().tolist()
print('Experiments ids list: ', len(exp_ids))

n_experiments = len(exp_ids)

Experiments ids list:  77


Confirm data by checking the drugs of interest against the filtered L1000 meta data.

In [10]:
print(f"Number of drugs of interest:\t{len(drugs)}")
#print(f'Number of drugs in L1000 data:\t{len(sig_info_df.value_counts("drugs"))}')

#sig_info_df.value_counts("drugs")

Number of drugs of interest:	26


## L1000 data

In [11]:
Data_norm_df = pd.read_excel(os.path.join(data_dir, "Data_norm_2020_U251MG.xlsx"), index_col = 0)
display(Data_norm_df)

Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
ASG002_U251MG_24H:A03,-0.178654,-0.106862,-0.053731,-0.205517,0.046979,-0.194327,-0.089152,-0.179361,0.580473,-0.167862,...,1.102416,0.583696,-0.301771,-0.348191,-0.085294,0.432292,0.038529,0.056377,0.169081,0.546863
ASG002_U251MG_24H:A04,-0.248680,-0.225963,-0.163556,0.102533,-0.094521,-0.197752,-0.163377,-0.010686,0.259747,-0.332512,...,0.538267,-0.496454,-0.205721,-0.418292,-0.023744,-0.851458,0.069729,0.042527,0.191131,0.021112
ASG002_U251MG_24H:A05,-0.318979,-0.134862,0.162119,0.012683,0.236779,0.052273,-0.033902,0.260514,0.662598,0.126488,...,-0.358134,-0.290955,0.919129,-0.370292,0.109456,-0.814808,-0.214571,0.144152,-0.026894,0.258962
ASG002_U251MG_24H:J13,-0.017829,-0.109138,-0.439906,-0.024517,-0.276021,-0.153127,0.154698,-0.142136,-0.579652,-0.060863,...,0.318117,0.102971,-0.366371,0.459958,-0.106243,3.750292,-0.208071,0.016977,-0.028694,-1.352387
ASG002_U251MG_24H:J14,0.080071,0.071262,0.019169,-0.044467,-0.038345,0.172273,0.086048,-0.017986,0.383748,0.217487,...,-0.775883,-0.376554,-0.269170,0.549808,-0.072744,-0.168908,0.191229,-0.164898,0.107981,0.059913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ASG002_U251MG_24H:P20,-1.165779,-1.174662,-0.033031,0.864033,-2.100671,-0.869127,0.980848,-0.957510,-2.027652,-0.916212,...,-1.323284,-1.647129,-0.519096,0.569209,0.038007,3.672717,-0.317571,-0.919523,0.538206,-0.492037
ASG002_U251MG_24H:P21,2.698371,-0.052563,0.054318,0.396834,0.475929,0.169673,-0.033352,0.673964,0.264697,-0.116463,...,-1.609534,-0.306254,1.567429,-0.415591,0.827707,-1.394509,0.763679,0.609552,0.484657,-0.194987
ASG002_U251MG_24H:P22,-2.207129,-1.624262,-0.820181,0.541859,-0.045420,-0.630427,-0.345902,1.146464,-1.532527,-1.898763,...,-1.565234,2.628596,1.788479,-0.665641,-0.261144,-2.291608,1.374029,1.721677,-1.342619,-0.320988
ASG002_U251MG_24H:P23,-1.404754,-2.019962,-0.160156,-0.557817,-0.016970,-0.422152,0.227673,0.505590,-1.000402,-0.751763,...,-0.537583,-0.185179,1.177080,-0.563542,-0.071394,-2.813858,-0.135696,0.358527,-0.698244,-1.047038


In [12]:
Data_norm_df = Data_norm_df[Data_norm_df.index.isin(exp_ids)]

# arrange experiments in same order as in list
Data_norm_df["sort_col"] = Data_norm_df.index.map({val: i for i, val in enumerate(exp_ids)})
Data_norm_df = Data_norm_df.sort_values("sort_col")
Data_norm_df = Data_norm_df.drop("sort_col", axis = 1)

# transpose
Data_norm_df = Data_norm_df.T

display(Data_norm_df)

Unnamed: 0,ASG002_U251MG_24H:A10,ASG002_U251MG_24H:A11,ASG002_U251MG_24H:A12,ASG002_U251MG_24H:B10,ASG002_U251MG_24H:B11,ASG002_U251MG_24H:B12,ASG002_U251MG_24H:B13,ASG002_U251MG_24H:B14,ASG002_U251MG_24H:B15,ASG002_U251MG_24H:C13,...,ASG002_U251MG_24H:M24,ASG002_U251MG_24H:N22,ASG002_U251MG_24H:N23,ASG002_U251MG_24H:N24,ASG002_U251MG_24H:O10,ASG002_U251MG_24H:O11,ASG002_U251MG_24H:O12,ASG002_U251MG_24H:P16,ASG002_U251MG_24H:P17,ASG002_U251MG_24H:P18
AARS,-0.494279,0.151221,0.044521,0.206571,0.166671,-0.091379,-0.026729,0.303671,0.039821,0.217921,...,-0.098579,3.022271,-0.015680,-0.099629,0.299922,0.277471,0.096071,0.832671,0.220846,0.072772
ABCB6,-1.826287,-0.041488,0.178438,0.110487,-0.029463,-0.285112,0.090837,-0.044662,0.171337,0.070188,...,0.333588,-0.760087,0.247288,0.141338,0.333538,0.217837,0.230188,0.000463,0.205563,0.187887
ABCC5,0.031419,0.078144,0.103768,-0.010331,-0.374056,-0.301856,-0.305206,-0.094331,0.370169,0.024319,...,0.283194,-0.002106,0.119519,0.162919,0.121569,0.302919,0.052144,0.758819,0.120744,0.103768
ABCF1,0.590333,0.201883,-0.273517,0.330033,0.056033,0.299683,1.457133,0.486584,-0.138317,0.370433,...,-0.500867,0.157483,-0.412817,-0.505992,-0.285667,-0.235766,-0.395367,-0.099417,-0.272317,-0.077166
ABCF3,-1.080121,-0.267320,-0.074120,0.028229,-0.318671,-0.041821,-0.247721,0.192779,0.345180,0.213029,...,0.143029,0.150079,0.041629,0.122729,0.256479,-0.114871,0.002480,0.220430,0.331180,-0.179221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF395,3.018842,-0.629308,-0.971758,-0.667733,0.666241,0.975591,1.503742,2.084791,-0.389408,0.517892,...,-0.284808,-1.416209,-2.621309,-1.658833,-2.677783,-1.649109,-1.658758,0.383192,-0.587058,-0.025608
ZNF451,-0.510021,-0.131121,0.029579,-0.281321,-0.420571,0.046280,-0.039596,-0.289721,-0.004921,-0.025370,...,-0.205671,0.489030,-0.137821,0.168030,-0.069921,-0.337121,-0.003870,-0.479195,-0.062521,-0.240771
ZNF586,-0.367248,0.254077,0.146027,-0.032673,-0.160773,0.038127,-0.191123,-0.135073,-0.062773,0.054427,...,-0.070823,0.307402,-0.167823,-0.037523,-0.123373,0.188677,0.419677,-0.547323,-0.485098,-0.404173
ZNF589,-0.299019,-0.264469,-0.133344,-0.057144,-0.259994,0.077506,0.615782,0.044156,0.148156,-0.233594,...,-0.445244,0.290656,2.039806,-0.277594,-0.250894,0.449406,0.122156,0.033006,-0.325444,-0.072919


In [13]:
genes = Data_norm_df.index.tolist()
print('Landmark genes list: ', len(genes), genes)

n_genes = len(genes)

Landmark genes list:  978 ['AARS', 'ABCB6', 'ABCC5', 'ABCF1', 'ABCF3', 'ABHD4', 'ABHD6', 'ABL1', 'ACAA1', 'ACAT2', 'ACBD3', 'ACD', 'ACLY', 'ACOT9', 'ADAM10', 'ADAT1', 'ADGRE5', 'ADGRG1', 'ADH5', 'ADI1', 'ADO', 'ADRB2', 'AGL', 'AKAP8', 'AKAP8L', 'AKR7A2', 'AKT1', 'ALAS1', 'ALDH7A1', 'ALDOA', 'ALDOC', 'AMDHD2', 'ANKRD10', 'ANO10', 'ANXA7', 'APBB2', 'APOE', 'APP', 'APPBP2', 'ARFIP2', 'ARHGAP1', 'ARHGEF12', 'ARHGEF2', 'ARID4B', 'ARID5B', 'ARL4C', 'ARNT2', 'ARPP19', 'ASAH1', 'ASCC3', 'ATF1', 'ATF5', 'ATF6', 'ATG3', 'ATMIN', 'ATP11B', 'ATP1B1', 'ATP2C1', 'ATP6V0B', 'ATP6V1D', 'AURKA', 'AURKB', 'AXIN1', 'B4GAT1', 'BACE2', 'BAD', 'BAG3', 'BAMBI', 'BAX', 'BCL2', 'BCL7B', 'BDH1', 'BECN1', 'BHLHE40', 'BID', 'BIRC2', 'BIRC5', 'BLCAP', 'BLMH', 'BLVRA', 'BMP4', 'BNIP3', 'BNIP3L', 'BPHL', 'BRCA1', 'BTK', 'BUB1B', 'BZW2', 'C2CD2', 'C2CD2L', 'C2CD5', 'C5', 'CAB39', 'CALM3', 'CALU', 'CAMSAP2', 'CANT1', 'CAPN1', 'CARMIL1', 'CASC3', 'CASK', 'CASP10', 'CASP2', 'CASP3', 'CASP7', 'CAST', 'CAT', 'CBLB', 'CBR1

## Inhibitor concentrations, IC50, and perturbation matrices

In [14]:
inhib_conc_matrix = np.zeros((n_modules, n_experiments))
ic50_matrix = np.ones((n_modules, n_experiments))
gamma_matrix = np.zeros((n_modules, n_experiments))

In [15]:

for i, module in enumerate(modules):
    drugs_for_module = IC50_df.Drug[IC50_df.Module == module].tolist()
    for drug in drugs_for_module:
        # get IC50 for this drug
        ic50 = IC50_df.IC50[IC50_df.Drug == drug].values
#       gamma = IC50_df.Gamma[IC50_df.Drug == drug].values
        print(drug, ic50)
        assert ic50.size == 1
#       assert gamma.size == 1
        # get experiments with this drug
        exp_with_drug = sig_info_df.index[sig_info_df.pert_drug == drug].tolist()
        print(exp_with_drug) 
        for exp_id in exp_with_drug:
            j = exp_ids.index(exp_id)
            print(j)
            # extract inhibitor concentration
            inhib_conc = sig_info_df.dose_float[sig_info_df.index == exp_id].values
            assert inhib_conc.size == 1
            # insert values in matrices
            inhib_conc_matrix[i, j] = inhib_conc.item()
            ic50_matrix[i, j] = ic50.item()
#           gamma_matrix[i, j] = gamma.item()


roscovitine [2.]
['ASG002_U251MG_24H:E22', 'ASG002_U251MG_24H:E23', 'ASG002_U251MG_24H:E24']
24
25
26
palbociclib [0.045]
['ASG002_U251MG_24H:P16', 'ASG002_U251MG_24H:P17', 'ASG002_U251MG_24H:P18']
74
75
76
LY-294002 [1.36]
['ASG002_U251MG_24H:F01', 'ASG002_U251MG_24H:F02', 'ASG002_U251MG_24H:F03']
27
28
29
NVP-BEZ235 [1.71]
['ASG002_U251MG_24H:C16', 'ASG002_U251MG_24H:C17', 'ASG002_U251MG_24H:C18']
12
13
14
taselisib [0.00262]
['ASG002_U251MG_24H:A10', 'ASG002_U251MG_24H:A11', 'ASG002_U251MG_24H:A12']
0
1
2
AZ-628 [3.]
['ASG002_U251MG_24H:J19', 'ASG002_U251MG_24H:J20', 'ASG002_U251MG_24H:J21']
50
51
52
dabrafenib [0.1]
['ASG002_U251MG_24H:K22', 'ASG002_U251MG_24H:K23', 'ASG002_U251MG_24H:K24']
53
54
55
selumetinib [0.643]
['ASG002_U251MG_24H:J04', 'ASG002_U251MG_24H:J05', 'ASG002_U251MG_24H:J06']
44
45
46
vemurafenib [0.1185]
['ASG002_U251MG_24H:M07', 'ASG002_U251MG_24H:M08', 'ASG002_U251MG_24H:M09']
62
63
64
dienestrol [200.]
['ASG002_U251MG_24H:M22', 'ASG002_U251MG_24H:M23', 'ASG002

In [16]:
# transform matrices into pandas dfs for export with row and column names
inhib_conc_df = pd.DataFrame(inhib_conc_matrix, index = modules, columns = exp_ids)
ic50_df = pd.DataFrame(ic50_matrix, index = modules, columns = exp_ids)
# gamma_df = pd.DataFrame(gamma_matrix, index = modules, columns = exp_ids)

# create binary perturbation matrix
pert_df = pd.DataFrame(
    np.where(inhib_conc_matrix != 0, 1, 0),
    index = inhib_conc_df.index,
    columns = inhib_conc_df.columns,
)

In [17]:
display(ic50_df)
# display(gamma_df)
display(inhib_conc_df)
display(pert_df)

Unnamed: 0,ASG002_U251MG_24H:A10,ASG002_U251MG_24H:A11,ASG002_U251MG_24H:A12,ASG002_U251MG_24H:B10,ASG002_U251MG_24H:B11,ASG002_U251MG_24H:B12,ASG002_U251MG_24H:B13,ASG002_U251MG_24H:B14,ASG002_U251MG_24H:B15,ASG002_U251MG_24H:C13,...,ASG002_U251MG_24H:M24,ASG002_U251MG_24H:N22,ASG002_U251MG_24H:N23,ASG002_U251MG_24H:N24,ASG002_U251MG_24H:O10,ASG002_U251MG_24H:O11,ASG002_U251MG_24H:O12,ASG002_U251MG_24H:P16,ASG002_U251MG_24H:P17,ASG002_U251MG_24H:P18
CDK2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDK4_6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.045,0.045,0.045
PI3K,0.00262,0.00262,0.00262,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Estrogen,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,200.0,1.0,1.0,1.0,0.0171,0.0171,0.0171,1.0,1.0,1.0
EGFR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.03,0.03,0.03,1.0,1.0,1.0,1.0,1.0,1.0
Hypoxia,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PDGFR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Aurora,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
p53,1.0,1.0,1.0,0.0018,0.0018,0.0018,9.75,9.75,9.75,0.54,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,ASG002_U251MG_24H:A10,ASG002_U251MG_24H:A11,ASG002_U251MG_24H:A12,ASG002_U251MG_24H:B10,ASG002_U251MG_24H:B11,ASG002_U251MG_24H:B12,ASG002_U251MG_24H:B13,ASG002_U251MG_24H:B14,ASG002_U251MG_24H:B15,ASG002_U251MG_24H:C13,...,ASG002_U251MG_24H:M24,ASG002_U251MG_24H:N22,ASG002_U251MG_24H:N23,ASG002_U251MG_24H:N24,ASG002_U251MG_24H:O10,ASG002_U251MG_24H:O11,ASG002_U251MG_24H:O12,ASG002_U251MG_24H:P16,ASG002_U251MG_24H:P17,ASG002_U251MG_24H:P18
CDK2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CDK4_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.11,0.12
PI3K,10.0,1.11,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Estrogen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.12,0.0,0.0,0.0,10.0,1.11,0.12,0.0,0.0,0.0
EGFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,1.11,0.12,0.0,0.0,0.0,0.0,0.0,0.0
Hypoxia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PDGFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aurora,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p53,0.0,0.0,0.0,10.0,1.11,0.12,10.0,1.11,0.08,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,ASG002_U251MG_24H:A10,ASG002_U251MG_24H:A11,ASG002_U251MG_24H:A12,ASG002_U251MG_24H:B10,ASG002_U251MG_24H:B11,ASG002_U251MG_24H:B12,ASG002_U251MG_24H:B13,ASG002_U251MG_24H:B14,ASG002_U251MG_24H:B15,ASG002_U251MG_24H:C13,...,ASG002_U251MG_24H:M24,ASG002_U251MG_24H:N22,ASG002_U251MG_24H:N23,ASG002_U251MG_24H:N24,ASG002_U251MG_24H:O10,ASG002_U251MG_24H:O11,ASG002_U251MG_24H:O12,ASG002_U251MG_24H:P16,ASG002_U251MG_24H:P17,ASG002_U251MG_24H:P18
CDK2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDK4_6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
PI3K,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Estrogen,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,1,0,0,0
EGFR,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
Hypoxia,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PDGFR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Aurora,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p53,0,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


## Global responses for DPD modules

According to our discussion, $R$ for DPD vectors can not be calculated with the same formula as for pathway activities. Instead we are assuming:

\begin{equation}
R_{DPD},j = DPD = STV_{DPD} \cdot Data_j
\end{equation}

In [18]:
os.path.join(data_dir,'ALL_DATA_2020_Jing_u251mg.xlsx')

'/home/jing/Phd_project/project_GBM/gbm_Scripts/gbm_Scripts_bmra_u251mg/00_outputs_2020_U251MG/ALL_DATA_2020_Jing_u251mg.xlsx'

In [19]:
# load STV data frame
STVs = pd.read_excel(os.path.join(data_dir,'ALL_DATA_2020_Jing_u251mg.xlsx'), sheet_name = "STVs", index_col = 0)

STV_df = pd.DataFrame(np.zeros((len(Data_norm_df.index), 2)), index = Data_norm_df.index, columns = STVs.columns)
STV_df.loc[STVs.index] = STVs

display(STV_df)

Unnamed: 0,GBM_onc,GBM_survival
AARS,-0.018282,-1.228394
ABCB6,0.000000,0.000000
ABCC5,-0.007873,0.000000
ABCF1,-0.011879,-2.215623
ABCF3,0.000000,0.000000
...,...,...
ZNF395,-0.033083,0.000000
ZNF451,0.002199,0.000000
ZNF586,-0.011584,-0.017534
ZNF589,-0.012400,-1.006080


In [20]:
# create empty DPD data frame
DPD_df = pd.DataFrame(
    np.zeros((len(Data_norm_df.columns), len(STV_df.columns))),
    index = Data_norm_df.columns,
    columns = STV_df.columns,
)

# populate
for exp_id in DPD_df.index:
    for state in STV_df.columns:
        DPD_df.loc[exp_id, state] = np.dot(Data_norm_df.T.loc[exp_id], STV_df.loc[:, state])

display(DPD_df)

Unnamed: 0,GBM_onc,GBM_survival
ASG002_U251MG_24H:A10,1.979015,-7.817739
ASG002_U251MG_24H:A11,0.221957,14.055979
ASG002_U251MG_24H:A12,0.379708,1.230610
ASG002_U251MG_24H:B10,0.234524,-3.664586
ASG002_U251MG_24H:B11,1.089820,4.439850
...,...,...
ASG002_U251MG_24H:O11,-0.030839,7.740378
ASG002_U251MG_24H:O12,-0.291152,5.423407
ASG002_U251MG_24H:P16,1.414242,2.783695
ASG002_U251MG_24H:P17,1.510780,9.892769


In [21]:
# transform to R global
R_global_DPD_df = DPD_df.T
display(R_global_DPD_df)

Unnamed: 0,ASG002_U251MG_24H:A10,ASG002_U251MG_24H:A11,ASG002_U251MG_24H:A12,ASG002_U251MG_24H:B10,ASG002_U251MG_24H:B11,ASG002_U251MG_24H:B12,ASG002_U251MG_24H:B13,ASG002_U251MG_24H:B14,ASG002_U251MG_24H:B15,ASG002_U251MG_24H:C13,...,ASG002_U251MG_24H:M24,ASG002_U251MG_24H:N22,ASG002_U251MG_24H:N23,ASG002_U251MG_24H:N24,ASG002_U251MG_24H:O10,ASG002_U251MG_24H:O11,ASG002_U251MG_24H:O12,ASG002_U251MG_24H:P16,ASG002_U251MG_24H:P17,ASG002_U251MG_24H:P18
GBM_onc,1.979015,0.221957,0.379708,0.234524,1.08982,0.639857,0.915347,-0.17355,0.077196,1.728283,...,-0.180166,2.531988,0.033434,0.178615,0.311146,-0.030839,-0.291152,1.414242,1.51078,0.04722
GBM_survival,-7.817739,14.055979,1.23061,-3.664586,4.43985,10.295292,-8.553705,1.156256,-8.301178,-4.424823,...,12.226618,5.378695,-14.370423,-10.126191,1.944904,7.740378,5.423407,2.783695,9.892769,3.311671


## Save outputs

In [22]:
# save metadata as pickle
all_metadata = {
    "modules": modules,
    "n_modules": n_modules,
    "drugs": drugs,
    "n_drugs": n_drugs,
    "exp_ids": exp_ids,
    "n_experiments": n_experiments,
    "genes": genes,
    "n_genes": n_genes,
}

print(all_metadata)

with open(os.path.join(out_dir, "metadata.pickle"), "wb") as f:
    pickle.dump(all_metadata, f, protocol = pickle.HIGHEST_PROTOCOL)

{'modules': ['CDK2', 'CDK4_6', 'PI3K', 'ERK', 'Estrogen', 'EGFR', 'Hypoxia', 'PDGFR', 'Aurora', 'p53'], 'n_modules': 10, 'drugs': ['vorinostat', 'raloxifene', 'dienestrol', 'erlotinib', 'selumetinib', 'taselisib', 'masitinib', 'nutlin-3', 'NVP-BEZ235', 'panobinostat', 'entinostat', 'roscovitine', 'tandutinib', 'vemurafenib', 'LY-294002', 'dabrafenib', 'tozasertib', 'estradiol-cypionate', 'palbociclib', 'imatinib', 'AMG-232', 'AZ-628', 'gefitinib', 'barasertib-HQPA', 'afatinib', 'RITA'], 'n_drugs': 26, 'exp_ids': ['ASG002_U251MG_24H:A10', 'ASG002_U251MG_24H:A11', 'ASG002_U251MG_24H:A12', 'ASG002_U251MG_24H:B10', 'ASG002_U251MG_24H:B11', 'ASG002_U251MG_24H:B12', 'ASG002_U251MG_24H:B13', 'ASG002_U251MG_24H:B14', 'ASG002_U251MG_24H:B15', 'ASG002_U251MG_24H:C13', 'ASG002_U251MG_24H:C14', 'ASG002_U251MG_24H:C15', 'ASG002_U251MG_24H:C16', 'ASG002_U251MG_24H:C17', 'ASG002_U251MG_24H:C18', 'ASG002_U251MG_24H:D10', 'ASG002_U251MG_24H:D11', 'ASG002_U251MG_24H:D12', 'ASG002_U251MG_24H:D22', 'ASG00

In [23]:
# save doses and perturbation matrix
inhib_conc_df.to_csv(os.path.join(out_dir, "inhib_conc_annotated.csv"))
ic50_df.to_csv(os.path.join(out_dir, "ic50_annotated.csv"))
# gamma_df.to_csv(os.path.join(out_dir, "gamma_annotated.csv"))
pert_df.to_csv(os.path.join(out_dir, "pert_annotated.csv"))

In [24]:
# save log fold change L1000 data
Data_norm_df.to_csv(os.path.join(out_dir, "L1000_Data_norm_data.csv"))

In [25]:
# save R_global for DPDs
R_global_DPD_df.to_csv(os.path.join(out_dir, "R_global_DPDonly_annotated.csv"))