# 01 Data preparation

Processing xlsx files from data folder into suitable inputs and generate other input files

In [36]:
import pandas as pd
import numpy as np
import os
import pickle

In [38]:
data_dir = "~/Phd_project/project_GBM/gbm_DATA/gbm_DATA_LINCS_GI1"
info_dir = data_dir

out_dir = "01_outputs_2020"

os.makedirs(out_dir, exist_ok = True)


## Modules

Load data about modules and drugs.

In [4]:
### DATA
### remove whitespaces in names (modules), remove duplicates

modules_df = pd.read_excel(
    os.path.join(info_dir, "ALL_DATA_2020_Jing_gbm.xlsx"), sheet_name = "modules", index_col = 0)
# display(modules_df)

selected_modules = modules_df.index.tolist()
print(len(selected_modules), ' - Size after reading')

# remove duplicates
selected_modules = modules_df.index.unique().tolist()
print(len(selected_modules), ' - Size after remove duplicates')

# remove whitespaces in modules' names 
selected_modules = [d.strip() for d in selected_modules]

print('Selected modules list: ', len(selected_modules), selected_modules)


11  - Size after reading
11  - Size after remove duplicates
Selected modules list:  11 ['CDK1', 'CDK2', 'CDK4_6', 'p53', 'EGFR', 'Aurora', 'Estrogen', 'PDGFR', 'Hypoxia', 'ERK', 'PI3K']


In [10]:
### DATA
### remove whitespaces in names (modules, drugs), remove duplicates
### check the dimensions of the indicator IC50 1 uM = 1000 nM
### copy-paste as values, numbers, no formulas

IC50_df = pd.read_excel(os.path.join(info_dir, "ALL_DATA_2020_Jing_gbm.xlsx"), sheet_name = "IC50s")
IC50_df.drop(columns=['Notes','Removed'],inplace=True)

print(len(IC50_df.index), ' - Size after reading')
# display(IC50_df)

# rename
IC50_df = IC50_df.rename(columns = {"IC50, uM": "IC50"})

# manually correcting value IC50, Example  for IOX2 -> 30 nM  
#IC50_df.loc[IC50_df.index == 'IOX2', IC50_df.columns == 'IC50'] = 30/1000

# remove non-selected modules, modules' names with whitespaces or empty
IC50_df = IC50_df[IC50_df.Module.isin(selected_modules)]

print(len(IC50_df.index), ' - Size after remove modules')
# display(IC50_df)

# remove duplicates 
# Considering certain columns is optional. 
# Indexes, including time indexes are ignored.
IC50_df = IC50_df.drop_duplicates()

print(len(IC50_df.index), ' - Size after remove duplicates')
display(IC50_df)

39  - Size after reading
39  - Size after remove modules
39  - Size after remove duplicates


Unnamed: 0,Drug,Module,IC50
0,JNJ-7706621,CDK1,0.027
1,PHA-793887,CDK1,0.18
2,roscovitine,CDK2,2.0
3,alvocidib,CDK4_6,0.12
4,palbociclib,CDK4_6,0.045
5,AS-605240,PI3K,0.1595
6,GDC-0349,PI3K,3.0
7,LY-294002,PI3K,1.36
8,NVP-BEZ235,PI3K,1.71
9,taselisib,PI3K,0.00262


In [11]:
modules = IC50_df.Module.unique().tolist()

print('IC50_df  modules list: ', len(modules), modules)
print()
print('Selected modules list: ', len(selected_modules), selected_modules)

### CHECK
print()
print('CHECK: ', len(selected_modules),'=?', len(modules))

n_modules = len(modules)


IC50_df  modules list:  11 ['CDK1', 'CDK2', 'CDK4_6', 'PI3K', 'ERK', 'Hypoxia', 'PDGFR', 'Aurora', 'Estrogen', 'EGFR', 'p53']

Selected modules list:  11 ['CDK1', 'CDK2', 'CDK4_6', 'p53', 'EGFR', 'Aurora', 'Estrogen', 'PDGFR', 'Hypoxia', 'ERK', 'PI3K']

CHECK:  11 =? 11


In [12]:
drugs = IC50_df.Drug.tolist()
print(len(drugs), ' - Size after reading')

# remove duplicates
drugs = IC50_df.Drug.unique().tolist()
print(len(drugs), ' - Size after remove duplicates')

# remove whitespaces in drugs' names (necessary for some)
drugs = [d.strip() for d in drugs]

# remove duplicates after remove whitespaces
drugs = list(set(drugs))
print(len(drugs), ' - Size after remove duplicates without whitespaces')

print('Drugs list: ', len(drugs), drugs)

n_drugs = len(drugs)

39  - Size after reading
39  - Size after remove duplicates
39  - Size after remove duplicates without whitespaces
Drugs list:  39 ['selumetinib', 'lapatinib', 'trametinib', 'NVP-BEZ235', 'SAR405838', 'gefitinib', 'BAY-87-2243', 'vemurafenib', 'CAY-10585', 'palbociclib', 'AS-605240', 'GDC-0349', 'tozasertib', 'nutlin-3', 'PI-103', 'HLI-373', 'taselisib', 'estradiol-cypionate', 'erlotinib', 'LY-294002', 'dabrafenib', 'JNJ-7706621', 'masitinib', 'PD-0325901', 'barasertib-HQPA', 'ponatinib', 'tandutinib', 'vandetanib', 'RITA', 'afatinib', 'roscovitine', 'PHA-793887', 'raloxifene', 'AMG-232', 'dienestrol', 'alvocidib', 'serdemetan', 'AZ-628', 'imatinib']


## L1000 meta data

Get sig_id for selected drugs.

In [14]:
sig_info_df = pd.read_excel(os.path.join(data_dir, "sig_info_2020_GI1.xlsx"), index_col = 0)

display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASG002_GI1_24H:A03,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A03,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_GI1_24H:A04,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A04,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_GI1_24H:A05,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A05,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_GI1_24H:A06,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A06,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
ASG002_GI1_24H:J13,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:J13,1,ctl_vehicle,DMSO,DMSO_No_target,0,0 uM,0.00
...,...,...,...,...,...,...,...,...,...,...,...
MOAR012_GI1_24H:P20,GI1,MOAR012,24 h,MOAR012_GI1_24H_X1_B36:P20,1,trt_cp,BAY-61-3606,,0,3.33 uM,3.33
MOAR012_GI1_24H:P21,GI1,MOAR012,24 h,MOAR012_GI1_24H_X1_B36:P21,1,trt_cp,BAY-61-3606,,0,1.11 uM,1.11
MOAR012_GI1_24H:P22,GI1,MOAR012,24 h,MOAR012_GI1_24H_X1_B36:P22,1,trt_cp,ethaverine,,0,10 uM,10.00
MOAR012_GI1_24H:P23,GI1,MOAR012,24 h,MOAR012_GI1_24H_X1_B36:P23,1,trt_cp,ethaverine,,0,3.33 uM,3.33


In [15]:
# now filtering so only the required drugs are present
sig_info_df = sig_info_df.loc[sig_info_df.pert_drug.isin(drugs)]

# here's what we have now
display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASG002_GI1_24H:A10,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A10,1,trt_cp,taselisib,PIK3CA,1,10 uM,10.00
ASG002_GI1_24H:A11,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A11,1,trt_cp,taselisib,PIK3CA,1,1.11 uM,1.11
ASG002_GI1_24H:A12,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A12,1,trt_cp,taselisib,PIK3CA,1,0.12 uM,0.12
ASG002_GI1_24H:A19,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A19,1,trt_cp,AS-605240,PIK3CG,1,10 uM,10.00
ASG002_GI1_24H:A20,GI1,ASG002,24 h,ASG002_GI1_24H_X1_B35:A20,1,trt_cp,AS-605240,PIK3CG,1,1.11 uM,1.11
...,...,...,...,...,...,...,...,...,...,...,...
MOAR010_GI1_24H:K08,GI1,MOAR010,24 h,MOAR010_GI1_24H_X1_B36:K08,1,trt_cp,SAR405838,MDM2,1,3.33 uM,3.33
MOAR010_GI1_24H:K09,GI1,MOAR010,24 h,MOAR010_GI1_24H_X1_B36:K09,1,trt_cp,SAR405838,MDM2,1,1.11 uM,1.11
MOAR011_GI1_24H:F07,GI1,MOAR011,24 h,MOAR011_GI1_24H_X1_B36:F07,1,trt_cp,serdemetan,MDM2,1,10 uM,10.00
MOAR011_GI1_24H:F08,GI1,MOAR011,24 h,MOAR011_GI1_24H_X1_B36:F08,1,trt_cp,serdemetan,MDM2,1,3.33 uM,3.33


Manually remove  few inhibition from data set, since it does differ from the other data points.

inhib_to_filter = "PF-03758309"
dose_to_filter = 10

id_to_filter = sig_info_df[
    np.logical_and(
        sig_info_df.pert_drug == inhib_to_filter,
        sig_info_df.dose_float == dose_to_filter,
    )
].index.values

print(id_to_filter)

sig_info_df = sig_info_df[~sig_info_df.index.isin(id_to_filter)]
display(sig_info_df)

inhib_to_filter = "roscovitine"
dose_to_filter = 3.33

id_to_filter = sig_info_df[
    np.logical_and(
        sig_info_df.pert_drug == inhib_to_filter,
        sig_info_df.dose_float == dose_to_filter,
    )
].index.values

print(id_to_filter)

sig_info_df = sig_info_df[~sig_info_df.index.isin(id_to_filter)]
display(sig_info_df)

In [16]:
exp_ids = sig_info_df.index.unique().tolist()
print('Experiments ids list: ', len(exp_ids))

n_experiments = len(exp_ids)

Experiments ids list:  119


Confirm data by checking the drugs of interest against the filtered L1000 meta data.

In [17]:
print(f"Number of drugs of interest:\t{len(drugs)}")
#print(f'Number of drugs in L1000 data:\t{len(sig_info_df.value_counts("drugs"))}')

#sig_info_df.value_counts("drugs")

Number of drugs of interest:	39


## L1000 data

In [18]:
Data_norm_df = pd.read_excel(os.path.join(data_dir, "Data_norm_2020_GI1.xlsx"), index_col = 0)
display(Data_norm_df)

Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
ASG002_GI1_24H:A03,0.014285,0.302692,0.124004,0.005246,0.340042,0.276465,0.127250,0.328650,0.627212,-0.220684,...,-0.102669,0.893785,0.505962,-0.357469,0.517504,0.255096,0.008023,-0.144581,-0.187158,-0.225612
ASG002_GI1_24H:A04,0.084985,0.276642,0.324054,-0.016954,0.079942,0.155465,-0.100200,0.345250,0.183511,-0.080484,...,-0.057569,-0.443416,0.094212,-0.303270,0.271754,-0.176304,-0.028377,0.017819,-0.187858,-0.157212
ASG002_GI1_24H:A05,-0.019615,0.183842,0.282254,-0.577354,0.316943,0.109765,0.117800,0.301150,-0.187188,0.275415,...,0.082731,-0.124365,0.141711,-0.221269,0.206753,0.215946,0.000723,0.130619,-0.080258,0.012289
ASG002_GI1_24H:A06,0.324884,0.146642,0.192004,0.043446,0.141842,-0.053335,-0.028000,-0.580550,0.622812,-0.080985,...,0.246530,0.644485,0.082212,-0.069869,0.142254,0.576146,0.092923,-0.041681,-0.253158,-0.221112
ASG002_GI1_24H:J13,0.182484,0.060892,-0.044696,-0.346554,-0.158858,-0.031434,0.068500,0.372950,-0.366439,-0.176334,...,-0.204769,-0.254016,0.310211,-0.177470,-0.026896,0.068946,0.070023,-0.011181,-0.366158,0.121888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MOAR012_GI1_24H:P20,0.844125,0.359950,0.014575,0.002151,0.108225,-0.232287,0.331775,0.190337,-0.644337,-0.320262,...,-0.223550,0.344850,-0.421962,-0.379925,0.154400,0.031300,-0.095100,-0.317588,0.103650,1.095600
MOAR012_GI1_24H:P21,0.346174,0.102800,-0.449275,-0.206150,0.500625,-0.383837,0.456875,0.125388,-0.490437,-0.240913,...,-0.271100,-0.497100,-0.194638,-0.082924,-0.450300,-0.088776,0.427825,-0.431062,-0.064025,1.196051
MOAR012_GI1_24H:P22,0.195774,0.285975,-0.675725,-0.243999,0.197526,0.238638,-0.013250,-0.236763,-0.761688,-0.048863,...,-0.190250,-0.331850,-0.556237,-0.161175,-0.274225,0.178549,0.110350,0.389612,0.043350,1.345251
MOAR012_GI1_24H:P23,0.093424,0.179075,-0.400900,-0.337299,0.084625,-0.015337,-0.073375,-0.276812,0.107612,0.054488,...,0.203775,-0.216850,-1.163637,-0.258825,-0.483600,0.067650,0.465775,0.345462,1.912200,1.386475


In [19]:
Data_norm_df = Data_norm_df[Data_norm_df.index.isin(exp_ids)]

# arrange experiments in same order as in list
Data_norm_df["sort_col"] = Data_norm_df.index.map({val: i for i, val in enumerate(exp_ids)})
Data_norm_df = Data_norm_df.sort_values("sort_col")
Data_norm_df = Data_norm_df.drop("sort_col", axis = 1)

# transpose
Data_norm_df = Data_norm_df.T

display(Data_norm_df)

Unnamed: 0,ASG002_GI1_24H:A10,ASG002_GI1_24H:A11,ASG002_GI1_24H:A12,ASG002_GI1_24H:A19,ASG002_GI1_24H:A20,ASG002_GI1_24H:A21,ASG002_GI1_24H:B10,ASG002_GI1_24H:B11,ASG002_GI1_24H:B12,ASG002_GI1_24H:B13,...,MOAR010_GI1_24H:J05,MOAR010_GI1_24H:J06,MOAR010_GI1_24H:J22,MOAR010_GI1_24H:J23,MOAR010_GI1_24H:J24,MOAR010_GI1_24H:K08,MOAR010_GI1_24H:K09,MOAR011_GI1_24H:F07,MOAR011_GI1_24H:F08,MOAR011_GI1_24H:F09
AARS,0.523185,0.938285,0.505484,0.081884,0.098285,-0.151566,0.710285,0.296184,0.228184,0.375385,...,-0.032493,0.168907,0.270007,0.061257,-0.217792,0.220807,0.085657,0.755200,0.644751,0.386250
ABCB6,0.118442,0.227142,0.304742,-0.382158,0.238542,0.155942,0.298642,-0.079358,-0.191258,-0.339458,...,-0.032753,0.717647,-0.064154,-0.282554,0.701746,-0.324954,-0.165654,0.156125,-0.260724,-0.083525
ABCC5,0.564554,0.440104,0.225354,0.310204,0.294754,0.167454,0.163604,0.064704,-0.205796,-0.087596,...,-0.141568,3.141582,0.046332,0.205632,-0.191818,-0.105018,-0.341468,-0.290800,-0.169550,0.507650
ABCF1,0.523446,-0.058954,-0.155854,-0.258454,-0.506854,-0.501254,0.398046,0.261746,-0.216254,0.271546,...,-0.846861,0.155640,-0.473761,0.470839,0.442039,-0.860960,0.148239,0.187650,0.160300,0.407700
ABCF3,-0.166158,0.369043,-0.040257,0.078542,0.186543,0.182892,0.087393,0.129543,0.064943,0.228443,...,0.048004,-0.268696,-0.489496,-0.467897,-0.008397,-0.144396,0.169904,-0.211125,-0.059576,-0.080724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF395,2.884646,1.014946,2.148046,0.932246,1.025796,1.076846,-0.055353,0.256146,1.335346,-0.203454,...,-0.168449,0.886350,0.140201,-0.628750,0.597250,0.437050,0.595250,2.233175,-0.372476,-0.242925
ZNF451,0.404223,0.016124,-0.002077,0.001373,-0.050777,-0.317177,0.194573,-0.095727,-0.037976,-0.088576,...,-0.026390,0.081061,0.124911,-0.100090,0.083011,0.018010,-0.145090,0.340900,-0.283650,0.061800
ZNF586,-0.524031,0.024019,-0.102681,0.016219,0.151019,-0.282081,-0.198481,0.010019,0.029219,-0.102681,...,0.118679,0.030179,-0.185021,-0.234871,-0.191371,-0.170221,0.122979,0.070525,-0.108375,0.071375
ZNF589,0.094642,-0.020358,0.035642,0.198992,-0.182708,-0.185608,-0.213058,-0.050708,-0.067658,-0.338058,...,-0.057418,0.099632,-0.262168,0.054632,-0.333418,0.193382,0.000232,0.172925,-0.146675,0.158125


In [20]:
genes = Data_norm_df.index.tolist()
print('Landmark genes list: ', len(genes), genes)

n_genes = len(genes)

Landmark genes list:  978 ['AARS', 'ABCB6', 'ABCC5', 'ABCF1', 'ABCF3', 'ABHD4', 'ABHD6', 'ABL1', 'ACAA1', 'ACAT2', 'ACBD3', 'ACD', 'ACLY', 'ACOT9', 'ADAM10', 'ADAT1', 'ADGRE5', 'ADGRG1', 'ADH5', 'ADI1', 'ADO', 'ADRB2', 'AGL', 'AKAP8', 'AKAP8L', 'AKR7A2', 'AKT1', 'ALAS1', 'ALDH7A1', 'ALDOA', 'ALDOC', 'AMDHD2', 'ANKRD10', 'ANO10', 'ANXA7', 'APBB2', 'APOE', 'APP', 'APPBP2', 'ARFIP2', 'ARHGAP1', 'ARHGEF12', 'ARHGEF2', 'ARID4B', 'ARID5B', 'ARL4C', 'ARNT2', 'ARPP19', 'ASAH1', 'ASCC3', 'ATF1', 'ATF5', 'ATF6', 'ATG3', 'ATMIN', 'ATP11B', 'ATP1B1', 'ATP2C1', 'ATP6V0B', 'ATP6V1D', 'AURKA', 'AURKB', 'AXIN1', 'B4GAT1', 'BACE2', 'BAD', 'BAG3', 'BAMBI', 'BAX', 'BCL2', 'BCL7B', 'BDH1', 'BECN1', 'BHLHE40', 'BID', 'BIRC2', 'BIRC5', 'BLCAP', 'BLMH', 'BLVRA', 'BMP4', 'BNIP3', 'BNIP3L', 'BPHL', 'BRCA1', 'BTK', 'BUB1B', 'BZW2', 'C2CD2', 'C2CD2L', 'C2CD5', 'C5', 'CAB39', 'CALM3', 'CALU', 'CAMSAP2', 'CANT1', 'CAPN1', 'CARMIL1', 'CASC3', 'CASK', 'CASP10', 'CASP2', 'CASP3', 'CASP7', 'CAST', 'CAT', 'CBLB', 'CBR1

## Inhibitor concentrations, IC50, and perturbation matrices

In [21]:
inhib_conc_matrix = np.zeros((n_modules, n_experiments))
ic50_matrix = np.ones((n_modules, n_experiments))
gamma_matrix = np.zeros((n_modules, n_experiments))

In [22]:

for i, module in enumerate(modules):
    drugs_for_module = IC50_df.Drug[IC50_df.Module == module].tolist()
    for drug in drugs_for_module:
        # get IC50 for this drug
        ic50 = IC50_df.IC50[IC50_df.Drug == drug].values
#       gamma = IC50_df.Gamma[IC50_df.Drug == drug].values
        print(drug, ic50)
        assert ic50.size == 1
#       assert gamma.size == 1
        # get experiments with this drug
        exp_with_drug = sig_info_df.index[sig_info_df.pert_drug == drug].tolist()
        print(exp_with_drug) 
        for exp_id in exp_with_drug:
            j = exp_ids.index(exp_id)
            print(j)
            # extract inhibitor concentration
            inhib_conc = sig_info_df.dose_float[sig_info_df.index == exp_id].values
            assert inhib_conc.size == 1
            # insert values in matrices
            inhib_conc_matrix[i, j] = inhib_conc.item()
            ic50_matrix[i, j] = ic50.item()
#           gamma_matrix[i, j] = gamma.item()


JNJ-7706621 [0.027]
['ASG002_GI1_24H:N13', 'ASG002_GI1_24H:N14', 'ASG002_GI1_24H:N15']
73
74
75
PHA-793887 [0.18]
['ASG002_GI1_24H:L01', 'ASG002_GI1_24H:L02', 'ASG002_GI1_24H:L03']
58
59
60
roscovitine [2.]
['ASG002_GI1_24H:E22', 'ASG002_GI1_24H:E23', 'ASG002_GI1_24H:E24']
22
23
24
alvocidib [0.12]
['ASG002_GI1_24H:F04', 'ASG002_GI1_24H:F05', 'ASG002_GI1_24H:F06']
28
29
30
palbociclib [0.045]
['ASG002_GI1_24H:P16', 'ASG002_GI1_24H:P17', 'ASG002_GI1_24H:P18']
90
91
92
AS-605240 [0.1595]
['ASG002_GI1_24H:A19', 'ASG002_GI1_24H:A20', 'ASG002_GI1_24H:A21']
3
4
5
GDC-0349 [3.]
['MOAR010_GI1_24H:D01', 'MOAR010_GI1_24H:D02', 'MOAR010_GI1_24H:D03']
105
106
107
LY-294002 [1.36]
['ASG002_GI1_24H:F01', 'ASG002_GI1_24H:F02', 'ASG002_GI1_24H:F03']
25
26
27
NVP-BEZ235 [1.71]
['ASG002_GI1_24H:C16', 'ASG002_GI1_24H:C17']
14
15
taselisib [0.00262]
['ASG002_GI1_24H:A10', 'ASG002_GI1_24H:A11', 'ASG002_GI1_24H:A12']
0
1
2
PI-103 [0.09]
['ASG002_GI1_24H:O22', 'ASG002_GI1_24H:O23', 'ASG002_GI1_24H:O24']
87
8

In [23]:
# transform matrices into pandas dfs for export with row and column names
inhib_conc_df = pd.DataFrame(inhib_conc_matrix, index = modules, columns = exp_ids)
ic50_df = pd.DataFrame(ic50_matrix, index = modules, columns = exp_ids)
# gamma_df = pd.DataFrame(gamma_matrix, index = modules, columns = exp_ids)

# create binary perturbation matrix
pert_df = pd.DataFrame(
    np.where(inhib_conc_matrix != 0, 1, 0),
    index = inhib_conc_df.index,
    columns = inhib_conc_df.columns,
)

In [24]:
display(ic50_df)
# display(gamma_df)
display(inhib_conc_df)
display(pert_df)

Unnamed: 0,ASG002_GI1_24H:A10,ASG002_GI1_24H:A11,ASG002_GI1_24H:A12,ASG002_GI1_24H:A19,ASG002_GI1_24H:A20,ASG002_GI1_24H:A21,ASG002_GI1_24H:B10,ASG002_GI1_24H:B11,ASG002_GI1_24H:B12,ASG002_GI1_24H:B13,...,MOAR010_GI1_24H:J05,MOAR010_GI1_24H:J06,MOAR010_GI1_24H:J22,MOAR010_GI1_24H:J23,MOAR010_GI1_24H:J24,MOAR010_GI1_24H:K08,MOAR010_GI1_24H:K09,MOAR011_GI1_24H:F07,MOAR011_GI1_24H:F08,MOAR011_GI1_24H:F09
CDK1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDK2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDK4_6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PI3K,0.00262,0.00262,0.00262,0.1595,0.1595,0.1595,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Hypoxia,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,13.2,13.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PDGFR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Aurora,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Estrogen,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
EGFR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,ASG002_GI1_24H:A10,ASG002_GI1_24H:A11,ASG002_GI1_24H:A12,ASG002_GI1_24H:A19,ASG002_GI1_24H:A20,ASG002_GI1_24H:A21,ASG002_GI1_24H:B10,ASG002_GI1_24H:B11,ASG002_GI1_24H:B12,ASG002_GI1_24H:B13,...,MOAR010_GI1_24H:J05,MOAR010_GI1_24H:J06,MOAR010_GI1_24H:J22,MOAR010_GI1_24H:J23,MOAR010_GI1_24H:J24,MOAR010_GI1_24H:K08,MOAR010_GI1_24H:K09,MOAR011_GI1_24H:F07,MOAR011_GI1_24H:F08,MOAR011_GI1_24H:F09
CDK1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CDK2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CDK4_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PI3K,10.0,1.11,0.12,10.0,1.11,0.12,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Hypoxia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.33,1.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PDGFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aurora,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Estrogen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EGFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,ASG002_GI1_24H:A10,ASG002_GI1_24H:A11,ASG002_GI1_24H:A12,ASG002_GI1_24H:A19,ASG002_GI1_24H:A20,ASG002_GI1_24H:A21,ASG002_GI1_24H:B10,ASG002_GI1_24H:B11,ASG002_GI1_24H:B12,ASG002_GI1_24H:B13,...,MOAR010_GI1_24H:J05,MOAR010_GI1_24H:J06,MOAR010_GI1_24H:J22,MOAR010_GI1_24H:J23,MOAR010_GI1_24H:J24,MOAR010_GI1_24H:K08,MOAR010_GI1_24H:K09,MOAR011_GI1_24H:F07,MOAR011_GI1_24H:F08,MOAR011_GI1_24H:F09
CDK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDK2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDK4_6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PI3K,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hypoxia,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
PDGFR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Aurora,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Estrogen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
EGFR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Global responses for DPD modules

According to our discussion, $R$ for DPD vectors can not be calculated with the same formula as for pathway activities. Instead we are assuming:

\begin{equation}
R_{DPD},j = DPD = STV_{DPD} \cdot Data_j
\end{equation}

In [27]:
# load STV data frame
STVs = pd.read_excel(os.path.join(info_dir, "ALL_DATA_2020_Jing_gbm.xlsx"), sheet_name = "STVs", index_col = 0)
STV_df = pd.DataFrame(np.zeros((len(Data_norm_df.index), 2)), index = Data_norm_df.index, columns = STVs.columns)
STV_df.loc[STVs.index] = STVs

display(STV_df)

Unnamed: 0,GBM_onc,GBM_survival
AARS,-0.018282,0.000000
ABCB6,0.000000,0.000000
ABCC5,-0.007873,0.000000
ABCF1,-0.011879,-2.215623
ABCF3,0.000000,0.000000
...,...,...
ZNF395,-0.033083,0.000000
ZNF451,0.002199,0.000000
ZNF586,-0.011584,0.000000
ZNF589,-0.012400,0.000000


In [28]:
# create empty DPD data frame
DPD_df = pd.DataFrame(
    np.zeros((len(Data_norm_df.columns), len(STV_df.columns))),
    index = Data_norm_df.columns,
    columns = STV_df.columns,
)

# populate
for exp_id in DPD_df.index:
    for state in STV_df.columns:
        DPD_df.loc[exp_id, state] = np.dot(Data_norm_df.T.loc[exp_id], STV_df.loc[:, state])

display(DPD_df)

Unnamed: 0,GBM_onc,GBM_survival
ASG002_GI1_24H:A10,1.363210,-4.750028
ASG002_GI1_24H:A11,-0.326099,-2.525002
ASG002_GI1_24H:A12,-0.908391,-1.395286
ASG002_GI1_24H:A19,0.750205,0.551730
ASG002_GI1_24H:A20,0.936970,1.213057
...,...,...
MOAR010_GI1_24H:K08,1.082215,0.640095
MOAR010_GI1_24H:K09,-0.076825,-4.025604
MOAR011_GI1_24H:F07,-1.366585,-2.966102
MOAR011_GI1_24H:F08,-0.001349,-5.883910


In [29]:
# transform to R global
R_global_DPD_df = DPD_df.T
display(R_global_DPD_df)

Unnamed: 0,ASG002_GI1_24H:A10,ASG002_GI1_24H:A11,ASG002_GI1_24H:A12,ASG002_GI1_24H:A19,ASG002_GI1_24H:A20,ASG002_GI1_24H:A21,ASG002_GI1_24H:B10,ASG002_GI1_24H:B11,ASG002_GI1_24H:B12,ASG002_GI1_24H:B13,...,MOAR010_GI1_24H:J05,MOAR010_GI1_24H:J06,MOAR010_GI1_24H:J22,MOAR010_GI1_24H:J23,MOAR010_GI1_24H:J24,MOAR010_GI1_24H:K08,MOAR010_GI1_24H:K09,MOAR011_GI1_24H:F07,MOAR011_GI1_24H:F08,MOAR011_GI1_24H:F09
GBM_onc,1.36321,-0.326099,-0.908391,0.750205,0.93697,0.590187,-0.063955,0.814936,-0.164436,-0.079985,...,-0.231234,-0.713071,-0.340106,-0.358414,0.337871,1.082215,-0.076825,-1.366585,-0.001349,-0.199769
GBM_survival,-4.750028,-2.525002,-1.395286,0.55173,1.213057,0.315073,-2.571901,-5.538101,-0.004049,-5.594971,...,-3.623248,0.118619,-2.316815,-5.99843,-5.552708,0.640095,-4.025604,-2.966102,-5.88391,-1.90015


## Save outputs

In [39]:
# save metadata as pickle
all_metadata = {
    "modules": modules,
    "n_modules": n_modules,
    "drugs": drugs,
    "n_drugs": n_drugs,
    "exp_ids": exp_ids,
    "n_experiments": n_experiments,
    "genes": genes,
    "n_genes": n_genes,
}

print(all_metadata)

with open(os.path.join(out_dir, "metadata.pickle"), "wb") as f:
    pickle.dump(all_metadata, f, protocol = pickle.HIGHEST_PROTOCOL)

{'modules': ['CDK1', 'CDK2', 'CDK4_6', 'PI3K', 'ERK', 'Hypoxia', 'PDGFR', 'Aurora', 'Estrogen', 'EGFR', 'p53'], 'n_modules': 11, 'drugs': ['selumetinib', 'lapatinib', 'trametinib', 'NVP-BEZ235', 'SAR405838', 'gefitinib', 'BAY-87-2243', 'vemurafenib', 'CAY-10585', 'palbociclib', 'AS-605240', 'GDC-0349', 'tozasertib', 'nutlin-3', 'PI-103', 'HLI-373', 'taselisib', 'estradiol-cypionate', 'erlotinib', 'LY-294002', 'dabrafenib', 'JNJ-7706621', 'masitinib', 'PD-0325901', 'barasertib-HQPA', 'ponatinib', 'tandutinib', 'vandetanib', 'RITA', 'afatinib', 'roscovitine', 'PHA-793887', 'raloxifene', 'AMG-232', 'dienestrol', 'alvocidib', 'serdemetan', 'AZ-628', 'imatinib'], 'n_drugs': 39, 'exp_ids': ['ASG002_GI1_24H:A10', 'ASG002_GI1_24H:A11', 'ASG002_GI1_24H:A12', 'ASG002_GI1_24H:A19', 'ASG002_GI1_24H:A20', 'ASG002_GI1_24H:A21', 'ASG002_GI1_24H:B10', 'ASG002_GI1_24H:B11', 'ASG002_GI1_24H:B12', 'ASG002_GI1_24H:B13', 'ASG002_GI1_24H:B15', 'ASG002_GI1_24H:C13', 'ASG002_GI1_24H:C14', 'ASG002_GI1_24H:C15'

In [40]:
# save doses and perturbation matrix
inhib_conc_df.to_csv(os.path.join(out_dir, "inhib_conc_annotated.csv"))
ic50_df.to_csv(os.path.join(out_dir, "ic50_annotated.csv"))
# gamma_df.to_csv(os.path.join(out_dir, "gamma_annotated.csv"))
pert_df.to_csv(os.path.join(out_dir, "pert_annotated.csv"))

In [42]:
# save log fold change L1000 data
Data_norm_df.to_csv(os.path.join(out_dir, "L1000_Data_norm_data.csv"))

In [43]:
# save R_global for DPDs
R_global_DPD_df.to_csv(os.path.join(out_dir, "R_global_DPDonly_annotated.csv"))