# Exploration and checking things out

Imports and setup

In [1]:
import pandas as pd
import numpy as np
import time
import pickle
import dask.dataframe as dd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import time
import os
import pickle
import random
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import sys
# Sklearn imports
from scipy.stats import pearsonr
from sklearn.linear_model import ElasticNet
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn import feature_selection
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

% matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# Directory put at the front of sys.path so that gdsc_projects_module can be
# imported (presumably the module lives in this folder — machine-specific path).
created_modules_dir = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Projects/Created Modules"
sys.path.insert(0, created_modules_dir)

from gdsc_projects_module import DrugWithDrugBank

### Load data

In [3]:
# --- Input locations (absolute paths on the author's machine) ---------------
# Original GDSC files
drug_annotations = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Drug annotations/Screened_Compounds-March_27th_2018.xlsx"
cell_line_list = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Cell line list (directly from website)/Cell_listThu Aug 16 22_06_49 2018.csv"
gene_expr = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Gene expression/sanger1018_brainarray_ensemblgene_rma-March_2nd_2017.txt"
cnv1 = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Copy number variations/cnv_binary_1.csv"
cnv2 = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Copy number variations/PANCANCER_Genetic_feature_cna_Mon Aug  6 16_18_51 2018 (kopia).csv"
coding_variants = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Mutation calls/PANCANCER_Genetic_feature_variant_Mon Aug  6 15_45_44 2018.csv"
drug_response = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Sensitivity profiles/v17.3_fitted_dose_response-March_27th_2018.xlsx"

# Pickled dictionary with drug targets derived from DrugBank
drugbank_targets = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Original Data/DrugBank/Created data/drugbank_map_drug_to_targets.p"

# Gene expression signature scores provided by Merck (tab-separated)
signatures = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Data/Created data/Merck Gene Expression Signatures/Data/SignatureScores_GDSC-cellLines_2018-09-27.tsv"

# --- Load everything ---------------------------------------------------------
# A single call to the DrugWithDrugBank loader returns all GDSC tables plus the
# DrugBank target mapping; the result is unpacked into the notebook-level names.
loaded_tables = DrugWithDrugBank.load_data(
    drug_annotations, cell_line_list, gene_expr,
    cnv1, cnv2, coding_variants, drug_response, drugbank_targets)
(drug_annotations_df, cell_lines_list_df, gene_expression_df, cnv_binary_df,
 coding_variants_df, drug_response_df, map_drugs_to_drugbank_targets) = loaded_tables

# Signature scores are read directly with pandas (tab is the column separator)
signatures_df = pd.read_csv(signatures, sep="\t")

# Gene ID mappings; load_mappings initializes class variables on DrugWithDrugBank
filepath1 = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Projects/GDSC - Prediction only with data related to nominal drug targets (minimal approach)/Created data/mapping_from_ensembl_id_to_hgnc_symbol.p"
filepath2 = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Projects/GDSC - Prediction only with data related to nominal drug targets (minimal approach)/Created data/mapping_from_hgnc_symbol_to_ensembl_id.p"
# NOTE(review): the HGNC->Ensembl file is passed first and the Ensembl->HGNC
# file second — confirm this matches the parameter order of load_mappings.
DrugWithDrugBank.load_mappings(filepath2, filepath1)

# --- Report what was loaded --------------------------------------------------
print("Loading summary:")
summary_rows = [
    ("Drug annotations:", drug_annotations_df.shape),
    ("Cell line list", cell_lines_list_df.shape),
    ("Gene expression", gene_expression_df.shape),
    ("CNV binary:", cnv_binary_df.shape),
    ("Coding variants:", coding_variants_df.shape),
    ("Drug response:", drug_response_df.shape),
    ("DrugBank mapping (number of matched drugs):", len(map_drugs_to_drugbank_targets)),
    ("Gene expression signatures:", signatures_df.shape),
]
for label, value in summary_rows:
    print(label, value)

Loading summary:
Drug annotations: (267, 5)
Cell line list (1065, 6)
Gene expression (17737, 1019)
CNV binary: (419050, 9)
Coding variants: (295740, 9)
Drug response: (224202, 13)
DrugBank mapping (number of matched drugs): 88
Gene expression signatures: (128, 1018)


Initialize dictionary with DrugWithDrugBank objects

In [4]:
# Build the collection of DrugWithDrugBank objects from the annotation table
# and the DrugBank target mapping, then report how many were created.
drugs = DrugWithDrugBank.create_drugs(
    drug_annotations_df,
    map_drugs_to_drugbank_targets,
)
drug_count = len(drugs)
print(drug_count)

267


Check out the methods for extracting signature data

In [5]:
# Preview the first three rows of the signature table to see its layout
# (in the recorded output the signature names sit in the "Unnamed: 0" column
# and the remaining columns are named X<number>).
signatures_df.head(3)

Unnamed: 0,X906826,X687983,X910927,X1240138,X1240139,X906792,X910688,X1240135,X1290812,X907045,...,X753584,X907044,X998184,X908145,X1659787,X1298157,X1480372,X1298533,X930299,X905954.1
Kannengiesser.BRAF.signature,0.005223,-0.407311,-0.008919,-0.047516,0.010615,1.284609,-0.43755,-0.036142,-0.025652,-0.110034,...,0.262133,-0.454797,-0.34868,0.688249,0.341071,-0.525661,0.119043,-0.439609,0.190384,1.333843
IFN_signature,-0.374431,-0.31775,-0.58118,0.124181,-0.446854,-0.293723,1.272982,0.146643,-0.429662,-0.68883,...,-0.08782,0.862839,-0.300738,0.28728,-0.379386,-0.150595,-0.265985,1.296345,-0.409257,-0.198096
KinetochoreNet,0.166935,0.22106,0.019868,-0.459931,0.069661,-0.133753,-0.959513,-0.465274,-0.540824,-0.35974,...,0.102691,0.467639,0.503359,-0.461401,0.093911,0.064927,0.054159,-0.467053,0.060052,-0.429826


In [6]:
# Pick one drug object to experiment with; key 11 is Paclitaxel according to
# the repr recorded in the next cell's output.
drug = drugs[11]

In [7]:
# Display the selected drug's repr to confirm which drug is being used
drug

Drug(11, "Paclitaxel", ['TUBB1', 'BCL2', 'NR1I2', 'MAP4', 'Microtubule stabiliser', 'MAP2', 'MAPT'], "Mitosis")

In [8]:
# Data types to combine for this drug: tissue information plus Merck signatures
data_types = ["tissue", "merck signatures"]

# Build the combined table for the selected drug from the response data,
# the cell line list and the signature scores.
df = drug.create_full_data(
    drug_response_df,
    cell_line_list=cell_lines_list_df,
    merck_signatures_df=signatures_df,
    data_combination=data_types,
)

In [9]:
# Compare the shape of the returned frame with the one stored on the drug object.
# NOTE(review): the recorded output shows a single tuple although two shapes are
# printed here — the saved output looks stale; re-run the cell to confirm.
print(df.shape, drug.full_data.shape)

(399, 143)

In [None]:
# Shapes of the signature table, the tissue table and the combined table stored
# on the drug object (presumably populated by create_full_data — confirm).
# This cell was not executed in the saved notebook.
print(drug.merck_signatures.shape, drug.tissue_data.shape, drug.full_data.shape)

In [11]:
# Same data type selection as for create_full_data, now passed to return_full_data
data_types = ["tissue", "merck signatures"]

# Request the combined table for the selected drug via return_full_data,
# using the same inputs as the create_full_data call.
df = drug.return_full_data(
    drug_response_df,
    cell_line_list=cell_lines_list_df,
    merck_signatures_df=signatures_df,
    data_combination=data_types,
)

(399, 129)

In [None]:
# Shape of the frame currently bound to df (the result of the return_full_data call)
df.shape

In [None]:
# Shapes of the signature table, the tissue table and the combined table stored
# on the drug object, checked again after the return_full_data call.
# This cell was not executed in the saved notebook.
print(drug.merck_signatures.shape, drug.tissue_data.shape, drug.full_data.shape)

In [None]:
# Print the three tables stored on the drug object in a single print call.
# NOTE(review): these are printed as plain text; if they are large DataFrames,
# displaying only .head() of each would keep the output readable — confirm types.
tables_to_inspect = (drug.merck_signatures, drug.tissue_data, drug.full_data)
print(*tables_to_inspect)