# Perform machine learning experiments with different settings
Evaluate predictive performance using features related to each drug's nominal targets. The relevant target genes are taken directly from GDSC as well as from DrugBank.

Imports and setup

In [1]:
# General imports
import pandas as pd
import numpy as np
import time
import random
import warnings
import sys
import collections
import dill
# Sklearn imports
from sklearn.base import clone
from scipy.stats import pearsonr
from sklearn.linear_model import ElasticNet
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn import feature_selection
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings('ignore')

Add directory to sys.path in order to import custom modules from there.

In [2]:
# Directory holding the project's custom modules; it is placed at the front of
# sys.path so that it takes precedence when resolving the import below.
custom_modules_dir = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/Projects/Created Modules"
sys.path[:0] = [custom_modules_dir]
# Bring in the project-specific helpers used throughout the notebook
from gdsc_projects_module import DrugWithDrugBank, Experiment

### Load data

In [3]:
# Initialize proper file pathways
# All inputs live under one project root; change it here when the data is moved.
project_root = "/media/krzysztof/Nowy/Doktorat - Modelling drug efficacy in cancer/"
# Directory with the original GDSC downloads
gdsc_data_dir = project_root + "Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/"
# Directory with the gene identifier mappings created for this project
gene_mappings_dir = (project_root + "Projects/GDSC - Prediction only with data related to nominal drug targets "
                     "(minimal approach)/Created data/")

drug_annotations = gdsc_data_dir + "Drug annotations/Screened_Compounds-March_27th_2018.xlsx"
cell_line_list = gdsc_data_dir + "Cell line list (directly from website)/Cell_listThu Aug 16 22_06_49 2018.csv"
gene_expr = gdsc_data_dir + "Gene expression/sanger1018_brainarray_ensemblgene_rma-March_2nd_2017.txt"
cnv1 = gdsc_data_dir + "Copy number variations/cnv_binary_1.csv"
cnv2 = gdsc_data_dir + "Copy number variations/PANCANCER_Genetic_feature_cna_Mon Aug  6 16_18_51 2018 (kopia).csv"
coding_variants = gdsc_data_dir + "Mutation calls/PANCANCER_Genetic_feature_variant_Mon Aug  6 15_45_44 2018.csv"
drug_response = gdsc_data_dir + "Sensitivity profiles/v17.3_fitted_dose_response-March_27th_2018.xlsx"

# Load dictionary with targets derived from DrugBank
drugbank_targets = project_root + "Data/Original Data/DrugBank/Created data/drugbank_map_drug_to_targets.p"

# Filepath to gene expression signatures provided by Merck
signatures = (project_root + "Data/Created data/Merck Gene Expression Signatures/Data/"
              "SignatureScores_GDSC-cellLines_2018-09-27.tsv")

# Call loading function from DrugWithDrugBank class
(drug_annotations_df, cell_lines_list_df, gene_expression_df, cnv_binary_df, 
 coding_variants_df, drug_response_df, map_drugs_to_drugbank_targets) = DrugWithDrugBank.load_data(
    drug_annotations, cell_line_list, gene_expr, 
    cnv1, cnv2, coding_variants, drug_response, drugbank_targets)

# Load gene expression signatures
signatures_df = pd.read_table(signatures)

# Load helper dict for extraction of CNV data
# NOTE: dill/pickle files can execute arbitrary code when loaded; only load files
# that were created locally by a trusted process.
filepath = gdsc_data_dir + "Copy number variations/Created data/"
with open(filepath + "map_cl_id_and_genetic_feature_to_mutation_status.pkl", "rb") as f:
    map_from_cl_id_and_genetic_feature_to_mutation_status = dill.load(f)

# Load gene mappings
filepath1 = gene_mappings_dir + "mapping_from_ensembl_id_to_hgnc_symbol.p"
filepath2 = gene_mappings_dir + "mapping_from_hgnc_symbol_to_ensembl_id.p"
# Initialize class variables. The HGNC->Ensembl file (filepath2) is passed first,
# presumably matching the parameter order of load_mappings -- verify against the module.
DrugWithDrugBank.load_mappings(filepath2, filepath1)

# Print shapes of created DataFrames
print("Loading summary:")
print("Drug annotations:", drug_annotations_df.shape)
print("Cell line list", cell_lines_list_df.shape)
print("Gene expression", gene_expression_df.shape)
print("CNV binary:", cnv_binary_df.shape)
print("Coding variants:", coding_variants_df.shape)
print("Drug response:", drug_response_df.shape)
print("DrugBank mapping (number of matched drugs):", len(map_drugs_to_drugbank_targets))
print("Gene expression signatures:", signatures_df.shape)
print("Number of entries in mapping from cell line and cnv genetic feature to mutation status:",
     len(map_from_cl_id_and_genetic_feature_to_mutation_status))

Loading summary:
Drug annotations: (267, 5)
Cell line list (1065, 6)
Gene expression (17737, 1019)
CNV binary: (419050, 9)
Coding variants: (295740, 9)
Drug response: (224202, 13)
DrugBank mapping (number of matched drugs): 88
Gene expression signatures: (128, 1018)
Number of entries in mapping from cell line and cnv genetic feature to mutation status: 419050


# Machine learning section
Check predictive performance with different experimental settings.

### Elastic Net with Z-score normalization for all features and exhaustive grid search for hyperparameter tuning - all data classes (expression, mutation, CNV, tissue) but without Merck gene expression signatures

Initialize dictionary with DrugWithDrugBank objects and compute input data for all drugs.

In [4]:
# Instantiate the drug objects from the annotations and the DrugBank target mapping
drugs = DrugWithDrugBank.create_drugs(drug_annotations_df, map_drugs_to_drugbank_targets)
print(len(drugs))

# Data classes that should make up the input of every drug
data_types = ["CNV", "mutation", "expression", "tissue"]

# Source tables and options handed over to the input-building routine
input_sources = dict(
    gene_expression_df=gene_expression_df,
    cnv_binary_df=cnv_binary_df,
    map_cl_id_and_feature_to_status=map_from_cl_id_and_genetic_feature_to_mutation_status,
    cell_lines_list_df=cell_lines_list_df,
    coding_variants_df=coding_variants_df,
    feat_threshold=16,
    log=True,
)

# Compute the input data for each drug
Experiment.create_input_for_each_drug(drugs, drug_response_df,
                                      data_combination=data_types, **input_sources)

267
10 drugs done
20 drugs done
30 drugs done
40 drugs done
50 drugs done
60 drugs done
70 drugs done
80 drugs done
90 drugs done
100 drugs done
110 drugs done
120 drugs done
130 drugs done
140 drugs done
150 drugs done
160 drugs done
170 drugs done
180 drugs done
190 drugs done
200 drugs done
210 drugs done
220 drugs done
230 drugs done
240 drugs done
250 drugs done
260 drugs done
Number of drugs with number of features bigger than 16: 184
Mean number of features in 267 drugs: 3.7265917602996255


In [5]:
#############################################################################################################
# Set up constants, feature normalization, predictive algorithm and initialize experiment instance
#############################################################################################################

log = True   # Print per-drug progress and scores
counter = 0   # Number of drugs modelled so far
sensitivity_metric = "AUC"   # Column of the input data used as the response variable
record_weights = True   # Does the model have coefficients we want to record?
n_folds = 10   # Number of folds in parameter tuning cross-validation
n_combinations = 20   # Number of parameter combinations to try (when using randomized search)
test_size = 0.3    # Fraction of data spent on test set
min_n_columns = 16   # Input data with fewer columns than this is skipped (see the check in the loop)

scaler = preprocessing.StandardScaler()   # Setup transformer for feature scaling

estimator = ElasticNet(max_iter=2000)   # Setup algorithm to use

# Hyperparameter space to search on
param_grid = {"estimator__alpha": [0.001, 0.01, 0.1, 1., 5., 10., 30., 50., 100., 300.],
                 "estimator__l1_ratio": [0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.]}

# Setup seeds for data split
split_seed = 37

# Function to be optimized during hyperparameter tuning
scoring = "neg_mean_squared_error"   # Function to be optimized by hyperparameters

# Record types for the per-drug results. They are loop-invariant, so they are
# defined once here instead of being re-created for every drug.
StandardDeviations = collections.namedtuple("StandardDeviations", ["validation", "test", "overall"])
ModelTestScores = collections.namedtuple("ModelTestScores", ["cv_best_score", "test_RMSE",
                                                "test_explained_variance", "test_correlation"])
BestModelWeights = collections.namedtuple("BestModelWeights", ["intercept", "weights_array"])
ModelTrainingScores = collections.namedtuple("ModelTrainingScores", ["training_RMSE", "training_correlation"])
DummyScores = collections.namedtuple("DummyScores", ["cv_RMSE", "test_RMSE", "test_explained_variance",
                                        "test_correlation"])

# Initialize Experiment object for this assay
exp = Experiment(name="Only targets-Elastic Net with Z-score normalization",
                algorithm="ElasticNet",
                parameter_search_type="GridSearch",
                data_normalization_type="StandardScaler on all features",
                split_seed=split_seed,
                kfolds=n_folds)

#############################################################################################################
# Enter the loop over drugs
#############################################################################################################

for ide in drugs:
    drug = drugs[ide]   # Current Drug object
    data = drug.full_data  # Extract input data (should be previously computed)
    if data.shape[0] == 0:   # Check if data exists, if not, skip the drug
        continue
    if data.shape[1] < min_n_columns:    # That means that data has only features related to tissue
        continue                         # so also skip this case

    # Key under which all results of this drug are stored in the Experiment's dictionaries
    drug_key = (drug.name, ide)

    # Add input data to corresponding Experiment objects's field (dictionary)
    exp.input_data[drug_key] = data
    if log:
        print(drug.name, data.shape)

    # Add data shapes to corresponding field in Experiment (dictionary)
    exp.data_shapes[drug_key] = data.shape

    y = data[sensitivity_metric]   # Extract response variable
    X = data.drop(["cell_line_id", sensitivity_metric], axis=1)     # Drop targets and cell line IDs
    assert X.shape[0] == y.shape[0]   # Sanity check
    assert X.shape[1] < data.shape[1]

    # Split data into training and test set.
    # NOTE: X_val / y_val hold the training portion, i.e. the data used for
    # cross-validated tuning and for the final refit; X_test / y_test are held out.
    X_val, X_test, y_val, y_test = model_selection.train_test_split(X, y, test_size=test_size,
                                                                   random_state=split_seed)

    # Standard deviations of the response in the training part, the test part and overall
    stds = StandardDeviations(y_val.std(), y_test.std(), y.std())
    # Record in corresponding field (dictionary)
    exp.data_stds[drug_key] = stds

    # Set up and fit Dummy Regressor
    dummy = DummyRegressor()
    dummy.fit(X_val, y_val)
    # Get dummy predictions on the test set
    dummy_preds = dummy.predict(X_test)
    # Baseline dummy performance: per-fold RMSE derived from the negated-MSE scores
    baseline_scores = model_selection.cross_val_score(dummy, X_val, y_val, scoring=scoring, cv=n_folds)
    baseline_rmses = [(-x)**0.5 for x in baseline_scores]
    baseline_mean = np.mean(baseline_rmses)
    baseline_std = np.std(baseline_rmses)

    # Set elements of the pipeline, i.e. scaler and estimator
    transformer = clone(scaler)
    predictor = clone(estimator)

    # Create pipeline
    main_pipeline = Pipeline([
        ("scaler", transformer),
        ("estimator", predictor)
    ])

    # Set up grid search with cross-validation
    grid = model_selection.GridSearchCV(main_pipeline, param_grid=param_grid,
                                       scoring=scoring, cv=n_folds)

    # Fit the grid
    grid.fit(X_val, y_val)
    best = grid.best_estimator_   # Best model
    pred = grid.predict(X_test)   # Predict on test data with refitted best model
    training_preds = grid.predict(X_val)    # Predict on train data with refitted best model

    # Compute every performance figure once; the same values are logged and recorded below
    training_rmse = metrics.mean_squared_error(y_val, training_preds) ** 0.5
    training_correlation = pearsonr(y_val, training_preds)
    test_rmse = metrics.mean_squared_error(y_test, pred) ** 0.5
    test_correlation = pearsonr(y_test, pred)
    dummy_test_rmse = metrics.mean_squared_error(y_test, dummy_preds) ** 0.5

    if log:   # Report the results
        # Summarize grid search (best_score_ is a negated MSE, hence the sign flip and root)
        print("Best CV score (RMSE):", (-grid.best_score_)**0.5)
        print("Best parameters:", grid.best_params_)
        print("")

        # Print out performance of the best model on training set
        print("RMSE on training data:", training_rmse)
        print("Correlation with the training set:", training_correlation)
        print("")

        # Print out the performance of the best model on test set
        print("RMSE on test data:", test_rmse)
        print("Mean absolute error on test data:", metrics.mean_absolute_error(y_test, pred))
        print("Correlation with the test set:", test_correlation)
        print("")

        # Print out performance of dummy regressor
        print("Dummy CV RMSE:", baseline_mean)
        print("Dummy test RMSE:", dummy_test_rmse)

    # Record results in corresponding Experiment fields, mostly as named tuples
    # Regression performance of the tuned model.
    # NOTE: cv_best_score is stored exactly as returned by the grid search, i.e. as a
    # negated MSE under this scoring, whereas the log above prints its RMSE equivalent.
    model_test_scores = ModelTestScores(grid.best_score_, test_rmse,
                                       metrics.explained_variance_score(y_test, pred), test_correlation)
    # Record in appropriate field (dictionary)
    exp.best_scores[drug_key] = model_test_scores

    # Full results of cross-validation tuning
    exp.cv_results[drug_key] = grid.cv_results_

    # Parameters of best found model
    exp.best_parameters[drug_key] = grid.best_params_

    # Coefficients (if model has them)
    if record_weights:
        fitted_estimator = best.named_steps["estimator"]
        best_model_weights = BestModelWeights(fitted_estimator.intercept_, fitted_estimator.coef_)
        # Record in appropriate field (dictionary)
        exp.coefficients[drug_key] = best_model_weights

    # Performance on the training set
    model_training_scores = ModelTrainingScores(training_rmse, training_correlation)
    # Record in appropriate field (dictionary)
    exp.training_scores[drug_key] = model_training_scores

    # Performance of dummy model.
    # NOTE(review): with the default DummyRegressor the predictions are presumably constant,
    # in which case the Pearson correlation recorded here is undefined (nan) -- confirm.
    dummy_performance = DummyScores(baseline_mean, dummy_test_rmse,
                                   metrics.explained_variance_score(y_test, dummy_preds),
                                   pearsonr(y_test, dummy_preds))
    # Record in appropriate field (dictionary)
    exp.dummy_scores[drug_key] = dummy_performance

    # Increment counter and display summary information
    counter += 1
    if log:
        print(counter, "drugs done")
        print("*" * 50)
        print("")

Erlotinib (347, 19)
Best CV score (RMSE): 0.08331016504245249
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.0787708250192746
Correlation with the training set: (0.47371132315354386, 6.112863286005931e-15)

RMSE on test data: 0.06138808687115606
Mean absolute error on test data: 0.038037574184868125
Correlation with the test set: (0.5407944601565197, 2.5951217528987297e-09)

Dummy CV RMSE: 0.07699414121333494
Dummy test RMSE: 0.07096557172657372
1 drugs done
**************************************************

Rapamycin (362, 16)
Best CV score (RMSE): 0.22729242064038202
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.21878975061719969
Correlation with the training set: (0.3141587751006189, 3.3605672623170146e-07)

RMSE on test data: 0.23751598294189738
Mean absolute error on test data: 0.17220821311597512
Correlation with the test set: (0.09262748416106704, 0.3380770172476105)

Dummy CV R

Best CV score (RMSE): 0.23255874215332678
Best parameters: {'estimator__alpha': 0.1, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.20416508923251306
Correlation with the training set: (0.6255486057886255, 2.0006320349836017e-29)

RMSE on test data: 0.244766174899222
Mean absolute error on test data: 0.1838633845773216
Correlation with the test set: (0.40599436395149074, 9.826982220600028e-06)

Dummy CV RMSE: 0.25989510330242505
Dummy test RMSE: 0.26432504424334047
16 drugs done
**************************************************

GNF-2 (370, 16)
Best CV score (RMSE): 0.03944734518947572
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.035918321285195384
Correlation with the training set: (0.584761675838488, 3.7410296900847344e-25)

RMSE on test data: 0.05038448916361781
Mean absolute error on test data: 0.022247017812979015
Correlation with the test set: (0.6059096273996679, 1.841153051745101e-12)

Dummy CV RMSE: 0.03673795070066

Best CV score (RMSE): 0.04438216599524135
Best parameters: {'estimator__alpha': 5.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.04373898630906445
Correlation with the training set: (0.294864567163117, 8.108104682226973e-07)

RMSE on test data: 0.04474615198392261
Mean absolute error on test data: 0.03474905189144844
Correlation with the test set: (0.13984012836357376, 0.13434633828580245)

Dummy CV RMSE: 0.044147600662650645
Dummy test RMSE: 0.045038342399505944
31 drugs done
**************************************************

Salubrinal (398, 16)
Best CV score (RMSE): 0.06309324556312434
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.5}

RMSE on training data: 0.06213746921355861
Correlation with the training set: (0.29047215385221103, 8.308505911062668e-07)

RMSE on test data: 0.052254250564227264
Mean absolute error on test data: 0.04366206814143524
Correlation with the test set: (0.167224472600486, 0.06791723497773737)

Dummy CV RMSE: 0.064215250067

Best CV score (RMSE): 0.16395859783432432
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.16123198762931554
Correlation with the training set: (0.27071610166516746, 1.0073347155399685e-11)

RMSE on test data: 0.15827428429088608
Mean absolute error on test data: 0.1277783889965341
Correlation with the test set: (0.21314114139293033, 0.0005010791283024402)

Dummy CV RMSE: 0.16698174502464552
Dummy test RMSE: 0.1619104537464577
46 drugs done
**************************************************

AKT inhibitor VIII (823, 21)
Best CV score (RMSE): 0.055443662083053194
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.054273114897791755
Correlation with the training set: (0.34921771890645703, 5.801301794639626e-18)

RMSE on test data: 0.04897466626721131
Mean absolute error on test data: 0.03627396476281454
Correlation with the test set: (0.305907497363247, 9.519043397445755e-07)

Dummy CV RMSE:

Best CV score (RMSE): 0.15689792006051462
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.5}

RMSE on training data: 0.15532402026218553
Correlation with the training set: (0.38079798623566474, 1.7442763848611542e-22)

RMSE on test data: 0.1553439237159153
Mean absolute error on test data: 0.11940924092127692
Correlation with the test set: (0.3358561428039123, 2.5034421372713134e-08)

Dummy CV RMSE: 0.16764569637815901
Dummy test RMSE: 0.1650067198534384
61 drugs done
**************************************************

Epothilone B (848, 29)
Best CV score (RMSE): 0.21324209923828247
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.2084487589525925
Correlation with the training set: (0.29165961663642426, 4.308306775677904e-13)

RMSE on test data: 0.21010067860486456
Mean absolute error on test data: 0.17878608992766173
Correlation with the test set: (0.19587130548393888, 0.0016725640706163128)

Dummy CV RMSE: 0.2159339

Best CV score (RMSE): 0.04634746207239535
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.04446048118924148
Correlation with the training set: (0.5637565433204812, 5.789722889353137e-52)

RMSE on test data: 0.04994821610422197
Mean absolute error on test data: 0.023078554763091584
Correlation with the test set: (0.3326551107059051, 4.142277520137907e-08)

Dummy CV RMSE: 0.04788330134872616
Dummy test RMSE: 0.05271661695848376
76 drugs done
**************************************************

JW-7-24-1 (922, 16)
Best CV score (RMSE): 0.140481524169621
Best parameters: {'estimator__alpha': 0.1, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.13692514679789963
Correlation with the training set: (0.5466053556746762, 1.5888980860911983e-51)

RMSE on test data: 0.1326888308733486
Mean absolute error on test data: 0.10229961638830326
Correlation with the test set: (0.5488368066530132, 3.431847727304178e-23)

Dummy CV RMSE: 0.163175895588

Best CV score (RMSE): 0.161875705573739
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.1579321563879567
Correlation with the training set: (0.40075324133683643, 1.1427242090833793e-24)

RMSE on test data: 0.15451392629653685
Mean absolute error on test data: 0.10545058507133337
Correlation with the test set: (0.36703031155715116, 1.1139552770536007e-09)

Dummy CV RMSE: 0.1684128697353152
Dummy test RMSE: 0.16590057570519276
91 drugs done
**************************************************

Omipalisib (922, 16)
Best CV score (RMSE): 0.19640611705321814
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 1.0}

RMSE on training data: 0.19431584085234638
Correlation with the training set: (0.42174204856978936, 3.3384796655472895e-29)

RMSE on test data: 0.20104079118567234
Mean absolute error on test data: 0.1621706400774928
Correlation with the test set: (0.35232597536568616, 1.6207031408680162e-09)

Dummy CV RMSE: 0.213314

Best CV score (RMSE): 0.06466678052393711
Best parameters: {'estimator__alpha': 0.001, 'estimator__l1_ratio': 1.0}

RMSE on training data: 0.06361547489971361
Correlation with the training set: (0.529043213345286, 8.44933282528062e-48)

RMSE on test data: 0.06426729916862893
Mean absolute error on test data: 0.04319809611429543
Correlation with the test set: (0.4879426822694338, 5.630358628808056e-18)

Dummy CV RMSE: 0.07333486764701581
Dummy test RMSE: 0.07364775846829204
106 drugs done
**************************************************

XMD13-2 (921, 16)
Best CV score (RMSE): 0.07436502815792721
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.0734937453728315
Correlation with the training set: (0.4904366924731734, 2.8155701376905776e-40)

RMSE on test data: 0.07201415171233456
Mean absolute error on test data: 0.05178057812864948
Correlation with the test set: (0.4844587705446274, 1.0434895941606302e-17)

Dummy CV RMSE: 0.08362834178

Best CV score (RMSE): 0.1845141013674615
Best parameters: {'estimator__alpha': 0.1, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.18012127773461545
Correlation with the training set: (0.3055245123559606, 2.277090335167138e-13)

RMSE on test data: 0.14867306909305328
Mean absolute error on test data: 0.11816455090215681
Correlation with the test set: (0.3291254450916456, 2.1576184336946797e-07)

Dummy CV RMSE: 0.1890160437979527
Dummy test RMSE: 0.1578399638241456
121 drugs done
**************************************************

Olaparib (827, 19)
Best CV score (RMSE): 0.034946609763664954
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.03447774723559141
Correlation with the training set: (0.2322538187097696, 1.6129075812573052e-08)

RMSE on test data: 0.04846310592304355
Mean absolute error on test data: 0.02829492439051925
Correlation with the test set: (0.14655483503373662, 0.020697073934079865)

Dummy CV RMSE: 0.0343152381286

Best CV score (RMSE): 0.026982147573292294
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.026659726639141495
Correlation with the training set: (0.2227696146781179, 3.9816477570952055e-08)

RMSE on test data: 0.022614532207551438
Mean absolute error on test data: 0.01642361993848852
Correlation with the test set: (0.14404406677303883, 0.021396690772282675)

Dummy CV RMSE: 0.026682502125907274
Dummy test RMSE: 0.02283223790198004
136 drugs done
**************************************************

PLX-4720 (799, 17)
Best CV score (RMSE): 0.07097646805077157
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.069570719755299
Correlation with the training set: (0.6022235601003204, 1.8304986698796783e-56)

RMSE on test data: 0.08197293628472704
Mean absolute error on test data: 0.047917717791630204
Correlation with the test set: (0.6435117411949529, 1.8726731347102843e-29)

Dummy CV RMSE: 0.08317

Best CV score (RMSE): 0.0655203820008125
Best parameters: {'estimator__alpha': 0.001, 'estimator__l1_ratio': 0.9}

RMSE on training data: 0.06392349812670332
Correlation with the training set: (0.6026983826057237, 2.7094271075107503e-55)

RMSE on test data: 0.04890677788505599
Mean absolute error on test data: 0.025641459693296002
Correlation with the test set: (0.6022477441281248, 1.7356392989706176e-24)

Dummy CV RMSE: 0.0740923653050532
Dummy test RMSE: 0.061230974082928935
151 drugs done
**************************************************

CCT007093 (899, 17)
Best CV score (RMSE): 0.02339870453530536
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.023082488994372687
Correlation with the training set: (0.26552449821402785, 1.306369201010672e-11)

RMSE on test data: 0.025954890333073116
Mean absolute error on test data: 0.01540603378489511
Correlation with the test set: (0.17302785264408188, 0.004351547984707394)

Dummy CV RMSE: 0.02341

Best CV score (RMSE): 0.0270228618135337
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.1}

RMSE on training data: 0.026489209272823517
Correlation with the training set: (0.2487691165036067, 2.1308348814607544e-10)

RMSE on test data: 0.034731419035383626
Mean absolute error on test data: 0.017513460172375775
Correlation with the test set: (0.04700138690826696, 0.4401004459296848)

Dummy CV RMSE: 0.02594573458608332
Dummy test RMSE: 0.0346393759598115
166 drugs done
**************************************************

QL-XII-61 (473, 17)
Best CV score (RMSE): 0.07026310685237327
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.0679398642269802
Correlation with the training set: (0.39267598458831937, 1.196903901806515e-13)

RMSE on test data: 0.0629836844962041
Mean absolute error on test data: 0.04863336707522938
Correlation with the test set: (0.30628737775432097, 0.00020966516955333203)

Dummy CV RMSE: 0.0714512057

Best CV score (RMSE): 0.18197496239143707
Best parameters: {'estimator__alpha': 0.1, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.17848367659374753
Correlation with the training set: (0.42300373863877, 2.022322905146593e-30)

RMSE on test data: 0.19867794626756966
Mean absolute error on test data: 0.16095712481949803
Correlation with the test set: (0.27261982026576104, 2.766928503863535e-06)

Dummy CV RMSE: 0.19648721909111883
Dummy test RMSE: 0.2059482795580716
181 drugs done
**************************************************

Olaparib (884, 19)
Best CV score (RMSE): 0.053407779755108103
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.1}

RMSE on training data: 0.05228914731164847
Correlation with the training set: (0.2732362627190461, 4.821865524630136e-12)

RMSE on test data: 0.0631263247277073
Mean absolute error on test data: 0.0398156460887064
Correlation with the test set: (0.34063056590001495, 1.1928634717294151e-08)

Dummy CV RMSE: 0.0531345680350

Save this Experiment instance

In [6]:
# Derive the output filename from the Experiment's "name" field:
# lower-cased, with every space turned into an underscore, plus a ".pkl" suffix
filename = "_".join(exp.name.lower().split(" ")) + ".pkl"
# Serialize the whole Experiment instance into the results directory
with open("../Created data/Results/" + filename, mode="wb") as f:
    dill.dump(exp, f)

### Random Forest regression with Z-score normalization for all features and randomized search for hyperparameter tuning - all data classes (expression, mutation, CNV, tissue) but without Merck gene expression signatures

Initialize dictionary with DrugWithDrugBank objects and compute input data for all drugs.

In [9]:
# Re-create the drug objects so this section starts from freshly built inputs
drugs = DrugWithDrugBank.create_drugs(drug_annotations_df, map_drugs_to_drugbank_targets)
print(len(drugs))

# Data classes that should make up the input of every drug
data_types = ["CNV", "mutation", "expression", "tissue"]

# Source tables and options handed over to the input-building routine
input_sources = dict(
    gene_expression_df=gene_expression_df,
    cnv_binary_df=cnv_binary_df,
    map_cl_id_and_feature_to_status=map_from_cl_id_and_genetic_feature_to_mutation_status,
    cell_lines_list_df=cell_lines_list_df,
    coding_variants_df=coding_variants_df,
    feat_threshold=16,
    log=True,
)

# Compute the input data for each drug
Experiment.create_input_for_each_drug(drugs, drug_response_df,
                                      data_combination=data_types, **input_sources)

267
10 drugs done
20 drugs done
30 drugs done
40 drugs done
50 drugs done
60 drugs done
70 drugs done
80 drugs done
90 drugs done
100 drugs done
110 drugs done
120 drugs done
130 drugs done
140 drugs done
150 drugs done
160 drugs done
170 drugs done
180 drugs done
190 drugs done
200 drugs done
210 drugs done
220 drugs done
230 drugs done
240 drugs done
250 drugs done
260 drugs done
Number of drugs with number of features bigger than 16: 184
Mean number of features in 267 drugs: 3.7265917602996255


In [5]:
#############################################################################################################
# Set up constants, feature normalization, predictive algorithm and initialize experiment instance
#############################################################################################################

log = True   # Print per-drug progress and scores
counter = 0   # Number of drugs modelled so far
sensitivity_metric = "AUC"   # Column of the input data used as the response variable
record_weights = False   # Does the model have coefficients we want to record?
n_folds = 5   # Number of folds in parameter tuning cross-validation
n_combinations = 20   # Number of parameter combinations to try (when using randomized search)

# Set up seeds for data split and randomized parameter tuning
split_seed = 37
tuning_seed = 53

scaler = preprocessing.StandardScaler()   # Setup transformer for feature scaling

# Setup algorithm to use. The forest itself is stochastic (bootstrap samples and
# feature sub-sampling), so it is seeded too; otherwise the results would differ
# between runs even with fixed split and tuning seeds.
estimator = RandomForestRegressor(random_state=tuning_seed)

# Hyperparameter space to search on
# NOTE(review): 'auto', 'mse' and 'mae' below are the option spellings of older
# scikit-learn releases; newer releases renamed or removed them -- confirm against
# the installed version before re-running.
# Number of trees in random forest
n_estimators = [10, 20, 50, 100, 200, 500]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree (None leaves the depth unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.linspace(2, 101, num = 10)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(2, 101, num = 10)]
# Function used to measure the quality of a split
criterion = ["mse", "mae"]

# Create the param grid
param_grid = {'estimator__n_estimators': n_estimators,
               'estimator__max_features': max_features,
               'estimator__max_depth': max_depth,
               'estimator__min_samples_split': min_samples_split,
               'estimator__min_samples_leaf': min_samples_leaf,
               'estimator__criterion': criterion}

# Function to be optimized during hyperparameter tuning
scoring = "neg_mean_squared_error"   # Function to be optimized by hyperparameters

# Initialize Experiment object for this assay
exp = Experiment(name="Only targets-Random Forest with Z-score normalization",
                algorithm="RandomForestRegressor",
                parameter_search_type="RandomizedSearch",
                data_normalization_type="StandardScaler on all features",
                kfolds=n_folds,
                split_seed=split_seed,
                tuning_seed=tuning_seed)

#############################################################################################################
# Enter the loop over drugs
#############################################################################################################

# Loop-invariant settings and result containers: created once instead of on every iteration
test_size = 0.3    # Fraction of data spent on test set
StandardDeviations = collections.namedtuple("StandardDeviations", ["validation", "test", "overall"])
ModelTestScores = collections.namedtuple("ModelTestScores", ["cv_best_score", "test_RMSE",
                                                "test_explained_variance", "test_correlation"])
BestModelWeights = collections.namedtuple("BestModelWeights", ["intercept", "weights_array"])
ModelTrainingScores = collections.namedtuple("ModelTrainingScores", ["training_RMSE", "training_correlation"])
DummyScores = collections.namedtuple("DummyScores", ["cv_RMSE", "test_RMSE", "test_explained_variance",
                                        "test_correlation"])

for ide in drugs:
    drug = drugs[ide]   # Current Drug object
    data = drug.full_data  # Extract input data (should be previously computed)
    if data.shape[0] == 0:   # Check if data exists, if not, skip the drug
        continue
    if data.shape[1] < 16:    # That means that data has only features related to tissue
        continue           # so also skip this case

    result_key = (drug.name, ide)   # Key under which every result of this drug is stored

    # Add input data to corresponding Experiment objects's field (dictionary)
    exp.input_data[result_key] = data
    if log:
        print(drug.name, data.shape)

    # Add data shapes to corresponding field in Experiment (dictionary)
    exp.data_shapes[result_key] = data.shape

    y = data[sensitivity_metric]   # Extract response variable
    X = data.drop(["cell_line_id", sensitivity_metric], axis=1)     # Drop response and cell line IDs
    assert X.shape[0] == y.shape[0]   # Sanity check
    assert X.shape[1] < data.shape[1]

    # Split data into training and test set
    # (the "_val" part is the training portion on which cross-validated tuning is performed)
    X_val, X_test, y_val, y_test = model_selection.train_test_split(X, y, test_size=test_size,
                                                                   random_state=split_seed)

    # Record standard deviations of the response in each part of the data
    stds = StandardDeviations(y_val.std(), y_test.std(), y.std())
    exp.data_stds[result_key] = stds

    # Set up and fit Dummy Regressor
    dummy = DummyRegressor()
    dummy.fit(X_val, y_val)
    # Get dummy predictions on the test set
    dummy_preds = dummy.predict(X_test)
    # Baseline dummy performance (scores are negated MSE, hence the sign flip before the root)
    baseline_scores = model_selection.cross_val_score(dummy, X_val, y_val, scoring=scoring, cv=n_folds)
    baseline_rmses = [(-x)**0.5 for x in baseline_scores]
    baseline_mean = np.mean(baseline_rmses)
    baseline_std = np.std(baseline_rmses)

    # Set elements of the pipeline, i.e. scaler and estimator
    transformer = clone(scaler)
    predictor = clone(estimator)   # Make a copy to ensure that for every drug we get "fresh" predictor

    # Create pipeline
    main_pipeline = Pipeline([
        ("scaler", transformer),
        ("estimator", predictor)
    ])

#     # Set up grid search with cross-validation
#     grid = model_selection.GridSearchCV(main_pipeline, param_grid=param_grid,
#                                        scoring=scoring, cv=n_folds)

    # Or use RandomizedSearch
    grid = model_selection.RandomizedSearchCV(main_pipeline, param_distributions=param_grid,
                                             n_iter=n_combinations, scoring=scoring, cv=n_folds,
                                             random_state=tuning_seed)

    # Fit the grid
    grid.fit(X_val, y_val)
    best = grid.best_estimator_   # Best model
    pred = grid.predict(X_test)   # Predict on test data with refitted best model
    training_preds = grid.predict(X_val)    # Predict on train data with refitted best model

    # Compute every reported metric once; the values are both printed and recorded below
    training_rmse = metrics.mean_squared_error(y_val, training_preds) ** 0.5
    training_correlation = pearsonr(y_val, training_preds)
    test_rmse = metrics.mean_squared_error(y_test, pred) ** 0.5
    test_correlation = pearsonr(y_test, pred)
    dummy_test_rmse = metrics.mean_squared_error(y_test, dummy_preds) ** 0.5

    if log:   # Report the results
        # Summarize grid search
        print("Best CV score (RMSE):", (-grid.best_score_)**0.5)
        print("Best parameters:", grid.best_params_)
        print("")

        # Print out performance of the best model on training set
        print("RMSE on training data:", training_rmse)
        print("Correlation with the training set:", training_correlation)
        print("")

        # Print out the performance of the best model on test set
        print("RMSE on test data:", test_rmse)
        print("Mean absolute error on test data:", metrics.mean_absolute_error(y_test, pred))
        print("Correlation with the test set:", test_correlation)
        print("")

        # Print out performance of dummy regressor
        print("Dummy CV RMSE:", baseline_mean)
        print("Dummy test RMSE:", dummy_test_rmse)

    # Record results in corresponding Experiment fields, mostly as named tuples
    # Regression performance on the test set
    # NOTE: cv_best_score keeps the raw value of `scoring` (grid.best_score_), not its RMSE
    model_test_scores = ModelTestScores(grid.best_score_, test_rmse,
                                       metrics.explained_variance_score(y_test, pred), test_correlation)
    # Record in appropriate field (dictionary)
    exp.best_scores[result_key] = model_test_scores

    # Full results of cross-validation tuning
    exp.cv_results[result_key] = grid.cv_results_

    # Parameters of best found model
    exp.best_parameters[result_key] = grid.best_params_

    # Coefficients (if model has them)
    if record_weights:
        fitted_estimator = best.named_steps["estimator"]
        best_model_weights = BestModelWeights(fitted_estimator.intercept_, fitted_estimator.coef_)
        # Record in appropriate field (dictionary)
        exp.coefficients[result_key] = best_model_weights

    # Performance on the training set
    model_training_scores = ModelTrainingScores(training_rmse, training_correlation)
    # Record in appropriate field (dictionary)
    exp.training_scores[result_key] = model_training_scores

    # Performance of dummy model
    # NOTE(review): with its default settings DummyRegressor predicts a constant, for which the
    # Pearson correlation is undefined -- expect NaN in test_correlation here
    dummy_performance = DummyScores(baseline_mean, dummy_test_rmse,
                                   metrics.explained_variance_score(y_test, dummy_preds),
                                   pearsonr(y_test, dummy_preds))
    # Record in appropriate field (dictionary)
    exp.dummy_scores[result_key] = dummy_performance

    # Increment counter and display summary information
    counter += 1
    if log:
        print(counter, "drugs done")
        print("*" * 50)
        print("")

Erlotinib (347, 19)
Best CV score (RMSE): 0.08192895188001664
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.07789852663734911
Correlation with the training set: (0.4859686225513045, 9.529330929220756e-16)

RMSE on test data: 0.06264247825828283
Mean absolute error on test data: 0.03841156290849046
Correlation with the test set: (0.48350898063643905, 1.7441774498702933e-07)

Dummy CV RMSE: 0.08390702315605628
Dummy test RMSE: 0.07096557172657372
1 drugs done
**************************************************

Rapamycin (362, 16)
Best CV score (RMSE): 0.22786238423760885
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on trainin

Best CV score (RMSE): 0.07057537608962157
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 68, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'auto', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.05849640270416974
Correlation with the training set: (0.6668825809569338, 1.851553867767928e-35)

RMSE on test data: 0.037976412511410554
Mean absolute error on test data: 0.02512683723582091
Correlation with the test set: (0.6119394532680287, 4.714150147191803e-13)

Dummy CV RMSE: 0.07362992936516966
Dummy test RMSE: 0.048153759043624574
13 drugs done
**************************************************

Saracatinib (406, 17)
Best CV score (RMSE): 0.08535481565273678
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.0778716

Best CV score (RMSE): 0.1177621212387812
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.11293290581381815
Correlation with the training set: (0.34351816067098573, 3.568428621523405e-09)

RMSE on test data: 0.11841405991049077
Mean absolute error on test data: 0.08490925315939536
Correlation with the test set: (0.2247256920493405, 0.013208177301412171)

Dummy CV RMSE: 0.11835498978154275
Dummy test RMSE: 0.12136745255786242
25 drugs done
**************************************************

JW-7-52-1 (358, 17)
Best CV score (RMSE): 0.22482735012519614
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.216507946256

Best CV score (RMSE): 0.3111181392798395
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 68, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'auto', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.27607914097060576
Correlation with the training set: (0.5638975562713575, 7.953663926310995e-51)

RMSE on test data: 0.29235811838935016
Mean absolute error on test data: 0.25411334002644237
Correlation with the test set: (0.28230951037373303, 5.083979936369114e-06)

Dummy CV RMSE: 0.3150778122249769
Dummy test RMSE: 0.30484467357967293
37 drugs done
**************************************************

Bicalutamide (870, 16)
Best CV score (RMSE): 0.012155585408060558
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 79, 'estimator__min_samples_leaf': 35, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 70, 'estimator__criterion': 'mse'}

RMSE on training data: 0.01201432

Best CV score (RMSE): 0.10920284066488938
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.10594387090531052
Correlation with the training set: (0.4056512539887519, 7.423605467115249e-25)

RMSE on test data: 0.10140995836956096
Mean absolute error on test data: 0.08567458873384531
Correlation with the test set: (0.3556668563885619, 5.10122330901825e-09)

Dummy CV RMSE: 0.11557968317749952
Dummy test RMSE: 0.10826747234358583
49 drugs done
**************************************************

GSK650394 (840, 18)
Best CV score (RMSE): 0.12339463008529648
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 46, 'estimator__min_samples_leaf': 46, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.12215508975

Best CV score (RMSE): 0.15618064535672324
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 68, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'auto', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.14586578103438733
Correlation with the training set: (0.5020043129831164, 2.9623031309165774e-40)

RMSE on test data: 0.1571569659125318
Mean absolute error on test data: 0.1225680373697816
Correlation with the test set: (0.31297925481786126, 2.3120865810311252e-07)

Dummy CV RMSE: 0.16796128826555917
Dummy test RMSE: 0.1650067198534384
61 drugs done
**************************************************

Epothilone B (848, 29)
Best CV score (RMSE): 0.21195011577137354
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.19574389

Best CV score (RMSE): 0.14922566705208354
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 68, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'auto', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.13660516390311883
Correlation with the training set: (0.608503608156018, 1.8645528468188018e-62)

RMSE on test data: 0.15425917997626792
Mean absolute error on test data: 0.11588010636405904
Correlation with the test set: (0.4792253324708913, 2.8149755623847507e-16)

Dummy CV RMSE: 0.17108559328583092
Dummy test RMSE: 0.17569371607430592
73 drugs done
**************************************************

XMD14-99 (863, 20)
Best CV score (RMSE): 0.03268360606109935
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 68, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'auto', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.02987890839

Best CV score (RMSE): 0.18521738423216597
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 24, 'estimator__max_features': 'auto', 'estimator__max_depth': 100, 'estimator__criterion': 'mae'}

RMSE on training data: 0.17743504375722413
Correlation with the training set: (0.5429258747326727, 1.549180607230697e-46)

RMSE on test data: 0.20981934068106695
Mean absolute error on test data: 0.17294232862992126
Correlation with the test set: (0.33347927682678685, 5.173642087454086e-08)

Dummy CV RMSE: 0.21073737259750497
Dummy test RMSE: 0.2194496671441799
85 drugs done
**************************************************

Belinostat (836, 28)
Best CV score (RMSE): 0.2065499869285901
Best parameters: {'estimator__n_estimators': 100, 'estimator__min_samples_split': 35, 'estimator__min_samples_leaf': 35, 'estimator__max_features': 'auto', 'estimator__max_depth': None, 'estimator__criterion': 'mse'}

RMSE on training data: 0.195003

Best CV score (RMSE): 0.12308292013712184
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.12077429040476188
Correlation with the training set: (0.5524416033459453, 9.836810382353999e-53)

RMSE on test data: 0.13551193022958552
Mean absolute error on test data: 0.08836261726591906
Correlation with the test set: (0.4877210563546144, 6.720956135547799e-18)

Dummy CV RMSE: 0.14380385733845222
Dummy test RMSE: 0.1555047126969616
97 drugs done
**************************************************

OSI-930 (862, 19)
Best CV score (RMSE): 0.05527039778888828
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 68, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'auto', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.04537638616407

Best CV score (RMSE): 0.05157420145005419
Best parameters: {'estimator__n_estimators': 100, 'estimator__min_samples_split': 35, 'estimator__min_samples_leaf': 35, 'estimator__max_features': 'auto', 'estimator__max_depth': None, 'estimator__criterion': 'mse'}

RMSE on training data: 0.05009934154585543
Correlation with the training set: (0.36043308373202804, 1.6172293853093717e-20)

RMSE on test data: 0.07550469851847104
Mean absolute error on test data: 0.022774269212923498
Correlation with the test set: (0.32811555867981507, 4.0419477572145354e-08)

Dummy CV RMSE: 0.04839477641404758
Dummy test RMSE: 0.07888257012812597
109 drugs done
**************************************************

Selisistat (917, 16)
Best CV score (RMSE): 0.01617638399310106
Best parameters: {'estimator__n_estimators': 10, 'estimator__min_samples_split': 46, 'estimator__min_samples_leaf': 101, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 10, 'estimator__criterion': 'mse'}

RMSE on training data: 0.

Best CV score (RMSE): 0.18505229328392778
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.17833949582288497
Correlation with the training set: (0.3482016415398293, 3.77038697383135e-17)

RMSE on test data: 0.1477241138051084
Mean absolute error on test data: 0.11801549602886964
Correlation with the test set: (0.35256343194732515, 2.4172946759797762e-08)

Dummy CV RMSE: 0.18938291011860264
Dummy test RMSE: 0.1578399638241456
121 drugs done
**************************************************

Olaparib (827, 19)
Best CV score (RMSE): 0.03526595841216408
Best parameters: {'estimator__n_estimators': 10, 'estimator__min_samples_split': 68, 'estimator__min_samples_leaf': 68, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 40, 'estimator__criterion': 'mse'}

RMSE on training data: 0.035151706198

Best CV score (RMSE): 0.05532509105888916
Best parameters: {'estimator__n_estimators': 100, 'estimator__min_samples_split': 35, 'estimator__min_samples_leaf': 35, 'estimator__max_features': 'auto', 'estimator__max_depth': None, 'estimator__criterion': 'mse'}

RMSE on training data: 0.05333192444495171
Correlation with the training set: (0.30645444426556206, 1.4868735173266213e-13)

RMSE on test data: 0.0404682280022078
Mean absolute error on test data: 0.022808426494679648
Correlation with the test set: (0.38792020311328385, 5.27951319873584e-10)

Dummy CV RMSE: 0.05087105693777575
Dummy test RMSE: 0.04369550302048499
133 drugs done
**************************************************

KU-55933 (798, 17)
Best CV score (RMSE): 0.03853877455629316
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 46, 'estimator__min_samples_leaf': 46, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.03830

Best CV score (RMSE): 0.044564709500630274
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.0427095504283273
Correlation with the training set: (0.3613747497785941, 9.186232565079147e-20)

RMSE on test data: 0.04864304837296848
Mean absolute error on test data: 0.033504115795099314
Correlation with the test set: (0.27862175714154763, 6.268007798279797e-06)

Dummy CV RMSE: 0.04517839441116491
Dummy test RMSE: 0.05050498070097953
145 drugs done
**************************************************

MK-2206 (772, 19)
Best CV score (RMSE): 0.13041058303644873
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.1242292833

Best CV score (RMSE): 0.1022719770281993
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.09773852203667824
Correlation with the training set: (0.3500878391410612, 3.5149182487963197e-11)

RMSE on test data: 0.09610651476770556
Mean absolute error on test data: 0.0767355091138088
Correlation with the test set: (0.25274198462965386, 0.002085549052734018)

Dummy CV RMSE: 0.10329452749208108
Dummy test RMSE: 0.09915311198570526
157 drugs done
**************************************************

HG-5-88-01 (466, 18)
Best CV score (RMSE): 0.09173925420682152
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.0873169135

Best CV score (RMSE): 0.07801197009072289
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.07570669721269874
Correlation with the training set: (0.40100594742168355, 1.2754631665264064e-26)

RMSE on test data: 0.07639419227496619
Mean absolute error on test data: 0.05869589413322247
Correlation with the test set: (0.1848500721499844, 0.001895679598447278)

Dummy CV RMSE: 0.08211008909685688
Dummy test RMSE: 0.07680926847141752
169 drugs done
**************************************************

IOX2 (907, 17)
Best CV score (RMSE): 0.035348267076731964
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.0344685445670

Best CV score (RMSE): 0.18348759612014667
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.17658354160752465
Correlation with the training set: (0.4520692993986532, 5.245608075818352e-35)

RMSE on test data: 0.19960827282508498
Mean absolute error on test data: 0.1628368149439533
Correlation with the test set: (0.2547262605121462, 1.2494123716895814e-05)

Dummy CV RMSE: 0.19671011099603103
Dummy test RMSE: 0.2059482795580716
181 drugs done
**************************************************

Olaparib (884, 19)
Best CV score (RMSE): 0.05404665013595639
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.050741198644

Save this Experiment instance

In [6]:
# Derive the pickle's file name from the Experiment's "name" field:
# lower-cased, with every space turned into an underscore
filename = "{}.pkl".format("_".join(exp.name.lower().split(" ")))
print(filename)
# Serialize the whole Experiment instance into the results directory
output_path = "../Created data/Results/" + filename
with open(output_path, "wb") as out_file:
    dill.dump(exp, out_file)

only_targets-random_forest_with_z-score_normalization.pkl


### Elastic Net with Z-score normalization for all features and exhaustive grid search for hyperparameter tuning - all data classes (expression, mutation, CNV, tissue) + Merck gene expression signatures

Initialize dictionary with DrugWithDrugBank objects and compute input data for them

In [4]:
# Build the dictionary of drug objects from the annotations and the DrugBank target mapping
drugs = DrugWithDrugBank.create_drugs(drug_annotations_df, map_drugs_to_drugbank_targets)
print(len(drugs))

# Data classes to combine into the input of every drug
data_types = ["CNV", "mutation", "expression", "tissue", "merck signatures"]

# Source tables handed over to the input builder, gathered in one place
input_sources = dict(
    gene_expression_df=gene_expression_df,
    cnv_binary_df=cnv_binary_df,
    map_cl_id_and_feature_to_status=map_from_cl_id_and_genetic_feature_to_mutation_status,
    cell_lines_list_df=cell_lines_list_df,
    coding_variants_df=coding_variants_df,
    merck_signatures_df=signatures_df,
)

# Compute the input data for each drug (the result is read later from each drug's `full_data`)
Experiment.create_input_for_each_drug(
    drugs,
    drug_response_df,
    data_combination=data_types,
    feat_threshold=144,
    log=True,
    **input_sources
)


267
10 drugs done
20 drugs done
30 drugs done
40 drugs done
50 drugs done
60 drugs done
70 drugs done
80 drugs done
90 drugs done
100 drugs done
110 drugs done
120 drugs done
130 drugs done
140 drugs done
150 drugs done
160 drugs done
170 drugs done
180 drugs done
190 drugs done
200 drugs done
210 drugs done
220 drugs done
230 drugs done
240 drugs done
250 drugs done
260 drugs done
Number of drugs with number of features bigger than 144: 184
Mean number of features in 267 drugs: 131.24719101123594


In [5]:
#############################################################################################################
# Constants, feature scaling, predictive algorithm and the Experiment instance for this assay
#############################################################################################################

# --- Run control -------------------------------------------------------------------------------------------
log = True   # Print progress and scores for every drug
counter = 0   # Drugs modelled so far
record_weights = True   # The model exposes coefficients and we want to store them

# --- Data and evaluation settings --------------------------------------------------------------------------
sensitivity_metric = "AUC"   # Response column
split_seed = 37   # Seed of the train/test split
n_folds = 10   # Folds used by cross-validated parameter tuning
n_combinations = 20   # Parameter combinations to sample (relevant for randomized search only)
scoring = "neg_mean_squared_error"   # Objective optimized during hyperparameter tuning

# --- Pipeline building blocks ------------------------------------------------------------------------------
scaler = preprocessing.StandardScaler()   # Z-score normalization of the features
estimator = ElasticNet(max_iter=2000)   # Predictive algorithm

# --- Hyperparameter space ----------------------------------------------------------------------------------
# Keys carry the "estimator__" prefix of the pipeline step they configure
param_grid = dict(
    estimator__alpha=[0.001, 0.01, 0.1, 1.0, 5.0, 10.0, 30.0, 50.0, 100.0, 300.0],
    estimator__l1_ratio=[0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0],
)

# --- Experiment bookkeeping object -------------------------------------------------------------------------
exp = Experiment(
    name="Only targets with merck signatures-Elastic Net with Z-score normalization",
    algorithm="ElasticNet",
    parameter_search_type="GridSearch",
    data_normalization_type="StandardScaler on all features",
    kfolds=n_folds,
    split_seed=split_seed,
)

#############################################################################################################
# Enter the loop over drugs
#############################################################################################################

# Loop-invariant settings and result containers: created once instead of on every iteration
test_size = 0.3    # Fraction of data spent on test set
StandardDeviations = collections.namedtuple("StandardDeviations", ["validation", "test", "overall"])
ModelTestScores = collections.namedtuple("ModelTestScores", ["cv_best_score", "test_RMSE",
                                                "test_explained_variance", "test_correlation"])
BestModelWeights = collections.namedtuple("BestModelWeights", ["intercept", "weights_array"])
ModelTrainingScores = collections.namedtuple("ModelTrainingScores", ["training_RMSE", "training_correlation"])
DummyScores = collections.namedtuple("DummyScores", ["cv_RMSE", "test_RMSE", "test_explained_variance",
                                        "test_correlation"])

for ide in drugs:
    drug = drugs[ide]   # Current Drug object
    data = drug.full_data  # Extract input data (should be previously computed)
    if data.shape[0] == 0:   # Check if data exists, if not, skip the drug
        continue
    # Fewer than 144 columns: presumably no target-gene features, only the tissue and
    # Merck-signature columns shared by every drug -- TODO confirm the meaning of 144
    if data.shape[1] < 144:
        continue           # so also skip this case

    result_key = (drug.name, ide)   # Key under which every result of this drug is stored

    # Add input data to corresponding Experiment objects's field (dictionary)
    exp.input_data[result_key] = data
    if log:
        print(drug.name, data.shape)

    # Add data shapes to corresponding field in Experiment (dictionary)
    exp.data_shapes[result_key] = data.shape

    y = data[sensitivity_metric]   # Extract response variable
    X = data.drop(["cell_line_id", sensitivity_metric], axis=1)     # Drop response and cell line IDs
    assert X.shape[0] == y.shape[0]   # Sanity check
    assert X.shape[1] < data.shape[1]

    # Split data into training and test set
    # (the "_val" part is the training portion on which cross-validated tuning is performed)
    X_val, X_test, y_val, y_test = model_selection.train_test_split(X, y, test_size=test_size,
                                                                   random_state=split_seed)

    # Record standard deviations of the response in each part of the data
    stds = StandardDeviations(y_val.std(), y_test.std(), y.std())
    exp.data_stds[result_key] = stds

    # Set up and fit Dummy Regressor
    dummy = DummyRegressor()
    dummy.fit(X_val, y_val)
    # Get dummy predictions on the test set
    dummy_preds = dummy.predict(X_test)
    # Baseline dummy performance (scores are negated MSE, hence the sign flip before the root)
    baseline_scores = model_selection.cross_val_score(dummy, X_val, y_val, scoring=scoring, cv=n_folds)
    baseline_rmses = [(-x)**0.5 for x in baseline_scores]
    baseline_mean = np.mean(baseline_rmses)
    baseline_std = np.std(baseline_rmses)

    # Set elements of the pipeline, i.e. scaler and estimator
    # Both are cloned so that no fitted state can be shared between drugs
    # (the scaler used to be passed in uncloned, unlike in the Random Forest experiment)
    transformer = clone(scaler)
    predictor = clone(estimator)

    # Create pipeline
    main_pipeline = Pipeline([
        ("scaler", transformer),
        ("estimator", predictor)
    ])

    # Set up grid search with cross-validation
    grid = model_selection.GridSearchCV(main_pipeline, param_grid=param_grid,
                                       scoring=scoring, cv=n_folds)

    # Fit the grid
    grid.fit(X_val, y_val)
    best = grid.best_estimator_   # Best model
    pred = grid.predict(X_test)   # Predict on test data with refitted best model
    training_preds = grid.predict(X_val)    # Predict on train data with refitted best model

    # Compute every reported metric once; the values are both printed and recorded below
    training_rmse = metrics.mean_squared_error(y_val, training_preds) ** 0.5
    training_correlation = pearsonr(y_val, training_preds)
    test_rmse = metrics.mean_squared_error(y_test, pred) ** 0.5
    test_correlation = pearsonr(y_test, pred)
    dummy_test_rmse = metrics.mean_squared_error(y_test, dummy_preds) ** 0.5

    if log:   # Report the results
        # Summarize grid search
        print("Best CV score (RMSE):", (-grid.best_score_)**0.5)
        print("Best parameters:", grid.best_params_)
        print("")

        # Print out performance of the best model on training set
        print("RMSE on training data:", training_rmse)
        print("Correlation with the training set:", training_correlation)
        print("")

        # Print out the performance of the best model on test set
        print("RMSE on test data:", test_rmse)
        print("Mean absolute error on test data:", metrics.mean_absolute_error(y_test, pred))
        print("Correlation with the test set:", test_correlation)
        print("")

        # Print out performance of dummy regressor
        print("Dummy CV RMSE:", baseline_mean)
        print("Dummy test RMSE:", dummy_test_rmse)

    # Record results in corresponding Experiment fields, mostly as named tuples
    # Regression performance on the test set
    # NOTE: cv_best_score keeps the raw value of `scoring` (grid.best_score_), not its RMSE
    model_test_scores = ModelTestScores(grid.best_score_, test_rmse,
                                       metrics.explained_variance_score(y_test, pred), test_correlation)
    # Record in appropriate field (dictionary)
    exp.best_scores[result_key] = model_test_scores

    # Full results of cross-validation tuning
    exp.cv_results[result_key] = grid.cv_results_

    # Parameters of best found model
    exp.best_parameters[result_key] = grid.best_params_

    # Coefficients (if model has them)
    if record_weights:
        fitted_estimator = best.named_steps["estimator"]
        best_model_weights = BestModelWeights(fitted_estimator.intercept_, fitted_estimator.coef_)
        # Record in appropriate field (dictionary)
        exp.coefficients[result_key] = best_model_weights

    # Performance on the training set
    model_training_scores = ModelTrainingScores(training_rmse, training_correlation)
    # Record in appropriate field (dictionary)
    exp.training_scores[result_key] = model_training_scores

    # Performance of dummy model
    # NOTE(review): with its default settings DummyRegressor predicts a constant, for which the
    # Pearson correlation is undefined -- expect NaN in test_correlation here
    dummy_performance = DummyScores(baseline_mean, dummy_test_rmse,
                                   metrics.explained_variance_score(y_test, dummy_preds),
                                   pearsonr(y_test, dummy_preds))
    # Record in appropriate field (dictionary)
    exp.dummy_scores[result_key] = dummy_performance

    # Increment counter and display summary information
    counter += 1
    if log:
        print(counter, "drugs done")
        print("*" * 50)
        print("")

Erlotinib (347, 147)
Best CV score (RMSE): 0.08120291865173236
Best parameters: {'estimator__alpha': 5.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.07630734986277565
Correlation with the training set: (0.5172220123741602, 5.868314066133642e-18)

RMSE on test data: 0.06413236771815596
Mean absolute error on test data: 0.039198860332930116
Correlation with the test set: (0.4299575143069548, 4.707016052079047e-06)

Dummy CV RMSE: 0.07699414121333494
Dummy test RMSE: 0.07096557172657372
1 drugs done
**************************************************

Rapamycin (362, 144)
Best CV score (RMSE): 0.22324135566945788
Best parameters: {'estimator__alpha': 5.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.21198581990200463
Correlation with the training set: (0.4520300228075026, 3.819159533435406e-14)

RMSE on test data: 0.23318365513907446
Mean absolute error on test data: 0.16696642351703633
Correlation with the test set: (0.21540178735473808, 0.0244865229934914)

Dummy CV R

Best CV score (RMSE): 0.216915204250626
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.5}

RMSE on training data: 0.17104227403243022
Correlation with the training set: (0.7647448838846853, 8.810114130927631e-51)

RMSE on test data: 0.26030889548307556
Mean absolute error on test data: 0.190134177833353
Correlation with the test set: (0.33840032510305956, 0.00028047639950015506)

Dummy CV RMSE: 0.25989510330242505
Dummy test RMSE: 0.26432504424334047
16 drugs done
**************************************************

GNF-2 (370, 144)
Best CV score (RMSE): 0.03923335884767753
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.03440223963735584
Correlation with the training set: (0.7104785034993525, 4.2166655068500107e-41)

RMSE on test data: 0.05302462296235967
Mean absolute error on test data: 0.024628300775499302
Correlation with the test set: (0.5615365179225754, 1.4411001186648766e-10)

Dummy CV RMSE: 0.03673795070066

Best CV score (RMSE): 0.044477062223604924
Best parameters: {'estimator__alpha': 50.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.04394532776196716
Correlation with the training set: (0.2292614077814403, 0.00014445489981028447)

RMSE on test data: 0.044647422889630445
Mean absolute error on test data: 0.034852862708624775
Correlation with the test set: (0.19906848135556773, 0.03216901539974498)

Dummy CV RMSE: 0.044147600662650645
Dummy test RMSE: 0.045038342399505944
31 drugs done
**************************************************

Salubrinal (398, 144)
Best CV score (RMSE): 0.06314587292981573
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.5}

RMSE on training data: 0.06107163367079944
Correlation with the training set: (0.3508589640536589, 1.7882787829968335e-09)

RMSE on test data: 0.052362151654206414
Mean absolute error on test data: 0.04331658284938267
Correlation with the test set: (0.18146593355067267, 0.04730562149096253)

Dummy CV RMSE: 0.064

Best CV score (RMSE): 0.16170434532134062
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.5}

RMSE on training data: 0.15505648704937844
Correlation with the training set: (0.3951605925901983, 2.860176095129387e-24)

RMSE on test data: 0.15789359544121953
Mean absolute error on test data: 0.12581927178636318
Correlation with the test set: (0.2201119953914306, 0.00032212409442966273)

Dummy CV RMSE: 0.16698174502464552
Dummy test RMSE: 0.1619104537464577
46 drugs done
**************************************************

AKT inhibitor VIII (823, 149)
Best CV score (RMSE): 0.05510713996030728
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.05289689269705973
Correlation with the training set: (0.41010995118064575, 8.966931304478017e-25)

RMSE on test data: 0.04912325937872402
Mean absolute error on test data: 0.03651584154933531
Correlation with the test set: (0.2975462763506324, 1.927746102156921e-06)

Dummy CV RMSE: 0

Best CV score (RMSE): 0.1542092807483076
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.75}

RMSE on training data: 0.1498250891942239
Correlation with the training set: (0.4572333835113499, 7.625345919780927e-33)

RMSE on test data: 0.15584991028751163
Mean absolute error on test data: 0.11899020002875164
Correlation with the test set: (0.3347451891704232, 2.800790103437064e-08)

Dummy CV RMSE: 0.16764569637815901
Dummy test RMSE: 0.1650067198534384
61 drugs done
**************************************************

Epothilone B (848, 157)
Best CV score (RMSE): 0.2087168311972843
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.19677205539447862
Correlation with the training set: (0.44570182335098746, 2.776298214717006e-30)

RMSE on test data: 0.20400128673280088
Mean absolute error on test data: 0.17537130608690146
Correlation with the test set: (0.3145336951920127, 2.916200883897301e-07)

Dummy CV RMSE: 0.2159339271

Best CV score (RMSE): 0.0467162363570052
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.043861229900651645
Correlation with the training set: (0.580520907897873, 1.050558603615748e-55)

RMSE on test data: 0.05071639664332255
Mean absolute error on test data: 0.0226755078151363
Correlation with the test set: (0.2941522605517265, 1.4496822811998677e-06)

Dummy CV RMSE: 0.04788330134872616
Dummy test RMSE: 0.05271661695848376
76 drugs done
**************************************************

JW-7-24-1 (922, 144)
Best CV score (RMSE): 0.14006007135552478
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.75}

RMSE on training data: 0.1350725977807164
Correlation with the training set: (0.5673229625327876, 3.255079342983682e-56)

RMSE on test data: 0.12962377989999702
Mean absolute error on test data: 0.0999199583281538
Correlation with the test set: (0.5812369761406293, 1.9825857247257782e-26)

Dummy CV RMSE: 0.1631758955

Best CV score (RMSE): 0.15387709304208233
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.1452616181735132
Correlation with the training set: (0.5583832042713623, 9.968426334433109e-51)

RMSE on test data: 0.14697792711140198
Mean absolute error on test data: 0.10135189924039917
Correlation with the test set: (0.46707047730450896, 1.9380712001366226e-15)

Dummy CV RMSE: 0.1684128697353152
Dummy test RMSE: 0.16590057570519276
91 drugs done
**************************************************

Omipalisib (922, 144)
Best CV score (RMSE): 0.19428179935498804
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.1836606657919528
Correlation with the training set: (0.5216490987932753, 2.6986786850491004e-46)

RMSE on test data: 0.19336514136431682
Mean absolute error on test data: 0.1571358539969347
Correlation with the test set: (0.4334733927428089, 4.071478368232782e-14)

Dummy CV RMSE: 0.213314417053

Best CV score (RMSE): 0.06513233232566878
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.06170893589719518
Correlation with the training set: (0.57306682825252, 1.4223881570689603e-57)

RMSE on test data: 0.06315329610986606
Mean absolute error on test data: 0.042391935251211085
Correlation with the test set: (0.5177801653815952, 2.1182161113448085e-20)

Dummy CV RMSE: 0.07333486764701581
Dummy test RMSE: 0.07364775846829204
106 drugs done
**************************************************

XMD13-2 (921, 144)
Best CV score (RMSE): 0.07156383552563429
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.0680498528023081
Correlation with the training set: (0.5954849476444108, 4.747976093001731e-63)

RMSE on test data: 0.07087768061184829
Mean absolute error on test data: 0.050704755482966886
Correlation with the test set: (0.5071893407701691, 1.6363657798142854e-19)

Dummy CV RMSE: 0.08362834178

Best CV score (RMSE): 0.1738256886999125
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.16426501192629717
Correlation with the training set: (0.5167071507847342, 6.16376870890368e-39)

RMSE on test data: 0.13861794239276762
Mean absolute error on test data: 0.1087697396495528
Correlation with the test set: (0.46700142395421723, 3.063263242813765e-14)

Dummy CV RMSE: 0.1890160437979527
Dummy test RMSE: 0.1578399638241456
121 drugs done
**************************************************

Olaparib (827, 147)
Best CV score (RMSE): 0.034344050123234875
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.1}

RMSE on training data: 0.03292542094366573
Correlation with the training set: (0.3716885937730662, 2.2426896872239295e-20)

RMSE on test data: 0.048391907098133694
Mean absolute error on test data: 0.028565844888663587
Correlation with the test set: (0.15971827114872772, 0.011607644656315306)

Dummy CV RMSE: 0.03431523812

Best CV score (RMSE): 0.027120828486307952
Best parameters: {'estimator__alpha': 10.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.026767806952538477
Correlation with the training set: (0.23056277556844418, 1.277648027862552e-08)

RMSE on test data: 0.022554533471170323
Mean absolute error on test data: 0.016172262935379066
Correlation with the test set: (0.17420763521595134, 0.0052778495473622886)

Dummy CV RMSE: 0.026682502125907274
Dummy test RMSE: 0.02283223790198004
136 drugs done
**************************************************

PLX-4720 (799, 145)
Best CV score (RMSE): 0.07080830914262036
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.25}

RMSE on training data: 0.06831496624612975
Correlation with the training set: (0.6216643578324755, 4.435917701830309e-61)

RMSE on test data: 0.08209220541283618
Mean absolute error on test data: 0.048461158045828866
Correlation with the test set: (0.6401551308700828, 4.501635890121371e-29)

Dummy CV RMSE: 0.0

Best CV score (RMSE): 0.06529453173146325
Best parameters: {'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.5}

RMSE on training data: 0.06352785311768704
Correlation with the training set: (0.6146928254582112, 4.830253955104958e-58)

RMSE on test data: 0.04756484515927484
Mean absolute error on test data: 0.024787530850507552
Correlation with the test set: (0.6269326962236319, 5.81932295235863e-27)

Dummy CV RMSE: 0.0740923653050532
Dummy test RMSE: 0.061230974082928935
151 drugs done
**************************************************

CCT007093 (899, 145)
Best CV score (RMSE): 0.022971910193358917
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.021848303722834944
Correlation with the training set: (0.41243199563716676, 3.1552028634600855e-27)

RMSE on test data: 0.02564788836495321
Mean absolute error on test data: 0.015098250462787547
Correlation with the test set: (0.23253659236078542, 0.00011514834332460554)

Dummy CV RMSE: 0.023

Best CV score (RMSE): 0.027112639962949946
Best parameters: {'estimator__alpha': 10.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.026647678667852524
Correlation with the training set: (0.23862012308559852, 1.1681680075891727e-09)

RMSE on test data: 0.03486521201868462
Mean absolute error on test data: 0.017512787180956856
Correlation with the test set: (-0.025761269174569994, 0.6723103666068708)

Dummy CV RMSE: 0.02594573458608332
Dummy test RMSE: 0.0346393759598115
166 drugs done
**************************************************

QL-XII-61 (473, 145)
Best CV score (RMSE): 0.06991364349417295
Best parameters: {'estimator__alpha': 5.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.06632311352697733
Correlation with the training set: (0.44302660431051005, 2.408580053786858e-17)

RMSE on test data: 0.06099176840623788
Mean absolute error on test data: 0.04725700898798855
Correlation with the test set: (0.4000034324684948, 8.141084850961643e-07)

Dummy CV RMSE: 0.07145

Best CV score (RMSE): 0.17284618652970735
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.16483569665011197
Correlation with the training set: (0.5598910980786763, 1.8384583932752998e-56)

RMSE on test data: 0.19038437144851353
Mean absolute error on test data: 0.15364844698835858
Correlation with the test set: (0.38243812596360977, 1.98211913289769e-11)

Dummy CV RMSE: 0.19648721909111883
Dummy test RMSE: 0.2059482795580716
181 drugs done
**************************************************

Olaparib (884, 147)
Best CV score (RMSE): 0.052960474648125554
Best parameters: {'estimator__alpha': 1.0, 'estimator__l1_ratio': 0.0}

RMSE on training data: 0.05023571387480761
Correlation with the training set: (0.40657956875964435, 5.303996833777446e-26)

RMSE on test data: 0.06202079604265804
Mean absolute error on test data: 0.03885553887675822
Correlation with the test set: (0.4060411308118223, 5.569615899560067e-12)

Dummy CV RMSE: 0.0531345680

Save this Experiment instance

In [6]:
# Derive the pickle's file name from the Experiment's "name" field:
# lower-case it and turn every space into an underscore
filename = "_".join(exp.name.lower().split(" ")) + ".pkl"
print(filename)
# Serialize the whole Experiment instance into the results directory
results_path = "../Created data/Results/" + filename
with open(results_path, "wb") as out_file:
    dill.dump(exp, out_file)

only_targets_with_merck_signatures-elastic_net_with_z-score_normalization.pkl


### Random Forest Regressor with Z-score normalization for all features and randomized grid search for hyperparameter tuning - all data classes (expression, mutation, CNV, tissue) + Merck gene expression signatures

Initialize dictionary with DrugWithDrugBank objects and compute input data for them

In [4]:
# Build the collection of DrugWithDrugBank objects (indexed by drug ID in the modeling loop)
drugs = DrugWithDrugBank.create_drugs(drug_annotations_df, map_drugs_to_drugbank_targets)
print(len(drugs))

# Data classes that should make up the input of every drug
data_types = ["CNV", "mutation", "expression", "tissue", "merck signatures"]

# Source tables handed over to the input-creating routine, gathered in one place
input_sources = dict(
    gene_expression_df=gene_expression_df,
    cnv_binary_df=cnv_binary_df,
    map_cl_id_and_feature_to_status=map_from_cl_id_and_genetic_feature_to_mutation_status,
    cell_lines_list_df=cell_lines_list_df,
    coding_variants_df=coding_variants_df,
    merck_signatures_df=signatures_df,
)

# Compute the input data for each drug (progress is printed because log=True)
Experiment.create_input_for_each_drug(drugs, drug_response_df, data_combination=data_types,
                                      feat_threshold=144, log=True, **input_sources)

267
10 drugs done
20 drugs done
30 drugs done
40 drugs done
50 drugs done
60 drugs done
70 drugs done
80 drugs done
90 drugs done
100 drugs done
110 drugs done
120 drugs done
130 drugs done
140 drugs done
150 drugs done
160 drugs done
170 drugs done
180 drugs done
190 drugs done
200 drugs done
210 drugs done
220 drugs done
230 drugs done
240 drugs done
250 drugs done
260 drugs done
Number of drugs with number of features bigger than 144: 184
Mean number of features in 267 drugs: 131.24719101123594


In [5]:
#############################################################################################################
# Set up constants, feature normalization, predictive algorithm and initialize experiment instance
#############################################################################################################

log = True   # Print per-drug progress and performance reports
counter = 0   # Number of drugs modeled in this run of the loop
sensitivity_metric = "AUC"   # Name of the response column in each drug's input data
record_weights = False   # Does the model have coefficients we want to record?
n_folds = 5   # Number of folds in parameter tuning cross-validation
n_combinations = 20   # Number of parameter combinations to try (when using randomized search)

# Set up seeds for data split and randomized parameter tuning.
# They are defined before the estimator so that the forest itself can be seeded as well.
split_seed = 37
tuning_seed = 53

scaler = preprocessing.StandardScaler()   # Setup transformer for feature scaling

# Setup algorithm to use. The forest is seeded explicitly: the random_state passed to the
# randomized search only controls which parameter combinations are sampled, not the
# bootstrap / feature sampling of the forest, so without it results differ between runs.
estimator = RandomForestRegressor(random_state=tuning_seed)

# Hyperparameter space to search on
# NOTE(review): 'auto', 'mse' and 'mae' are the value names of the scikit-learn release this
# notebook was written for; newer releases renamed them ('squared_error', 'absolute_error')
# and dropped max_features='auto' - adjust these lists when upgrading scikit-learn.
# Number of trees in random forest
n_estimators = [10, 20, 50, 100, 200, 500]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree (10, 20, ..., 110, plus None for unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node (2, 13, ..., 101)
min_samples_split = [int(x) for x in np.linspace(2, 101, num = 10)]
# Minimum number of samples required at each leaf node (2, 13, ..., 101)
min_samples_leaf = [int(x) for x in np.linspace(2, 101, num = 10)]
# Function used to measure the quality of a split
criterion = ["mse", "mae"]

# Create the param grid (keys are prefixed with the name of the pipeline step they tune)
param_grid = {'estimator__n_estimators': n_estimators,
               'estimator__max_features': max_features,
               'estimator__max_depth': max_depth,
               'estimator__min_samples_split': min_samples_split,
               'estimator__min_samples_leaf': min_samples_leaf,
               'estimator__criterion': criterion}

# Function to be optimized during hyperparameter tuning
scoring = "neg_mean_squared_error"   # Function to be optimized by hyperparameters

# Initialize Experiment object for this assay
exp = Experiment(name="Only targets with merck signatures-Random Forest with Z-score normalization",
                algorithm="RandomForestRegressor",
                parameter_search_type="RandomizedSearch",
                data_normalization_type="StandardScaler on all features",
                kfolds=n_folds,
                split_seed=split_seed,
                tuning_seed=tuning_seed)

# Load previous interrupted Experiment (if one was saved under this Experiment's name)
load = True
if load:
    filename = exp.name.lower().replace(" ", "_") + ".pkl"
    try:
        # NOTE: unpickling executes arbitrary code - only load result files created by this project
        with open("../Created data/Results/" + filename, "rb") as f:
            exp = dill.load(f)
    except FileNotFoundError:
        # Nothing has been saved yet (e.g. first run): keep the freshly initialized Experiment
        print("No saved Experiment found under", filename, "- starting from scratch")

#############################################################################################################
# Enter the loop over drugs
#############################################################################################################

# Containers for the recorded results, defined once instead of being re-created for every drug
StandardDeviations = collections.namedtuple("StandardDeviations", ["validation", "test", "overall"])
ModelTestScores = collections.namedtuple("ModelTestScores", ["cv_best_score", "test_RMSE",
                                                "test_explained_variance", "test_correlation"])
BestModelWeights = collections.namedtuple("BestModelWeights", ["intercept", "weights_array"])
ModelTrainingScores = collections.namedtuple("ModelTrainingScores", ["training_RMSE", "training_correlation"])
DummyScores = collections.namedtuple("DummyScores", ["cv_RMSE", "test_RMSE", "test_explained_variance",
                                        "test_correlation"])

min_features = 144   # Same value as the feat_threshold used when the input data was created
test_size = 0.3    # Fraction of data spent on test set

for ide in drugs:
    drug = drugs[ide]   # Current Drug object
    drug_key = (drug.name, ide)   # Key under which all results of this drug are stored
    
    if drug_key in exp.training_scores:
        continue   # If drug was already modeled in this Experiment, skip it
        
    data = drug.full_data  # Extract input data (should be previously computed)
    if data.shape[0] == 0:   # Check if data exists, if not, skip the drug
        continue
    if data.shape[1] < min_features:    # That means that data has only features related to tissue
        continue                        # so also skip this case
        
    # Add input data to corresponding Experiment objects's field (dictionary)
    exp.input_data[drug_key] = data
    if log:
        print(drug.name, data.shape)
        
    # Add data shapes to corresponding field in Experiment (dictionary)
    exp.data_shapes[drug_key] = data.shape
    
    y = data[sensitivity_metric]   # Extract response variable
    X = data.drop(["cell_line_id", sensitivity_metric], axis=1)     # Drop targets and cell line IDs
    assert X.shape[0] == y.shape[0]   # Sanity check
    assert X.shape[1] < data.shape[1]
    
    # Split data into training and test set
    # (the "_val" part is what the model is tuned and fitted on)
    X_val, X_test, y_val, y_test = model_selection.train_test_split(X, y, test_size=test_size,
                                                                   random_state=split_seed)
    
    # Record standard deviations of the response in the training part, test part and overall
    stds = StandardDeviations(y_val.std(), y_test.std(), y.std())
    exp.data_stds[drug_key] = stds
    
    # Set up and fit Dummy Regressor
    dummy = DummyRegressor()
    dummy.fit(X_val, y_val)
    # Get dummy predictions on the test set
    dummy_preds = dummy.predict(X_test)
    # Baseline dummy performance: per-fold RMSE (scores are negated MSEs, hence the sign flip)
    baseline_scores = model_selection.cross_val_score(dummy, X_val, y_val, scoring=scoring, cv=n_folds)
    baseline_rmses = [(-x)**0.5 for x in baseline_scores]
    baseline_mean = np.mean(baseline_rmses)
    baseline_std = np.std(baseline_rmses)
    
    # Set elements of the pipeline, i.e. scaler and estimator
    transformer = clone(scaler)
    predictor = clone(estimator)   # Make a copy to ensure that for every drug we get "fresh" predictor
    
    # Create pipeline
    main_pipeline = Pipeline([
        ("scaler", transformer),
        ("estimator", predictor)
    ])    
    
    # Tune hyperparameters with a randomized search over the parameter space
    grid = model_selection.RandomizedSearchCV(main_pipeline, param_distributions=param_grid,
                                             n_iter=n_combinations, scoring=scoring, cv=n_folds,
                                             random_state=tuning_seed)
    
    # Fit the grid
    grid.fit(X_val, y_val)
    best = grid.best_estimator_   # Best model
    pred = grid.predict(X_test)   # Predict on test data with refitted best model
    training_preds = grid.predict(X_val)    # Predict on train data with refitted best model
    
    # Compute every performance measure once; they are used for both reporting and recording
    training_rmse = metrics.mean_squared_error(y_val, training_preds) ** 0.5
    training_correlation = pearsonr(y_val, training_preds)
    test_rmse = metrics.mean_squared_error(y_test, pred) ** 0.5
    test_correlation = pearsonr(y_test, pred)
    dummy_test_rmse = metrics.mean_squared_error(y_test, dummy_preds) ** 0.5
    
    if log:   # Report the results
        # Summarize grid search
        print("Best CV score (RMSE):", (-grid.best_score_)**0.5)
        print("Best parameters:", grid.best_params_)
        print("")
        
        # Print out performance of the best model on training set
        print("RMSE on training data:", training_rmse)
        print("Correlation with the training set:", training_correlation)
        print("")
        
        # Print out the performance of the best model on test set
        print("RMSE on test data:", test_rmse)
        print("Mean absolute error on test data:", metrics.mean_absolute_error(y_test, pred))
        print("Correlation with the test set:", test_correlation)
        print("")
        
        # Print out performance of dummy regressor
        print("Dummy CV RMSE:", baseline_mean)
        print("Dummy test RMSE:", dummy_test_rmse)
        
    # Record results in corresponding Experiment fields, mostly as named tuples
    # Regression performance of the tuned model (cv_best_score stays a negated MSE, as returned by the search)
    model_test_scores = ModelTestScores(grid.best_score_, test_rmse,
                                       metrics.explained_variance_score(y_test, pred), test_correlation)
    exp.best_scores[drug_key] = model_test_scores
    
    # Full results of cross-validation tuning
    exp.cv_results[drug_key] = grid.cv_results_
    
    # Parameters of best found model
    exp.best_parameters[drug_key] = grid.best_params_
    
    # Coefficients (if model has them)
    if record_weights:
        best_model_weights = BestModelWeights(best.named_steps["estimator"].intercept_,
                                         best.named_steps["estimator"].coef_)
        exp.coefficients[drug_key] = best_model_weights
    
    # Performance of dummy model
    dummy_performance = DummyScores(baseline_mean, dummy_test_rmse,
                                   metrics.explained_variance_score(y_test, dummy_preds),
                                   pearsonr(y_test, dummy_preds))
    exp.dummy_scores[drug_key] = dummy_performance
    
    # Performance on the training set. Recorded last on purpose: presence of the drug in
    # exp.training_scores is what marks it as "already modeled" at the top of the loop,
    # so it must only appear there once every other result has been stored.
    model_training_scores = ModelTrainingScores(training_rmse, training_correlation)
    exp.training_scores[drug_key] = model_training_scores
    
    # Reset drug data fields in order to save memory. This happens only after the drug was
    # fully modeled; resetting before the (long) fit would leave full_data empty for a drug
    # whose fit got interrupted, and re-running this cell would then fail on it.
    drug.full_data = None
    drug.merck_signatures = None
    drug.tissue_data = None
    
    # Increment counter and display summary information
    counter += 1
    if log:
        print(counter, "drugs done")
        print("*" * 50)
        print("")

GSK429286A (897, 146)
Best CV score (RMSE): 0.07043033950341852
Best parameters: {'estimator__n_estimators': 100, 'estimator__min_samples_split': 35, 'estimator__min_samples_leaf': 35, 'estimator__max_features': 'auto', 'estimator__max_depth': None, 'estimator__criterion': 'mse'}

RMSE on training data: 0.06552450004721036
Correlation with the training set: (0.4939239774908348, 7.130605687966033e-40)

RMSE on test data: 0.05415550795038702
Mean absolute error on test data: 0.03289751524577331
Correlation with the test set: (0.3248169124856245, 4.725995424344891e-08)

Dummy CV RMSE: 0.07265307325849626
Dummy test RMSE: 0.0572091068889053
1 drugs done
**************************************************

QL-XII-47 (920, 145)
Best CV score (RMSE): 0.17825949331626706
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 68, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'auto', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on trai

Best CV score (RMSE): 0.2516740070959647
Best parameters: {'estimator__n_estimators': 50, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 2, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 100, 'estimator__criterion': 'mse'}

RMSE on training data: 0.21599954853087822
Correlation with the training set: (0.6820707425829625, 3.478368554561202e-86)

RMSE on test data: 0.25602368009854093
Mean absolute error on test data: 0.19097529014654763
Correlation with the test set: (0.25069401297294475, 3.4202545020481744e-05)

Dummy CV RMSE: 0.2586245593634898
Dummy test RMSE: 0.2644383733127208
13 drugs done
**************************************************

NSC-207895 (889, 145)
Best CV score (RMSE): 0.09253844972044857
Best parameters: {'estimator__n_estimators': 500, 'estimator__min_samples_split': 90, 'estimator__min_samples_leaf': 90, 'estimator__max_features': 'auto', 'estimator__max_depth': 30, 'estimator__criterion': 'mse'}

RMSE on training data: 0.090078507

KeyboardInterrupt: 

Save this Experiment instance

In [6]:
# Derive the pickle's file name from the Experiment's "name" field:
# lower-case it and turn every space into an underscore
filename = "_".join(exp.name.lower().split(" ")) + ".pkl"
print(filename)
# Serialize the whole Experiment instance into the results directory
results_path = "../Created data/Results/" + filename
with open(results_path, "wb") as out_file:
    dill.dump(exp, out_file)

only_targets_with_merck_signatures-random_forest_with_z-score_normalization.pkl
