In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import r2_score
from scipy.optimize import curve_fit
from scipy.optimize import leastsq
import scipy.optimize as opt
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import gc

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import MinMaxScaler
import os

import scipy as sp

# _FOLDER = "/home/acq18mk/master/results/results/"
_FOLDER = "/home/juanjo/Work_Postdoc/my_codes_postdoc/DrugProfiles-master/results_with_NonAffecting_Drugs/"

### Coding Part

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
# #columns to normalise:
# with open(_FOLDER+"columns_to_normalise.txt", 'r') as f:
#     columns_to_normalise = [line.rstrip('\n') for line in f]
# # *****************************************

with open(_FOLDER+"X_features_cancer_cell_lines.txt", 'r') as f:
    X_cancer_cell_lines = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_PubChem_properties.txt", 'r') as f:
    X_PubChem_properties = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_features_Targets.txt", 'r') as f:
    X_targets = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_features_Target_Pathway.txt", 'r') as f:
    X_target_pathway = [line.rstrip('\n') for line in f]
# *****************************************

In [2]:
GDSC_Info = pd.read_csv(_FOLDER+"Cell_list_GDSC.csv")  #Contains info of cancer types for both GDSC1 and GDSC2

In [3]:
df_GDSC1 = GDSC_Info[GDSC_Info["Dataset"]=="GDSC1"] 

In [4]:
df_OneCancer = df_GDSC1[df_GDSC1["Tissue"]=="lung"].reset_index()
#df_OneCancer = df_GDSC1.copy().reset_index()

In [5]:
df_OneCancer["COSMICID"][0]

687596

In [6]:
all_columns = X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway +["MAX_CONC"]

train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

datasets = ["Dataset 1", "Dataset 2", "Dataset 3", "Dataset 4"]

X_feat_dict = {"Dataset 1": X_cancer_cell_lines ,
               "Dataset 2": ["MAX_CONC"] + X_targets + X_target_pathway + X_cancer_cell_lines ,
               "Dataset 3": ["MAX_CONC"] + X_PubChem_properties +  X_cancer_cell_lines,
               "Dataset 4": ["MAX_CONC"] + X_PubChem_properties +  X_targets + X_target_pathway + X_cancer_cell_lines}

### Coefficient_1

train_drug = train_df_50.copy()
test_drug = test_df_50.copy()
  
data_set = "Dataset 4" 
X_columns = X_feat_dict[data_set]

In [7]:
df_train_drug_new = train_drug[train_drug["COSMIC_ID"]==df_OneCancer["COSMICID"][0]]
for i in range(1,df_OneCancer.shape[0]):
    df_aux = train_drug[train_drug["COSMIC_ID"]==df_OneCancer["COSMICID"][i]]
    df_train_drug_new = pd.concat([df_train_drug_new, df_aux])
    
df_test_drug_new = test_drug[test_drug["COSMIC_ID"]==df_OneCancer["COSMICID"][0]]
for i in range(1,df_OneCancer.shape[0]):
    df_aux = test_drug[test_drug["COSMIC_ID"]==df_OneCancer["COSMICID"][i]]
    df_test_drug_new = pd.concat([df_test_drug_new, df_aux])

In [8]:
df_train_drug_new = df_train_drug_new.reset_index()
df_test_drug_new = df_test_drug_new.reset_index()

In [9]:
scaler = MinMaxScaler().fit(df_train_drug_new[X_columns])
Xtrain_drug = scaler.transform(df_train_drug_new[X_columns])
Xtest_drug = scaler.transform(df_test_drug_new[X_columns])

y_train_drug = np.clip(df_train_drug_new["norm_cells_"+str(1)].values[:,None],1.0e-9,1.0)
y_test_drug =  np.clip(df_test_drug_new["norm_cells_"+str(1)].values[:,None],1.0e-9,1.0)
print(y_train_drug.shape)
for i in range(2,10):
    y_train_drug = np.concatenate((y_train_drug,np.clip(df_train_drug_new["norm_cells_"+str(i)].values[:,None],1.0e-9,1.0)),1)
    y_test_drug = np.concatenate((y_test_drug,np.clip(df_test_drug_new["norm_cells_"+str(i)].values[:,None],1.0e-9,1.0)),1)    

(9108, 1)


In [10]:
### Training data for the GP ###
output_dim = y_train_drug.shape[1]
N_per_out = Xtrain_drug.shape[0]
Xall = Xtrain_drug.copy()  #
Yall = y_train_drug.copy()

### Testing data for the GP ###
N_per_out_test = Xtest_drug.shape[0]
Xtest = Xtest_drug.copy()  #
Ytest = y_test_drug.copy()

In [11]:
np.random.seed(1001)
ind_perm = np.random.permutation(np.arange(0, Yall.shape[0]))
perc_train = 0.75
Ntrain = int(Yall.shape[0]*perc_train)
#Nval = Ytrain.shape[0] - Ntrain 
Xval = Xall[ind_perm[Ntrain:],:].copy()
Xtrain = Xall[ind_perm[0:Ntrain],:].copy()
Yval = Yall[ind_perm[Ntrain:],:].copy()
Ytrain = Yall[ind_perm[0:Ntrain],:].copy()
print("Training shape:", Ytrain.shape)
print("Validation shape:", Yval.shape)

Training shape: (6831, 9)
Validation shape: (2277, 9)


In [14]:
path_to_save = "/home/juanjo/Work_Postdoc/Bench_Mark_DrugResponse_Models/Functional-Random-forest-master/"
np.savetxt(path_to_save+"Xtrain.csv", Xtrain, delimiter=",")
np.savetxt(path_to_save+"Ytrain.csv", Ytrain, delimiter=",")
np.savetxt(path_to_save+"Xtest.csv", Xtest, delimiter=",")
np.savetxt(path_to_save+"Ytest.csv", Ytest, delimiter=",")

(2262, 1352)