In [4]:
import pandas as pd
import numpy as np
import os
import gc
import matplotlib.pyplot as plt

_FOLDER = "database/"
_FOLDER_2 = "figures/"
_FOLDER_3 = "results/"
SAVE_FIGURES = False

from functions.filtering import *
from functions.fitting import *
from pubchem_processing import *
from merging import *

In [5]:
functions = [
             "fsigmoid", 
             "sigmoid_2_param",
             "sigmoid_3_param",
             "sigmoid_4_param",
             "logistic_4_param",
            "ll4_4_param",
            "ll4R_4_param",
            "logLogist_3_param"]

load_drug_properties = True

In [6]:
drug_curves = pd.read_csv(_FOLDER+"normalised_dose_response_data.csv").drop("Unnamed: 0", axis=1)
conc_columns= ["fd_num_"+str(i) for i in range(10)]
response_norm = ['norm_cells_'+str(i) for i in range(10)]
cell_features = pd.read_csv(_FOLDER +"Cell_Line_Features_PANCAN_simple_MOBEM.tsv", sep="\t")

In [7]:
drug_curves.shape[0] == drug_curves["DRUGID_COSMICID"].nunique()

True

In [8]:
if load_drug_properties:
    drug_features = pd.read_csv(_FOLDER+"drug_features_pubchem.csv")
else:
    # ~ 3 mins
    drugs_1 = pd.read_csv(_FOLDER+'drugs_gdsc1.csv')
    drug_features = pd.read_csv(_FOLDER + "Drug_Features.csv")
    drug_features = PreprocessDrugs(drug_features, drugs_1, save_features_names =True, _FOLDER_to_save = _FOLDER_3)
    drug_features.reset_index(inplace=True)
    drug_features.to_csv(_FOLDER+"drug_features_pubchem.csv", index=False)
# Total number of drugs: 250
# Number of not found drugs: 17
# Elements in drugs: 11 ['Pt', 'I', 'H', 'B', 'P', 'Br', 'O', 'S', 'N', 'Cl', 'F']
# Number of targets: 213
# Number of unique pathways: 23

FileNotFoundError: [Errno 2] No such file or directory: 'database/drug_features_pubchem.csv'

## as in MSc project

In [None]:
fitting_function = "sigmoid_4_param"
filtration_name = "filt_123_04"

df = drug_curves.copy()

df = FilteringSigmoidCurves(drug_curves, filtering_scenario=[1,2,3], \
                        response_columns = response_norm, \
                        first_points_lower_limit = 0.8, last_points_upper_limit = 0.4)

df = ComputeFittingFunction(df, fitting_function, conc_columns, response_norm)

merged_df = MergeDrugCells(df, cell_features, drug_features, 
                               splitting_needed= True,
                               param_col_name = fitting_function,
#                                save_CCL_properties = True, _FOLDER_to_save = _FOLDER_3
                              )

#merged_df[["param_"+str(i) for i in range(1,5)]+["sigmoid_4_param"]].head()

train, test, test2 = SplitTrainTestFor10Drugs(merged_df, train_ratio = 0.8)
train.dropna(axis=0, inplace=True)
test.dropna(axis=0, inplace=True)
test2.dropna(axis=0, inplace=True)

new_folder = _FOLDER + filtration_name
if filtration_name not in os.listdir(_FOLDER):
    os.makedirs(new_folder)
train.to_csv(new_folder+"/train.csv")
test.to_csv(new_folder+"/test.csv")
test2.to_csv(new_folder+"/test2.csv")
train.shape[0], test.shape[0], test2.shape[0]

In [None]:
fitting_function = "sigmoid_4_param"
filtration_name = "filt_123_04_r2_09"

df = drug_curves.copy()

df = FilteringSigmoidCurves(drug_curves, filtering_scenario=[1,2,3], \
                        response_columns = response_norm, \
                        first_points_lower_limit = 0.8, last_points_upper_limit = 0.4)

df = ComputeFittingFunction(df, fitting_function, conc_columns, response_norm)

merged_df = MergeDrugCells(df, cell_features, drug_features, 
                               splitting_needed= True,
                               param_col_name = fitting_function,
#                                save_CCL_properties = True, _FOLDER_to_save = _FOLDER_3
                              )
merged_df = merged_df[merged_df["sigmoid_4_param_r2"]>0.9]
#merged_df[["param_"+str(i) for i in range(1,5)]+["sigmoid_4_param"]].head()

train, test, test2 = SplitTrainTestFor10Drugs(merged_df, train_ratio = 0.8)
train.dropna(axis=0, inplace=True)
test.dropna(axis=0, inplace=True)
test2.dropna(axis=0, inplace=True)

new_folder = _FOLDER + filtration_name
if filtration_name not in os.listdir(_FOLDER):
    os.makedirs(new_folder)
train.to_csv(new_folder+"/train.csv")
test.to_csv(new_folder+"/test.csv")
test2.to_csv(new_folder+"/test2.csv")
train.shape[0], test.shape[0], test2.shape[0]

## Additional 4th stage

In [None]:
fitting_function = "sigmoid_4_param"
filtration_name = "filt_1234_04"

df = drug_curves.copy()

df = FilteringSigmoidCurves(drug_curves, filtering_scenario = [1,2,3,4], \
                        response_columns = response_norm, \
                        first_points_lower_limit = 0.8, last_points_upper_limit = 0.4)

df = ComputeFittingFunction(df, fitting_function, conc_columns, response_norm)

merged_df = MergeDrugCells(df, cell_features, drug_features, 
                               splitting_needed= True,
                               param_col_name = fitting_function)


train, test, test2 = SplitTrainTestFor10Drugs(merged_df, train_ratio = 0.8)
new_folder = _FOLDER + filtration_name
if filtration_name not in os.listdir(_FOLDER):
    os.makedirs(new_folder)
train.to_csv(new_folder+"/train.csv")
test.to_csv(new_folder+"/test.csv")
test2.to_csv(new_folder+"/test2.csv")
train.shape[0], test.shape[0], test2.shape[0]

## No filtering - only fitting

In [None]:
%%time

df = drug_curves.copy()

fitting_function = "sigmoid_4_param"
filtration_name = "no_filt"

df = ComputeFittingFunction(df, fitting_function, conc_columns, response_norm)

merged_df = MergeDrugCells(df, cell_features, drug_features, 
                               splitting_needed= True,
                               param_col_name = fitting_function)

merged_df = merged_df[merged_df["sigmoid_4_param_r2"]>0.9]

train, test, test2 = SplitTrainTestFor10Drugs(merged_df, train_ratio = 0.8)
new_folder = _FOLDER + filtration_name
if filtration_name not in os.listdir(_FOLDER):
    os.makedirs(new_folder)
train.to_csv(new_folder+"/train.csv")
test.to_csv(new_folder+"/test.csv")
test2.to_csv(new_folder+"/test2.csv")
print(train.shape[0], test.shape[0], test2.shape[0])

## AUC -filtering

In [None]:
df = pd.read_csv("results/filt_auc.csv")

fitting_function = "sigmoid_4_param"
filtration_name = "auc_filt"

df = ComputeFittingFunction(df, fitting_function, conc_columns, response_norm)

merged_df = MergeDrugCells(df, cell_features, drug_features, 
                               splitting_needed= True,
                               param_col_name = fitting_function)


train, test, test2 = SplitTrainTestFor10Drugs(merged_df, train_ratio = 0.8)
new_folder = _FOLDER + filtration_name
if filtration_name not in os.listdir(_FOLDER):
    os.makedirs(new_folder)
train.to_csv(new_folder+"/train.csv")
test.to_csv(new_folder+"/test.csv")
test2.to_csv(new_folder+"/test2.csv")
train.shape[0], test.shape[0], test2.shape[0]

In [None]:
df = pd.read_csv("results/filt_auc_02.csv")

fitting_function = "sigmoid_4_param"
filtration_name = "filt_auc_02"

df = ComputeFittingFunction(df, fitting_function, conc_columns, response_norm)

merged_df = MergeDrugCells(df, cell_features, drug_features, 
                               splitting_needed= True,
                               param_col_name = fitting_function)


train, test, test2 = SplitTrainTestFor10Drugs(merged_df, train_ratio = 0.8)
new_folder = _FOLDER + filtration_name
if filtration_name not in os.listdir(_FOLDER):
    os.makedirs(new_folder)
train.to_csv(new_folder+"/train.csv")
test.to_csv(new_folder+"/test.csv")
test2.to_csv(new_folder+"/test2.csv")
train.shape[0], test.shape[0], test2.shape[0]

## Test sets from GDSC2

In [None]:
drug_curves = pd.read_csv(_FOLDER+"normalised_dose_response_data_GDCS2_EC_conc.csv", sep= "\t")
drug_curves.shape

In [None]:
conc_columns= ["fd_num_"+str(i) for i in range(8)]
response_norm = ['norm_cells_'+str(i) for i in range(8)]
load_drug_properties=False

In [None]:
%%time
if load_drug_properties:
    drug_features = pd.read_csv(_FOLDER+"drug_features_pubchem_gdsc2.csv")
else:
    # ~ 3 mins
    drug_features = pd.read_csv(_FOLDER + "drugs_gdsc2.csv")
    drug_features.columns = ['DRUG_ID', 'Drug_Name', 'Synonyms', 'Target_Pathway', 'Target', 'pubchem_id']
    drug_features = GetPubChemId(drug_features)
    drug_features = PreprocessDrugs(drug_features, drug_features_wih_pubchem_id = True, 
                                    save_features_names =False)
    drug_features.reset_index(inplace=True)
    drug_features = drug_features[drug_features["pubchem_id"]!= "-"]
    drug_features.to_csv(_FOLDER+"drug_features_pubchem_gdsc2.csv", index=False)
    
#columns with drug features can be different!

In [None]:
drug_features2 = pd.read_csv(_FOLDER+"drug_features_pubchem_gdsc2.csv")
drug_features2.shape                                        

In [None]:
drug_features2

In [None]:
fitting_function = "sigmoid_4_param"
filtration_name = "filt_123_04"

df = drug_curves.copy()

df = FilteringSigmoidCurves(drug_curves, filtering_scenario=[1,2,3], \
                        response_columns = response_norm, \
                        first_points_lower_limit = 0.8, last_points_upper_limit = 0.4)

df = ComputeFittingFunction(df, fitting_function, conc_columns, response_norm)

merged_df= MergeDrugCells(df, cell_features, drug_features, 
                               splitting_needed= True,
                               param_col_name = fitting_function,
#                                save_CCL_properties = True, _FOLDER_to_save = _FOLDER_3
                              )
new_folder = _FOLDER + filtration_name
merged_df.to_csv(new_folder+"/test_gdsc2.csv")
merged_df.shape[0]

In [None]:
df

## GDSC2 without fitting

In [None]:
%%time

df = drug_curves.copy()

fitting_function = "sigmoid_4_param"
filtration_name = "no_filt"

df = ComputeFittingFunction(df, fitting_function, conc_columns, response_norm)

merged_df = MergeDrugCells(df, cell_features, drug_features, 
                               splitting_needed= True,
                               param_col_name = fitting_function)

merged_df = merged_df[merged_df["sigmoid_4_param_r2"]>0.9]

new_folder = _FOLDER + filtration_name
merged_df.to_csv(new_folder+"/test_gdsc2.csv")
merged_df.shape[0]

In [None]:
merged_df