In [2]:
import sys
import joblib
import numpy as np
import pandas as pd
from tdc.single_pred import ADME
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import retrain_and_save, fp_from_smiles, parse_jazzy_df

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [3]:
# Short model names and the estimator classes they refer to. The two lists are
# index-aligned: they are zipped together when the models are retrained.
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
models = [ElasticNet, KernelRidge, GradientBoostingRegressor, RandomForestRegressor, MLPRegressor]

# Benchmarks and molecular feature sets iterated over by the cells below.
tdc_benchmarks = ["obach", "microsome", "hepatocyte"]
feature_types = ["morgan", "jazzy"]

# Empty containers that the following cells fill in.
tdc_datasets, smiles, halflives, mol_features, best_model_hyperparams = {}, {}, {}, {}, {}

In [4]:
# Load the three TDC ADME datasets and register the result of get_split()
# for each of them in `tdc_datasets` under its benchmark key.
obach = ADME(name='Half_Life_Obach')
tdc_datasets["obach"] = obach_split = obach.get_split()

microsome = ADME(name='Clearance_Microsome_AZ')
tdc_datasets["microsome"] = microsome_split = microsome.get_split()

hepatocyte = ADME(name='Clearance_Hepatocyte_AZ')
tdc_datasets["hepatocyte"] = hepatocyte_split = hepatocyte.get_split()

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


In [5]:
# Collect the SMILES strings and the raw target values ("Y") of every benchmark.
# The targets are stored unscaled; no MinMax scaling is applied in this cell.
# NOTE(review): only the "train" and "test" entries of each split are read here.
# If get_split() also yields a validation subset, it is not used by this cell -
# confirm that this is intended.
halflives["morgan"] = {}
for benchmark in tdc_benchmarks:
    print(benchmark)
    train_split = tdc_datasets[benchmark]["train"]
    test_split = tdc_datasets[benchmark]["test"]

    smiles[benchmark] = {
        "train": np.array(train_split["Drug"]),
        "test": np.array(test_split["Drug"]),
    }
    halflives["morgan"][benchmark] = {
        "train": np.array(train_split["Y"]),
        "test": np.array(test_split["Y"]),
    }

    # Progress output: benchmarks registered so far and the target-column sizes.
    print(halflives["morgan"].keys())
    print(train_split["Y"].shape, test_split["Y"].shape)

obach
dict_keys(['obach'])
(467,) (133,)
microsome
dict_keys(['obach', 'microsome'])
(772,) (220,)
hepatocyte
dict_keys(['obach', 'microsome', 'hepatocyte'])
(849,) (243,)


In [6]:
# Convert the SMILES of every benchmark split with fp_from_smiles and store the
# resulting arrays under mol_features["morgan"][benchmark][split].
mol_features["morgan"] = {}
for benchmark in tdc_benchmarks:
    print(benchmark)
    fps_by_split = mol_features["morgan"][benchmark] = {}
    for split in ("train", "test"):
        fps_by_split[split] = np.array(fp_from_smiles(smiles[benchmark][split]))
    print(fps_by_split["train"].shape, fps_by_split["test"].shape)

obach
(467, 124) (133, 124)
microsome
(772, 124) (220, 124)
hepatocyte
(849, 124) (243, 124)


In [7]:
# Read the pre-computed jazzy train/test CSVs of every benchmark and store the
# features and targets returned by parse_jazzy_df.
mol_features["jazzy"] = {}
halflives["jazzy"] = {}
for benchmark in tdc_benchmarks:
    print(benchmark)
    jazzy_train = pd.read_csv(f"project_resources/jazzy_splits/{benchmark}_train.csv")
    jazzy_test = pd.read_csv(f"project_resources/jazzy_splits/{benchmark}_test.csv")

    # parse_jazzy_df returns three values; the third one is not used in this cell
    # (presumably a flag for NaNs in the parsed data - TODO confirm).
    X_jazzy_train, y_jazzy_train, train_nan_flag = parse_jazzy_df(jazzy_train, no_idx_smi=True)
    X_jazzy_test, y_jazzy_test, test_nan_flag = parse_jazzy_df(jazzy_test, no_idx_smi=True)

    mol_features["jazzy"][benchmark] = {"train": X_jazzy_train, "test": X_jazzy_test}
    halflives["jazzy"][benchmark] = {"train": y_jazzy_train, "test": y_jazzy_test}

    print(*(np.array(part).shape
            for part in (X_jazzy_train, y_jazzy_train, X_jazzy_test, y_jazzy_test)))

obach
     525, [0.0033169983665033, 11.3305, 0.0, 3.3351, -4.054, -69.5826, -60.2942]
     130, [0.0078064793190067, 10.9705, 1.8136, 5.8249, -16.4281, -118.9807, -121.6516]
(525, 6) (525,) (130, 6) (130,)
microsome
     882, [0.0652380952380952, 10.5072, 1.4478, 4.6964, -13.5025, -92.8889, -106.3914]
     220, [0.0884353741496598, 3.5084, 3.5128, 4.5042, -12.6756, -80.3865, -84.2257]
(882, 6) (882,) (220, 6) (220,)
hepatocyte
     970, [0.0, 9.8552, 1.4451, 4.4407, -15.1209, -91.0733, -102.926]
     243, [0.0825850340136054, 10.2098, 1.292, 5.2199, -18.4498, -95.9609, -98.199]
(970, 6) (970,) (243, 6) (243,)


In [8]:
# Load the saved optuna study of every (feature type, benchmark, model)
# combination and keep only the parameters of its best trial.
# NOTE: joblib.load unpickles the file, so only load study files from a trusted source.
for _type in feature_types:
    best_model_hyperparams[_type] = {}
    for benchmark in tdc_benchmarks:
        params_by_model = best_model_hyperparams[_type][benchmark] = {}
        for model_id in model_identifiers:
            study = joblib.load(f"project_resources/optuna/{_type}/{benchmark}/{model_id}.pkl")
            params_by_model[model_id] = study.best_trial.params
print(best_model_hyperparams["morgan"]["obach"]["linear"])

{'alpha': 0.042384434382086285, 'l1_ratio': 0.7398025962461235}


In [9]:
# Display the best-trial hyperparameters stored for the "KRR" model on the
# obach benchmark with morgan features.
best_model_hyperparams["morgan"]["obach"]["KRR"]

{'alpha': 0.5600102049865601,
 'gamma': 2.5021513631563733e-15,
 'kernel': 'laplacian'}

In [8]:
# Retrain every model with its best hyperparameters and persist the fitted
# estimator with joblib next to the optuna study it was tuned in.
# NOTE(review): the estimators are built only from the optuna parameters; unless
# those include a random_state, the stochastic models are not seeded and a rerun
# may produce different fitted models - confirm whether that is acceptable.

# zip() silently stops at the shorter list, so make a mismatch loud instead.
assert len(model_identifiers) == len(models), "model_identifiers and models must be index-aligned"

for _type in feature_types:
    for benchmark in tdc_benchmarks:
        for model_id, model in zip(model_identifiers, models):
            # model_id ... string, name of the model
            # model ... the class of the actual model, not initialized

            # train the model on both the train and test datasets since the model won't be tested
            X_test = mol_features[_type][benchmark]["test"]
            y_test = halflives[_type][benchmark]["test"]
            X_train = np.concatenate((mol_features[_type][benchmark]["train"], X_test))
            y_train = np.concatenate((halflives[_type][benchmark]["train"], y_test))

            params = best_model_hyperparams[_type][benchmark][model_id]
            model_init = model(**params)
            model_init.fit(X_train, y_train)

            model_file_path = f"./project_resources/optuna/{_type}/{benchmark}/{model_id}.joblib"
            joblib.dump(model_init, model_file_path)

            # Verify the save with a real round trip: reload the file and compare
            # predictions on a few rows, rather than only checking that the file
            # can be opened.
            try:
                loaded_model = joblib.load(model_file_path)
            except FileNotFoundError:
                round_trip_ok = False
            else:
                probe = X_train[:5]
                round_trip_ok = np.allclose(
                    loaded_model.predict(probe),
                    model_init.predict(probe),
                    equal_nan=True,
                )

            if round_trip_ok:
                print(_type, benchmark, model_id, " was successfully created")
            else:
                print(_type, benchmark, model_id, " wasn't created correctly")

morgan obach linear  was successfully created
morgan obach KRR  was successfully created
morgan obach GB  was successfully created
morgan obach RF  was successfully created
morgan obach ANN  was successfully created
morgan microsome linear  was successfully created
morgan microsome KRR  was successfully created
morgan microsome GB  was successfully created
morgan microsome RF  was successfully created
morgan microsome ANN  was successfully created
morgan hepatocyte linear  was successfully created
morgan hepatocyte KRR  was successfully created
morgan hepatocyte GB  was successfully created
morgan hepatocyte RF  was successfully created
morgan hepatocyte ANN  was successfully created
jazzy obach linear  was successfully created
jazzy obach KRR  was successfully created
jazzy obach GB  was successfully created
jazzy obach RF  was successfully created
jazzy obach ANN  was successfully created
jazzy microsome linear  was successfully created
jazzy microsome KRR  was successfully created
j