In [1]:
import sys
import joblib
import numpy as np
import pandas as pd
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
# Register the project's notebook import hook on sys.meta_path *before* importing
# from project_resources.cytochrome_P450: that module is a Jupyter notebook
# (.ipynb), not a regular .py file, so the import below relies on the hook.
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import load_ml_data

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [4]:
# Feature representations and TDC benchmarks that are iterated over below.
feature_types = ["morgan", "jazzy"]
tdc_benchmarks = ["obach", "microsome", "hepatocyte"]

# Short model names and the matching (uninstantiated) estimator classes.
# The two lists are paired by position, so their order must stay in sync.
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
models = [
    ElasticNet,
    KernelRidge,
    GradientBoostingRegressor,
    RandomForestRegressor,
    MLPRegressor,
]

# Nested containers keyed as [feature_type][benchmark] (and additionally
# [model_id] for the hyperparameters); they are filled in by later cells.
mol_features, halflives, best_model_hyperparams = {}, {}, {}

In [5]:
# Collect one feature matrix and one target vector per (feature type, benchmark).
for _type in feature_types:
    mol_features[_type], halflives[_type] = {}, {}

    for benchmark in tdc_benchmarks:
        train_X, train_y, val_X, val_y, test_X, test_y = load_ml_data(_type, benchmark)

        # The final models are not evaluated again, so they are trained on the
        # train, validation and test splits stacked together (in that order).
        mol_features[_type][benchmark] = np.concatenate([train_X, val_X, test_X])
        halflives[_type][benchmark] = np.concatenate([train_y, val_y, test_y])

Found local copy...
Loading...
Done!
100%|███████████████████████████████████████████████████████████████████████████████| 667/667 [00:00<00:00, 873.65it/s]
Found local copy...
Loading...
Done!
100%|█████████████████████████████████████████████████████████████████████████████| 1102/1102 [00:01<00:00, 973.68it/s]
Found local copy...
Loading...
Done!
100%|█████████████████████████████████████████████████████████████████████████████| 1213/1213 [00:01<00:00, 925.51it/s]


In [6]:
# Load every persisted Optuna study and keep only the parameters of its best
# trial, stored as best_model_hyperparams[feature_type][benchmark][model_id].
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code; the .pkl files are presumably produced locally by this
# project - confirm before loading studies obtained from anywhere else.
for _type in feature_types:
    best_model_hyperparams[_type] = {
        benchmark: {
            model_id: joblib.load(
                f"project_resources/optuna/{_type}/{benchmark}/{model_id}.pkl"
            ).best_trial.params
            for model_id in model_identifiers
        }
        for benchmark in tdc_benchmarks
    }

# Spot-check a single entry.
print(best_model_hyperparams["morgan"]["obach"]["linear"])

{'alpha': 0.10339725884015408, 'l1_ratio': 0.023918865219195163}


In [7]:
# Retrain every model with its best hyperparameters on the full data set and
# persist the fitted estimator with joblib, then verify the saved file.

# Fixed seed so that the stochastic estimators (GB, RF, ANN) produce the same
# fitted model on every "Restart & Run All".
RANDOM_SEED = 42

for _type in feature_types:
    for benchmark in tdc_benchmarks:
        # The data only depends on (feature type, benchmark), so look it up
        # once instead of once per model.
        X = mol_features[_type][benchmark]
        y = halflives[_type][benchmark]

        for model_id, model in zip(model_identifiers, models):
            # model_id ... string, name of the model
            # model ... the class of the actual model, not initialized

            # Copy so that the stored hyperparameter dict is not mutated.
            params = dict(best_model_hyperparams[_type][benchmark][model_id])

            # Seed only estimators that expose `random_state` (KernelRidge does
            # not), and never override a value coming from the Optuna study.
            if "random_state" in model().get_params():
                params.setdefault("random_state", RANDOM_SEED)

            model_init = model(**params)
            model_init.fit(X, y)

            model_file_path = f"./project_resources/optuna/{_type}/{benchmark}/{model_id}.joblib"
            joblib.dump(model_init, model_file_path)

            # Round-trip check: the file must load back as the same estimator
            # type and reproduce the in-memory model's predictions on a few
            # rows. (The previous check only caught FileNotFoundError, which
            # cannot happen once joblib.dump has succeeded.)
            try:
                loaded_model = joblib.load(model_file_path)
                round_trip_ok = isinstance(loaded_model, model) and np.allclose(
                    loaded_model.predict(X[:5]), model_init.predict(X[:5])
                )
            except OSError:
                # Covers FileNotFoundError as well as other I/O failures.
                round_trip_ok = False

            if round_trip_ok:
                print(_type, benchmark, model_id, " was successfully created")
            else:
                print(_type, benchmark, model_id, " wasn't created correctly")

morgan obach linear  was successfully created
morgan obach KRR  was successfully created
morgan obach GB  was successfully created
morgan obach RF  was successfully created
morgan obach ANN  was successfully created
morgan microsome linear  was successfully created
morgan microsome KRR  was successfully created
morgan microsome GB  was successfully created
morgan microsome RF  was successfully created
morgan microsome ANN  was successfully created
morgan hepatocyte linear  was successfully created
morgan hepatocyte KRR  was successfully created
morgan hepatocyte GB  was successfully created
morgan hepatocyte RF  was successfully created
morgan hepatocyte ANN  was successfully created
jazzy obach linear  was successfully created
jazzy obach KRR  was successfully created
jazzy obach GB  was successfully created
jazzy obach RF  was successfully created
jazzy obach ANN  was successfully created
jazzy microsome linear  was successfully created
jazzy microsome KRR  was successfully created
j