In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
import sys
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import fp_from_smiles, parse_jazzy_df, HyperparamTuner

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
isozymes = ["3A4", "RLM", "HLC"]
splitters = ["rand", "scaff"]
data_splits = ["train", "test"]
feature_types = ["morgan", "jazzy"]
rel_paths = {
    "morgan_3A4_train_scaff": r"project_resources/base_splits/scaffold_splitter/3A4_train.csv",
    "morgan_3A4_train_rand": r"project_resources/base_splits/random/3A4_train.csv",
    "morgan_RLM_train_scaff": r"project_resources/base_splits/scaffold_splitter/RLM_train.csv",
    "morgan_RLM_train_rand": r"project_resources/base_splits/random/RLM_train.csv",
    "morgan_HLC_train_scaff": r"project_resources/base_splits/scaffold_splitter/HLC_train.csv",
    "morgan_HLC_train_rand": r"project_resources/base_splits/random/HLC_train.csv",

    "morgan_3A4_test_scaff": r"project_resources/base_splits/scaffold_splitter/3A4_test.csv",
    "morgan_3A4_test_rand": r"project_resources/base_splits/random/3A4_test.csv",
    "morgan_RLM_test_scaff": r"project_resources/base_splits/scaffold_splitter/RLM_test.csv",
    "morgan_RLM_test_rand": r"project_resources/base_splits/random/RLM_test.csv",
    "morgan_HLC_test_scaff": r"project_resources/base_splits/scaffold_splitter/HLC_test.csv",
    "morgan_HLC_test_rand": r"project_resources/base_splits/random/HLC_test.csv",

    "jazzy_3A4_train_scaff": r"project_resources/jazzy_splits/scaffold_splitter/3A4_train.csv",
    "jazzy_3A4_train_rand": r"project_resources/jazzy_splits/random/3A4_train.csv",
    "jazzy_RLM_train_scaff": r"project_resources/jazzy_splits/scaffold_splitter/RLM_train.csv",
    "jazzy_RLM_train_rand": r"project_resources/jazzy_splits/random/RLM_train.csv",
    "jazzy_HLC_train_scaff": r"project_resources/jazzy_splits/scaffold_splitter/HLC_train.csv",
    "jazzy_HLC_train_rand": r"project_resources/jazzy_splits/random/HLC_train.csv",

    "jazzy_3A4_test_scaff": r"project_resources/jazzy_splits/scaffold_splitter/3A4_test.csv",
    "jazzy_3A4_test_rand": r"project_resources/jazzy_splits/random/3A4_test.csv",
    "jazzy_RLM_test_scaff": r"project_resources/jazzy_splits/scaffold_splitter/RLM_test.csv",
    "jazzy_RLM_test_rand": r"project_resources/jazzy_splits/random/RLM_test.csv",
    "jazzy_HLC_test_scaff": r"project_resources/jazzy_splits/scaffold_splitter/HLC_test.csv",
    "jazzy_HLC_test_rand": r"project_resources/jazzy_splits/random/HLC_test.csv"
}
models = {}
mol_features = {}
jazzy_halflives = {}
best_model_hyperparams = {}

In [3]:
"""# Sample data
x = np.linspace(0, 10, 100)
y1 = np.sin(x)
y2 = np.cos(x)
y3 = np.sin(x) + np.cos(x)
y4 = np.sin(x) - np.cos(x)

# Create a 2x3 grid of plots
fig, axs = plt.subplots(3, 2, figsize=(10, 10))

# Plot data on each subplot
axs[0, 0].plot(x, y1)
axs[0, 0].set_title('y = sin(x)')

axs[0, 1].plot(x, y2)
axs[0, 1].set_title('y = cos(x)')

axs[1, 0].plot(x, y3)
axs[1, 0].set_title('y = sin(x) + cos(x)')

axs[1, 1].plot(x, y4)
axs[1, 1].set_title('y = sin(x) - cos(x)')

# Adjust layout
plt.tight_layout()
plt.show()"""

"# Sample data\nx = np.linspace(0, 10, 100)\ny1 = np.sin(x)\ny2 = np.cos(x)\ny3 = np.sin(x) + np.cos(x)\ny4 = np.sin(x) - np.cos(x)\n\n# Create a 2x3 grid of plots\nfig, axs = plt.subplots(3, 2, figsize=(10, 10))\n\n# Plot data on each subplot\naxs[0, 0].plot(x, y1)\naxs[0, 0].set_title('y = sin(x)')\n\naxs[0, 1].plot(x, y2)\naxs[0, 1].set_title('y = cos(x)')\n\naxs[1, 0].plot(x, y3)\naxs[1, 0].set_title('y = sin(x) + cos(x)')\n\naxs[1, 1].plot(x, y4)\naxs[1, 1].set_title('y = sin(x) - cos(x)')\n\n# Adjust layout\nplt.tight_layout()\nplt.show()"

In [4]:
# load all models from optuna
# and get the hyperparameters of the best model from each study
# doesn't get the user_attrs={'fit_intercept': True} from linear, need to get manually
for _type in feature_types:
    models[_type] = {}
    best_model_hyperparams[_type] = {}
    for splitter in splitters:
        models[_type][splitter] = {}
        best_model_hyperparams[_type][splitter] = {}
        if splitter == "rand":
            splitter_name = "random"
        else:
            splitter_name = "scaffold_splitter"
        for isozyme in isozymes:
            models[_type][splitter][isozyme] = {}
            best_model_hyperparams[_type][splitter][isozyme] = {}
            for model_identifier in model_identifiers:
                jl = joblib.load(f"project_resources/optuna/{_type}/{splitter_name}/{isozyme}/{model_identifier}.pkl")
                models[_type][splitter][isozyme][model_identifier] = jl
                best_model_hyperparams[_type][splitter][isozyme][model_identifier] = jl.best_trial.params
print(models["morgan"]["rand"]["3A4"]["linear"].best_trial)
print(best_model_hyperparams["morgan"]["rand"]["3A4"]["linear"])

FrozenTrial(number=9, state=TrialState.COMPLETE, values=[1.364301123873657], datetime_start=datetime.datetime(2023, 10, 11, 6, 59, 3, 712704), datetime_complete=datetime.datetime(2023, 10, 11, 6, 59, 4, 616147), params={'alpha': 0.0632399399346352, 'l1_ratio': 0.687379545008854}, user_attrs={'fit_intercept': True}, system_attrs={}, intermediate_values={}, distributions={'alpha': FloatDistribution(high=0.1, log=False, low=1e-05, step=None), 'l1_ratio': FloatDistribution(high=1.0, log=False, low=0.0, step=None)}, trial_id=10, value=None)
{'alpha': 0.0632399399346352, 'l1_ratio': 0.687379545008854}


In [5]:
for model_identifier in model_identifiers:
    print(best_model_hyperparams["morgan"]["rand"]["3A4"][model_identifier])

{'alpha': 0.0632399399346352, 'l1_ratio': 0.687379545008854}
{'alpha': 0.5132621967885778, 'gamma': 6.3123850372562914e-15, 'kernel': 'laplacian'}
{'n_estimators': 20, 'learning_rate': 0.3118949729393795, 'max_depth': 3}
{'n_estimators': 200, 'max_features': 'auto', 'max_depth': 5}
{'learning_rate_init': 0.04680948516148173, 'hidden_layer_sizes': [10, 10, 10]}


In [6]:
# load Jazzy features from csv files
for splitter in splitters:
    print("\n")
    print(splitter)
    mol_features[splitter] = {}
    jazzy_halflives[splitter] = {}
    for isozyme in isozymes:
        mol_features[splitter][isozyme] = {}
        jazzy_halflives[splitter][isozyme] = {}
        for split in data_splits:
            print(isozyme, split)
            df = pd.read_csv(rel_paths[f"jazzy_{isozyme}_{split}_{splitter}"])
            features, thalfs, contains_nan = parse_jazzy_df(df)
            mol_features[splitter][isozyme][split] = features
            jazzy_halflives[splitter][isozyme][split] = thalfs
            print(f"     {mol_features[splitter][isozyme][split][0]}")
            print(f"     {isozyme} mol_features {split} contains NaN: {contains_nan}")



rand
3A4 train
     56, [63, 'CC(C)(O)c1cc(F)c2c(c1)C(=O)N(Cc1ccc(Cl)cn1)[C@@]2(OCC1(O)CC1)c1ccc(Cl)cc1', 6.0, 13.1753, 1.4177, 11.3044, -7.3559, -125.8906, -119.4889]
     [  13.1753    1.4177   11.3044   -7.3559 -125.8906 -119.4889]
     3A4 mol_features train contains NaN: False
3A4 test
     14, [23, 'Cc1ncsc1-c1ccc([C@H](CC(=O)NCCCCCCNC(=O)COc2c(-c3csc(N4CCOCC4)n3)ccc(F)c2F)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)C2(F)CC2)C(C)(C)C)cc1', 0.3767, 28.4327, 3.274, 23.4872, -4.0348, -286.1545, -257.0326]
     [  28.4327    3.274    23.4872   -4.0348 -286.1545 -257.0326]
     3A4 mol_features test contains NaN: False
RLM train
     removed index 1047 corresponding to NaN
     removed index 1517 corresponding to NaN
     2022, [158, 'O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12', 30.0, 8.0486, 0.0, 5.2165, -13.5913, -63.7029, -67.7504]
     [  8.0486   0.       5.2165 -13.5913 -63.7029 -67.7504]
     RLM mol_features train contains NaN: False
RLM test
     removed index 174 corresponding to Na

In [8]:
# test models with the best hyperparams
for _type in feature_types:
    for splitter in splitters:
        for isozyme in isozymes:
            print(_type, splitter, isozyme)
            for model_identifier in model_identifiers:
                hyperparams = best_model_hyperparams[_type][splitter][isozyme][model_identifier]
                if _type == "morgan":
                    train_df = pd.read_csv(rel_paths[f"{_type}_{isozyme}_train_{splitter}"])
                    test_df = pd.read_csv(rel_paths[f"{_type}_{isozyme}_test_{splitter}"])
                    train_smiles = train_df["smiles"]
                    test_smiles = test_df["smiles"]
                    X_train = np.array(fp_from_smiles(train_smiles))
                    X_test = np.array(fp_from_smiles(test_smiles))
                    y_train = np.log(train_df["half-life"])
                    y_test = np.log(test_df["half-life"])

                else:
                    X_train = mol_features[splitter][isozyme]["train"]
                    X_test = mol_features[splitter][isozyme]["test"]
                    y_train = np.log(jazzy_halflives[splitter][isozyme]["train"])
                    y_test = np.log(jazzy_halflives[splitter][isozyme]["test"])

                if model_identifier == 'linear':
                    alpha = hyperparams["alpha"]
                    l1_ratio = hyperparams["l1_ratio"]
                    reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

                if model_identifier == 'KRR':
                    alpha = hyperparams["alpha"]
                    gamma = hyperparams["gamma"]
                    kernel = hyperparams["kernel"]
                    reg = KernelRidge(alpha=alpha, gamma=gamma, kernel=kernel)

                if model_identifier == 'GB':
                    n_estimators = hyperparams["n_estimators"]
                    learning_rate = hyperparams["learning_rate"]
                    max_depth = hyperparams["max_depth"]
                    reg = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

                if model_identifier == 'RF':
                    n_estimators = hyperparams["n_estimators"]
                    max_features = hyperparams["max_features"]
                    max_depth = hyperparams["max_depth"]
                    reg = RandomForestRegressor(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)

                if model_identifier == 'ANN':
                    learning_rate_init = hyperparams["learning_rate_init"]
                    hidden_layer_sizes = hyperparams["hidden_layer_sizes"]
                    reg = MLPRegressor(learning_rate_init=learning_rate_init, hidden_layer_sizes=hidden_layer_sizes)

                # create an instance of HyperparamTuner without specifying any model_identifier
                tuner = HyperparamTuner("foo", X_train, y_train, X_test, y_test)
                # and use the train_test_return function with return_predictions to get the rmsd values
                # predicted y_test
                rmsd, y_test_predictions = tuner.train_test_return("foo", reg, return_predictions=True)
                print(rmsd, y_test_predictions[:5])

morgan rand 3A4
1.2857487557428078 [-1.4694224  -1.03506028 -0.89106826 -0.17932767 -2.78990403]
1.4114348872195646 [-1.20372842 -1.20372842 -1.20372842 -1.20372842 -1.20372842]
1.3328394398136714 [-2.47375561 -0.14097279 -0.18787205 -0.74255475 -3.71163305]
1.169138802272098 [-1.6126433  -0.87670131 -0.8715948  -0.75624362 -2.74935922]
1.4755620853946738 [-3.36725571  0.92039323  1.24223608 -0.91297179 -4.31224981]
morgan rand RLM
1.0018893254892416 [1.09082531 1.2159848  2.40592875 2.03301511 2.06940661]
1.0979503618083273 [2.233246 2.233246 2.233246 2.233246 2.233246]
1.011215667114536 [1.33854168 1.15073886 2.35780118 2.10679538 1.85293801]
0.9834522691097085 [1.13572536 1.31865999 2.69765453 2.27306527 2.35845746]
1.0619800558703765 [1.17325291 1.29438205 2.4971573  2.10547866 2.39452888]
morgan rand HLC
0.5806007919976338 [4.17953245 4.57121123 3.45158094 4.03459622 4.7013859 ]
0.6327266265591401 [4.40033182 4.40033182 4.40033182 4.40033182 4.40033182]
0.7068910806664718 [4.75615