In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
import sys
import plotly.subplots as sp
import plotly.graph_objects as go
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import fp_from_smiles, parse_jazzy_df, HyperparamTuner, tanimoto

In [23]:
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
isozymes = ["3A4", "RLM", "HLC"]
splitters = ["rand", "scaff", "time"]
data_splits = ["train", "test"]
feature_types = ["morgan", "jazzy"]
rel_paths = {
    "morgan_3A4_train_scaff": r"project_resources/base_splits/scaffold_splitter/3A4_train.csv",
    "morgan_3A4_train_rand": r"project_resources/base_splits/random/3A4_train.csv",
    "morgan_3A4_train_time": r"project_resources/base_splits/time_split/3A4_train.csv",
    "morgan_RLM_train_scaff": r"project_resources/base_splits/scaffold_splitter/RLM_train.csv",
    "morgan_RLM_train_rand": r"project_resources/base_splits/random/RLM_train.csv",
    "morgan_RLM_train_time": r"project_resources/base_splits/time_split/RLM_train.csv",
    "morgan_HLC_train_scaff": r"project_resources/base_splits/scaffold_splitter/HLC_train.csv",
    "morgan_HLC_train_rand": r"project_resources/base_splits/random/HLC_train.csv",
    "morgan_HLC_train_time": r"project_resources/base_splits/time_split/HLC_train.csv",

    "morgan_3A4_test_scaff": r"project_resources/base_splits/scaffold_splitter/3A4_test.csv",
    "morgan_3A4_test_rand": r"project_resources/base_splits/random/3A4_test.csv",
    "morgan_3A4_test_time": r"project_resources/base_splits/time_split/3A4_test.csv",
    "morgan_RLM_test_scaff": r"project_resources/base_splits/scaffold_splitter/RLM_test.csv",
    "morgan_RLM_test_rand": r"project_resources/base_splits/random/RLM_test.csv",
    "morgan_RLM_test_time": r"project_resources/base_splits/time_split/RLM_test.csv",
    "morgan_HLC_test_scaff": r"project_resources/base_splits/scaffold_splitter/HLC_test.csv",
    "morgan_HLC_test_rand": r"project_resources/base_splits/random/HLC_test.csv",
    "morgan_HLC_test_time": r"project_resources/base_splits/time_split/HLC_test.csv",

    "jazzy_3A4_train_scaff": r"project_resources/jazzy_splits/scaffold_splitter/3A4_train.csv",
    "jazzy_3A4_train_rand": r"project_resources/jazzy_splits/random/3A4_train.csv",
    "jazzy_3A4_train_time": r"project_resources/jazzy_splits/time_split/3A4_train.csv",
    "jazzy_RLM_train_scaff": r"project_resources/jazzy_splits/scaffold_splitter/RLM_train.csv",
    "jazzy_RLM_train_rand": r"project_resources/jazzy_splits/random/RLM_train.csv",
    "jazzy_RLM_train_time": r"project_resources/jazzy_splits/time_split/RLM_train.csv",
    "jazzy_HLC_train_scaff": r"project_resources/jazzy_splits/scaffold_splitter/HLC_train.csv",
    "jazzy_HLC_train_rand": r"project_resources/jazzy_splits/random/HLC_train.csv",
    "jazzy_HLC_train_time": r"project_resources/jazzy_splits/time_split/HLC_train.csv",

    "jazzy_3A4_test_scaff": r"project_resources/jazzy_splits/scaffold_splitter/3A4_test.csv",
    "jazzy_3A4_test_rand": r"project_resources/jazzy_splits/random/3A4_test.csv",
    "jazzy_3A4_test_time": r"project_resources/jazzy_splits/time_split/3A4_test.csv",
    "jazzy_RLM_test_scaff": r"project_resources/jazzy_splits/scaffold_splitter/RLM_test.csv",
    "jazzy_RLM_test_rand": r"project_resources/jazzy_splits/random/RLM_test.csv",
    "jazzy_RLM_test_time": r"project_resources/jazzy_splits/time_split/RLM_test.csv",
    "jazzy_HLC_test_scaff": r"project_resources/jazzy_splits/scaffold_splitter/HLC_test.csv",
    "jazzy_HLC_test_rand": r"project_resources/jazzy_splits/random/HLC_test.csv",
    "jazzy_HLC_test_time": r"project_resources/jazzy_splits/time_split/HLC_test.csv"
}
smiles = {}
halflives = {}
models = {}
mol_features = {}
best_model_hyperparams = {}
tanimoto_sims = {}
fingerprints = {}
y_predicted = {}
rmsds = {}
stds = {}
best_models = {}

In [26]:
with open("project_resources/optuna/morgan/random/3A4/ANN.pkl", "rb") as f:
    print(f.read())

b'\x80\x04\x95o*\x00\x00\x00\x00\x00\x00\x8c\x12optuna.study.study\x94\x8c\x05Study\x94\x93\x94)\x81\x94}\x94(\x8c\nstudy_name\x94\x8c\x03ANN\x94\x8c\t_study_id\x94K\x00\x8c\x08_storage\x94\x8c optuna.storages._journal.storage\x94\x8c\x0eJournalStorage\x94\x93\x94)\x81\x94}\x94\x8c\x08_backend\x94\x8c\x1doptuna.storages._journal.file\x94\x8c\x12JournalFileStorage\x94\x93\x94)\x81\x94}\x94(\x8c\n_file_path\x94\x8c&./project_resources/optuna/journal.log\x94\x8c\x05_lock\x94h\x0f\x8c\x13JournalFileOpenLock\x94\x93\x94)\x81\x94}\x94(\x8c\n_lock_file\x94\x8c\x12./journal.log.lock\x94\x8c\x11_lock_rename_file\x94\x8c=./journal.log.lock4dced029-670b-464f-a982-ca752f9157e2.rename\x94ub\x8c\x12_log_number_offset\x94}\x94(K\x00K\x00K\x01K{K\x02M\r\x01K\x03M\x9f\x01K\x04M1\x02K\x05M\xc3\x02K\x06MU\x03K\x07M\x8d\x04K\x08M\xc7\x05K\tM\x00\x07K\nMa\x08K\x0bM\x98\tK\x0cM\xd2\nK\rM3\x0cK\x0eM\x94\rK\x0fM\xf5\x0eK\x10MV\x10K\x11M\x17\x11K\x12M\xd9\x11K\x13M\x9a\x12K\x14M[\x13K\x15M\x1c\x14K\x16M\x97\x1

In [25]:
jl = joblib.load("project_resources/optuna/morgan/random/3A4/ANN.pkl")

TypeError: __new__() takes 7 positional arguments but 8 were given

In [24]:
# load all models from optuna
# and get the hyperparameters of the best model from each study
# doesn't get the user_attrs={'fit_intercept': True} from linear, need to get manually
for _type in feature_types:
    models[_type] = {}
    best_model_hyperparams[_type] = {}
    for splitter in splitters:
        models[_type][splitter] = {}
        best_model_hyperparams[_type][splitter] = {}
        if splitter == "rand":
            splitter_name = "random"
        elif splitter == "scaff":
            splitter_name = "scaffold_splitter"
        else:
            splitter_name = "time_split"
        for isozyme in isozymes:
            models[_type][splitter][isozyme] = {}
            best_model_hyperparams[_type][splitter][isozyme] = {}
            for model_id in model_identifiers:
                jl = joblib.load(f"project_resources/optuna/{_type}/{splitter_name}/{isozyme}/{model_id}.pkl")
                models[_type][splitter][isozyme][model_id] = jl
                best_model_hyperparams[_type][splitter][isozyme][model_id] = jl.best_trial.params
print(models["morgan"]["rand"]["3A4"]["linear"].best_trial)
print(best_model_hyperparams["morgan"]["rand"]["3A4"]["linear"])

TypeError: __new__() takes 7 positional arguments but 8 were given

In [None]:
for model_id in model_identifiers:
    print(best_model_hyperparams["morgan"]["rand"]["3A4"][model_id])

In [None]:
# load smiles used for ML with Morgan features
smiles["morgan"] = {}
halflives["morgan"] = {}
for splitter in splitters:
    smiles["morgan"][splitter] = {}
    halflives["morgan"][splitter] = {}
    for isozyme in isozymes:
        smiles["morgan"][splitter][isozyme] = {}
        halflives["morgan"][splitter][isozyme] = {}
        for split in data_splits:
            df = pd.read_csv(rel_paths[f"morgan_{isozyme}_{split}_{splitter}"])
            df_smiles = list(df["smiles"])
            df_halflives = list(df["half-life"])
            smiles["morgan"][splitter][isozyme][split] = df_smiles
            halflives["morgan"][splitter][isozyme][split] = df_halflives
print("successfully loaded smiles for Morgan fingerprints")

In [None]:
# load Jazzy features from csv files and their corresponding smiles
smiles["jazzy"] = {}
halflives["jazzy"] = {}
for splitter in splitters:
    mol_features[splitter] = {}
    smiles["jazzy"][splitter] = {}
    halflives["jazzy"][splitter] = {}
    for isozyme in isozymes:
        mol_features[splitter][isozyme] = {}
        smiles["jazzy"][splitter][isozyme] = {}
        halflives["jazzy"][splitter][isozyme] = {}
        for split in data_splits:
            df = pd.read_csv(rel_paths[f"jazzy_{isozyme}_{split}_{splitter}"])
            jazzy_smiles, features, thalfs, contains_nan = parse_jazzy_df(df)
            smiles["jazzy"][splitter][isozyme][split] = jazzy_smiles
            mol_features[splitter][isozyme][split] = features
            halflives["jazzy"][splitter][isozyme][split] = thalfs
print("successfully loaded Jazzy features and their smiles")

In [None]:
# smiles to Morgan fingerprint
for _type in feature_types:
    fingerprints[_type] = {}  # need to destinguish between Jazzy and Morngan since Jazzy ommits some mols
    for splitter in splitters:
        fingerprints[_type][splitter] = {}
        for isozyme in isozymes:
            fingerprints[_type][splitter][isozyme] = {}
            for data_split in data_splits:
                fps = fp_from_smiles(smiles[_type][splitter][isozyme][data_split])
                fingerprints[_type][splitter][isozyme][data_split] = np.array(fps)
print("successfully generated Morgan fingerprints")

In [None]:
# test models with the best hyperparams
for _type in feature_types:
    y_predicted[_type] = {}
    rmsds[_type] = {}
    stds[_type] = {}
    best_models[_type] = {}
    for splitter in splitters:
        y_predicted[_type][splitter] = {}
        rmsds[_type][splitter] = {}
        stds[_type][splitter] = {}
        best_models[_type][splitter] = {}
        for isozyme in isozymes:
            y_predicted[_type][splitter][isozyme] = {}
            rmsds[_type][splitter][isozyme] = {}
            stds[_type][splitter][isozyme] = {}
            group_rmsds = {}
            print(_type, splitter, isozyme)
            for model_id in model_identifiers:
                hyperparams = best_model_hyperparams[_type][splitter][isozyme][model_id]
                if _type == "morgan":
                    X_train = fingerprints["morgan"][splitter][isozyme]["train"]
                    X_test = fingerprints["morgan"][splitter][isozyme]["test"]
                else:
                    X_train = mol_features[splitter][isozyme]["train"]
                    X_test = mol_features[splitter][isozyme]["test"]
                y_train = np.array(halflives[_type][splitter][isozyme]["train"])
                y_test = np.array(halflives[_type][splitter][isozyme]["test"])

                if model_id == 'linear':
                    alpha = hyperparams["alpha"]
                    l1_ratio = hyperparams["l1_ratio"]
                    reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True)

                if model_id == 'KRR':
                    alpha = hyperparams["alpha"]
                    gamma = hyperparams["gamma"]
                    kernel = hyperparams["kernel"]
                    reg = KernelRidge(alpha=alpha, gamma=gamma, kernel=kernel)

                if model_id == 'GB':
                    n_estimators = hyperparams["n_estimators"]
                    learning_rate = hyperparams["learning_rate"]
                    max_depth = hyperparams["max_depth"]
                    reg = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

                if model_id == 'RF':
                    n_estimators = hyperparams["n_estimators"]
                    max_features = hyperparams["max_features"]
                    max_depth = hyperparams["max_depth"]
                    reg = RandomForestRegressor(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)

                if model_id == 'ANN':
                    learning_rate_init = hyperparams["learning_rate_init"]
                    hidden_layer_sizes = hyperparams["hidden_layer_sizes"]
                    reg = MLPRegressor(learning_rate_init=learning_rate_init, hidden_layer_sizes=hidden_layer_sizes)

                # create an instance of HyperparamTuner without specifying any model_identifier
                tuner = HyperparamTuner("foo", X_train, y_train, X_test, y_test)
                # and use the train_test_return function with return_predictions to get the rmsd values
                # average over multiple runs of the same model
                runs_rmsds = []
                runs_y_test_predictions = []
                runs_stds = []
                for i in range(10):
                    rmsd, y_test_predictions, std = tuner.train_test_return("foo", reg, return_predictions=True)
                    runs_rmsds.append(rmsd)
                    runs_y_test_predictions.append(y_test_predictions)
                    runs_stds.append(std)
                mean_rmsd = np.mean(runs_rmsds, axis=0)
                mean_y_test_predictions = np.mean(runs_y_test_predictions, axis=0)
                mean_stds = np.mean(runs_stds, axis=0)
                group_rmsds[mean_rmsd] = model_id
                rmsds[_type][model_id] = mean_rmsd
                y_predicted[_type][model_id] = mean_y_test_predictions
                stds[_type][model_id] = mean_stds
                print(mean_rmsd, f"y_test predictions: {y_test_predictions[:5]}, {len(y_test_predictions)}")
                print(f"     standard deviations: {mean_stds[:4]}, {len(mean_stds)}")

            # find best model for each dataset and its rmsd
            min_rmsd = min(group_rmsds.keys())
            best_model = group_rmsds[min_rmsd]
            best_models[_type] = (best_model, min_rmsd)
            print(f"best was {best_model} with rmsd {min_rmsd}")

            print("\n")

In [None]:
for _type in feature_types:
    tanimoto_sims[_type] = {}
    for splitter in splitters:
        print(splitter)
        tanimoto_sims[_type][splitter] = {}
        for isozyme in isozymes:
            tanimoto_similarities = tanimoto(fingerprints[_type][splitter][isozyme]["test"], fingerprints[_type][splitter][isozyme]["train"])
            tanimoto_sims[_type][splitter][isozyme] = tanimoto_similarities
            median = np.median(tanimoto_similarities)
            mean = np.mean(tanimoto_similarities)
            print(f"length: {len(tanimoto_similarities)}, median: {median}, arithmetic mean: {mean}, ",
                  tanimoto_sims[_type][splitter][isozyme][:10])

In [None]:
# Create a 20x3 grid of subplots
fig, axs = plt.subplots(6, 3, figsize=(20, 30))

plot_counter = 0
# Loop through each subplot and plot different data
for _type in feature_types:
    for splitter in splitters:
        for isozyme in isozymes:
            model_id = best_models[_type][splitter][isozyme][0]
            plot_x = tanimoto_sims[_type][splitter][isozyme]
            plot_y = y_predicted[_type][splitter][isozyme][model_id]

            # Get the current axis based on the plot counter
            ax = axs[plot_counter // 3, plot_counter % 3]

            # Plot the data and customize the subplot
            ax.scatter(plot_x, plot_y)
            ax.set_title(f'{_type}, {splitter}, {isozyme}, {model_id}')
            ax.set(xlabel='tanimoto similarity', ylabel='prediction error')

            plot_counter += 1

plt.tight_layout()  # Ensure the subplots are properly laid out
plt.show()

In [None]:
_type = "morgan"
subplot_titles = []
# generate plot titles
for splitter in splitters:
    for isozyme in isozymes:
        model_id = best_models[_type][splitter][isozyme][0]
        title = f'{_type}, {splitter}, {isozyme}, {model_id}'
        subplot_titles.append(title)

# Create a 20x3 grid of subplots
fig = sp.make_subplots(rows=3, cols=3, subplot_titles=subplot_titles)

plot_counter = 0
# Loop through each subplot and plot different data

for splitter in splitters:
    for isozyme in isozymes:
        model_id = best_models[_type][splitter][isozyme][0]
        plot_x = halflives[_type][splitter][isozyme]["test"]
        plot_y = y_predicted[_type][splitter][isozyme][model_id]
        std = stds[_type][splitter][isozyme][model_id]

        # Plot the data
        scatter_trace = go.Scatter(
            x=plot_x,
            y=plot_y,
            mode='markers',
            error_y=dict(type='data', array=std, visible=True),
            name=f'{_type}, {splitter}, {isozyme}, {model_id}'
        )

        # Add the diagonal line
        diagonal_trace = go.Scatter(
            x=plot_x,
            y=plot_x,
            mode='lines',
            line=dict(color='orange', dash='dash'),
            showlegend=False
        )

        # Add the traces to the subplot
        fig.add_trace(scatter_trace, row=plot_counter // 3 + 1, col=plot_counter % 3 + 1)
        fig.add_trace(diagonal_trace, row=plot_counter // 3 + 1, col=plot_counter % 3 + 1)

        # Customize the subplot
        fig.update_xaxes(title_text='real natural log half-life', row=plot_counter // 3 + 1, col=plot_counter % 3 + 1)
        fig.update_yaxes(title_text='predicted natural log half-life', row=plot_counter // 3 + 1, col=plot_counter % 3 + 1)

        plot_counter += 1

# Update layout
fig.update_layout(height=1500, title_text="Subplots with Plotly")
fig.show()

In [None]:
_type = "jazzy"
subplot_titles = []
# generate plot titles
for splitter in splitters:
    for isozyme in isozymes:
        model_id = best_models[_type][splitter][isozyme][0]
        title = f'{_type}, {splitter}, {isozyme}, {model_id}'
        subplot_titles.append(title)

# Create a 20x3 grid of subplots
fig = sp.make_subplots(rows=3, cols=3, subplot_titles=subplot_titles)

plot_counter = 0
# Loop through each subplot and plot different data

for splitter in splitters:
    for isozyme in isozymes:
        model_id = best_models[_type][splitter][isozyme][0]
        plot_x = halflives[_type][splitter][isozyme]["test"]
        plot_y = y_predicted[_type][splitter][isozyme][model_id]
        std = stds[_type][splitter][isozyme][model_id]

        # Plot the data
        scatter_trace = go.Scatter(
            x=plot_x,
            y=plot_y,
            mode='markers',
            error_y=dict(type='data', array=std, visible=True),
            name=f'{_type}, {splitter}, {isozyme}, {model_id}'
        )

        # Add the diagonal line
        diagonal_trace = go.Scatter(
            x=plot_x,
            y=plot_x,
            mode='lines',
            line=dict(color='orange', dash='dash'),
            showlegend=False
        )

        # Add the traces to the subplot
        fig.add_trace(scatter_trace, row=plot_counter // 3 + 1, col=plot_counter % 3 + 1)
        fig.add_trace(diagonal_trace, row=plot_counter // 3 + 1, col=plot_counter % 3 + 1)

        # Customize the subplot
        fig.update_xaxes(title_text='real natural log half-life', row=plot_counter // 3 + 1, col=plot_counter % 3 + 1)
        fig.update_yaxes(title_text='predicted natural log half-life', row=plot_counter // 3 + 1, col=plot_counter % 3 + 1)

        plot_counter += 1

# Update layout
fig.update_layout(height=1500, title_text="Subplots with Plotly")
fig.show()

In [None]:
for _type in feature_types:
    bar_width = 0.25

    r1 = np.arange(len(splitters))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]

    grouped_rmsds = {}
    for splitter in splitters:
        grouped_rmsds[splitter] = []
        for isozyme in isozymes:
            grouped_rmsds[splitter].append(best_models[_type][splitter][isozyme][1])

    plt.bar(r1, grouped_rmsds["rand"], width=bar_width, label='Bar 1')
    plt.bar(r2, grouped_rmsds["scaff"], width=bar_width, label='Bar 2')
    plt.bar(r3, grouped_rmsds["time"], width=bar_width, label='Bar 3')

    plt.xlabel('splitters')
    plt.ylabel('rmsd')
    plt.title(f'{_type} root mean square deviations')
    plt.xticks([r + bar_width for r in range(len(splitters))], ["random", "scaffold", "publication date"])
    plt.legend(isozymes)

    plt.show()