In [None]:
%matplotlib inline

# Silence every warning category for the rest of the session. This keeps the
# cell outputs short, but it also hides any warnings emitted by the modelling
# libraries imported below, so re-enable warnings when debugging a fit.
import warnings
warnings.filterwarnings('ignore')

from joblib import load
from ruamel.yaml import YAML
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import h5py

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, GridSearchCV, ShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.gaussian_process import GaussianProcessRegressor, kernels
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.utils import resample

from umda import paths
from umda.data import load_data, load_pipeline
from umda import training

In [None]:
# Single seed reused for every stochastic step below (model random_state,
# the grid-search / fit helpers, the split generator, and resampling).
seed = 1215677
# Forwarded to training.compose_model when the models are built; also picks
# the "norm"/"unnorm" tag in the exported grid-search file names.
normalize = False
# In the cells of this notebook this flag only picks the "_mask"/"_nomask"
# tag of the exported grid-search file names.
mask = False

In [None]:
# Seeded NumPy generator, created from the notebook-wide seed.
state = np.random.default_rng(seed)

# Project data and the embedding pipeline (see umda.data for their contents).
full_X, full_cluster_ids, tmc1_df = load_data()
embedder = load_pipeline()

# Features: vectorize every SMILES string in the table and stack the results
# vertically into a single array.
smiles_vectors = [embedder.vectorize(smiles) for smiles in tmc1_df["SMILES"]]
tmc1_X = np.vstack(smiles_vectors)

# Targets: base-10 logarithm of the tabulated column densities.
column_densities = tmc1_df["Column density (cm^-2)"].to_numpy()
tmc1_y = np.log10(column_densities)

In [None]:
# Read the hyperparameter settings from the YAML file in the working
# directory; the mapping is looked up by model name further down.
yaml_parser = YAML()
with open("model_hparams.yml") as read_file:
    hparams = yaml_parser.load(read_file)

In [None]:
# Gaussian-process covariance, assembled from three additive pieces:
#   (constant * RBF) + (rational quadratic * constant) + constant
# The grouping below mirrors Python's operator precedence (* before +, and +
# evaluated left to right), so the resulting kernel tree is unchanged.
scaled_rbf = kernels.ConstantKernel() * kernels.RBF(
    length_scale=3.0, length_scale_bounds=(1e-1, 10.0)
)
scaled_rational_quadratic = kernels.RationalQuadratic(
    length_scale=200.0,
    alpha=20.0,
    length_scale_bounds=(50.0, 1e4),
    alpha_bounds=(1e-3, 5e2),
) * kernels.ConstantKernel()
gp_kernel = scaled_rbf + scaled_rational_quadratic + kernels.ConstantKernel()

In [None]:
# Regressors with default settings, keyed by the short names that are also
# used to look up hyperparameters and to name output files. Insertion order
# matters: it fixes the order in which models are trained and plotted.
base_models = dict(
    linear_regression=LinearRegression(),
    ridge=Ridge(),
    br=BayesianRidge(),
    svr=SVR(),
    knn=KNeighborsRegressor(),
    rfr=RandomForestRegressor(random_state=seed),
    gbr=GradientBoostingRegressor(random_state=seed),
    gpr=GaussianProcessRegressor(kernel=gp_kernel, random_state=seed),
)

# Pass every regressor through the project's compose_model helper, forwarding
# the notebook-level `normalize` flag.
models = {
    model_name: training.compose_model(regressor, normalize)
    for model_name, regressor in base_models.items()
}

In [None]:
# generalized workflow: for each model, run the grid search, then hand the
# winning parameters to the standardized fit/test helper and keep all results
dataset = (tmc1_X, tmc1_y)
model_results, best_models, cv_results = {}, {}, {}

for name, model in models.items():
    print(f"Working on {name} now.")
    hparam = hparams.get(name)
    cv_grid = training.grid_cv_search(
        dataset,
        model,
        hparam,
        seed,
        verbose=1,
        n_splits=20,
        scoring="neg_mean_squared_error",
    )
    # the score is a negated mean squared error, so values closer to zero are better
    print(f"Model: {name} best CV score: {cv_grid.best_score_:.4e}")
    fit_outcome = training.standardized_fit_test(
        dataset, model, cv_grid.best_params_, seed, n_splits=20
    )
    best_estimator, best_train, best_test, best_performance, best_index, log = fit_outcome
    # file everything under the model's short name for the cells below
    cv_results[name] = cv_grid
    best_models[name] = best_estimator
    model_results[name] = log

In [None]:
# export the cross-validation results

# File-name tag derived from the notebook-level flags, e.g. "unnorm_nomask".
# It is identical for every model, so it is built once up front.
flags = ("norm" if normalize else "unnorm") + ("_mask" if mask else "_nomask")

for name in models:
    grid_table = pd.DataFrame(cv_results[name].cv_results_)
    # keep the mean score, its rank, and every hyperparameter column
    keys = ["mean_test_score", "rank_test_score"]
    keys += [column for column in grid_table.keys() if "param_" in column]
    # best-ranked combination first, with a fresh 0..n-1 index
    ranked_table = (
        grid_table[keys]
        .sort_values("rank_test_score", ascending=True)
        .reset_index(drop=True)
    )
    # dump to CSV file
    ranked_table.to_csv(f"outputs/grid_search/{name}_{flags}.csv", index=False)

In [None]:
# Peek at the field names recorded for the ridge search; these are the
# columns available when the search results are turned into a DataFrame.
cv_results["ridge"].cv_results_.keys()

## Exporting the hyperparameter optimization results

In [None]:
# Gather the winning hyperparameter combination of every model into a single
# mapping and write it out as one YAML document.
best_param_dict = {name: cv_results[name].best_params_ for name in models}

yaml_writer = YAML()
with open("outputs/grid_search/optimized_hparams.yml", "w+") as write_file:
    yaml_writer.dump(best_param_dict, write_file)

## Writing the training reports

In [None]:
# For each model: rank the logged splits (lowest "performance" first, ties
# broken by highest "r2"), save the ranking, and remember the train/test
# indices of the top-ranked split for the overview plot.
best_splits = {}

for name, log in model_results.items():
    report = (
        pd.DataFrame(log)
        .sort_values(["performance", "r2"], ascending=[True, False])
        .reset_index(drop=True)
    )
    report.to_csv(f"outputs/grid_search/{name}_training_report.csv", index=False)
    top_row = report.iloc[0]
    best_splits[name] = (top_row["train_index"], top_row["test_index"])

## Making an overview plot

In [None]:
# Overview figure: one predicted-vs-observed panel per trained model, laid out
# on two rows.
# Size the grid from the models that are actually plotted (not from the number
# of entries in the hyperparameter file), and round the column count up so an
# odd number of models does not silently lose its last panel.
num_models = len(models)
num_columns = max(1, (num_models + 1) // 2)
formatted_names = {key: key.upper() for key in models.keys()}
formatted_names["linear_regression"] = "LR"
formatted_names["ridge"] = "RR"

fig, axarray = plt.subplots(2, num_columns, figsize=(10, 5), sharex=True, sharey=True)
panels = axarray.flatten()

for model_name, ax in zip(models.keys(), panels):
    model = best_models.get(model_name)
    train_split, test_split = best_splits.get(model_name)
    # draw the ideal curve (prediction == observation)
    ax.plot(np.arange(10, 16), np.arange(10, 16), ls="--", alpha=0.4, color="k")
    ax.scatter(tmc1_y[train_split], model.predict(tmc1_X[train_split]), c="#6B9A9B", label="Train", s=10,)
    ax.scatter(tmc1_y[test_split], model.predict(tmc1_X[test_split]), c="#E6AD39", label="Holdout", s=10,)
    # R^2 in the title is computed over the whole data set (train + holdout)
    r2 = r2_score(tmc1_y, model.predict(tmc1_X))
    ax.set(xlim=[10, 15], ylim=(10, 15))
    real_name = formatted_names.get(model_name)
    ax.set_title(f"{real_name} - $R^2$: {r2:1.2f}", loc="left")
    # one legend is enough because all panels share the same colour coding
    if model_name == "linear_regression":
        ax.legend()

# hide any panel left empty when the number of models is odd
for spare_ax in panels[num_models:]:
    spare_ax.set_visible(False)
fig.tight_layout()

## Data importance estimation

In [None]:
def bootstrap_importance_estimation(estimator, data, seed: int, n_splits: int = 500):
    """
    Estimate a relative importance weight for every sample from repeated
    random train/test splits.

    For each of ``n_splits`` ShuffleSplit partitions (20% held out), the
    estimator is refit on the training part and the mean squared error is
    computed on both parts. Each sample then receives, per split, the test
    error of that split if it was in the training set and zero if it was held
    out. The standard deviation of those values across splits, divided by
    the smallest such value, is the sample's weight.

    Parameters
    ----------
    estimator
        Object exposing ``fit(X, y)`` (returning an object with ``predict``).
        It is deep-copied before use, so the instance passed in keeps
        whatever fitted state it had.
    data
        Two-element sequence ``(X, y)`` of arrays that can be indexed with
        integer index arrays.
    seed : int
        ``random_state`` of the split generator.
    n_splits : int, optional
        Number of random partitions, by default 500.

    Returns
    -------
    log : list of dict
        One entry per split with keys ``"train_error"``, ``"test_error"``,
        ``"train_index"`` and ``"test_index"``.
    molecule_weights : numpy.ndarray
        Array of length ``y.size`` holding the normalized weights.
    """
    import copy

    X, y = data
    # Refit a private copy: fitting the caller's object in place would leave
    # it trained on the last random split instead of its original fit.
    working_estimator = copy.deepcopy(estimator)
    splitter = ShuffleSplit(n_splits, test_size=0.2, random_state=seed)
    log = list()
    # 1 where a sample took part in training for that split, 0 where it was held out
    weights = np.ones((n_splits, y.size))
    test_errors = list()
    for split_index, (train_index, test_index) in enumerate(splitter.split(X, y)):
        train_X, test_X, train_y, test_y = X[train_index], X[test_index], y[train_index], y[test_index]
        result = working_estimator.fit(train_X, train_y)
        # compute the mean squared error
        train_error = mean_squared_error(train_y, result.predict(train_X))
        test_error = mean_squared_error(test_y, result.predict(test_X))
        log.append(
            {"train_error": train_error, "test_error": test_error, "train_index": train_index, "test_index": test_index}
        )
        test_errors.append(test_error)
        weights[split_index, test_index] = 0.
    # column vector so each split's test error broadcasts across that split's row
    test_errors = np.asarray(test_errors)[:,None]
    molecule_weights = (weights * test_errors).std(axis=0)
    # normalize so the smallest weight is 1; NOTE: yields inf/nan if the
    # smallest standard deviation is exactly zero
    molecule_weights /= np.min(molecule_weights)
    return log, molecule_weights

In [None]:
# Run the importance estimation with the tuned ridge model over 5000 random
# splits of the full (features, targets) data set.
bootstrap_log, weights = bootstrap_importance_estimation(
    best_models["ridge"],
    (tmc1_X, tmc1_y),
    seed,
    n_splits=5000,
)

In [None]:
from sklearn.utils import resample

In [None]:
# Draw 500 seeded resamples of the (features, targets) pair and display the
# resampled feature array (element 0 of the returned sequence).
resample(tmc1_X, tmc1_y, n_samples=500, random_state=seed)[0]