## Imports

In [None]:
import os
import pickle

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
pd.options.display.width = 0

from tensorflow.keras.layers import (
    Dense,
    Dropout,
)
from tensorflow.keras.models import Sequential

from sklearn.metrics import mean_squared_error, mean_absolute_error

## Grid search models comparison

### Import dumped grids

In [None]:
# os.listdir returns entries in arbitrary order; sort the names so the
# processing order (and every table built from `grids`) is reproducible.
grids = sorted(
    entry for entry in os.listdir('../grids') if 'grid_search' in entry
)
grids

['grid_search_decision_tree_regressor_01.pkl',
 'grid_search_gradient_boosting_01.pkl',
 'grid_search_linear_regressor_01.pkl',
 'grid_search_neural_net_02.pkl',
 'grid_search_random_forest_01.pkl',
 'grid_search_svm_02.pkl']

### Load training data and define the model builder

In [None]:
# Load the scaled training features (X) and targets (y) from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# confirm these arrays really need it and come from a trusted source.
X, y = (
    np.load(array_path, allow_pickle=True)
    for array_path in (
        "../data/X_train_scaled.npy",
        "../data/y_train_scaled.npy",
    )
)

def create_model(n_neurons, dropout_rate, input_shape=(X.shape[1], )):
    """Build and compile a fully connected regression network.

    The network has two ReLU hidden layers of ``n_neurons`` units each,
    optionally followed by dropout, and a single linear output unit. It is
    compiled with the Adam optimizer and mean-squared-error loss.

    NOTE(review): presumably this must stay defined in the notebook so the
    pickled neural-net grid search can be restored -- confirm.

    Args:
        n_neurons: Number of units in each hidden layer.
        dropout_rate: Dropout rate applied after each hidden layer; no
            dropout layers are added unless it is greater than 0.
        input_shape: Shape of one input sample. Defaults to the number of
            columns of the training matrix ``X`` (evaluated once, when the
            function is defined).

    Returns:
        The compiled ``Sequential`` model.
    """
    use_dropout = dropout_rate > 0
    net = Sequential()

    # Only the first hidden layer declares the input shape.
    for extra_kwargs in ({"input_shape": input_shape}, {}):
        net.add(Dense(n_neurons, activation="relu", **extra_kwargs))
        if use_dropout:
            net.add(Dropout(dropout_rate))

    net.add(Dense(1, activation="linear"))
    net.compile(optimizer="adam", loss="mse")
    return net

### Create comparison DataFrame

In [None]:
# Collect one frame of cross-validation results per dumped grid search and
# concatenate them once at the end. DataFrame.append was removed in pandas 2.0,
# and growing a frame inside a loop re-copies it on every iteration.
cv_frames = []

for grid in grids:
    # "grid_search_<model_type>_<run>.pkl" -> "<model_type>": start after the
    # first underscore at or beyond index 8, stop at the last underscore.
    model_type = grid[grid.index("_", 8)+1:grid.rindex("_")]
    # NOTE(review): pickle.load can execute arbitrary code; only load grid
    # dumps that were produced by this project.
    with open(f"../grids/{grid}", mode="rb") as f:
        pkl = pickle.load(f)
    # .assign returns a new frame, which avoids writing a column into a slice.
    cv_frames.append(
        pd.DataFrame.from_dict(pkl.cv_results_)[
            ["params", "mean_test_score", "std_test_score"]
        ].assign(model_type=model_type)
    )

# Rank every evaluated configuration, highest mean_test_score first.
df_grid = pd.concat(cv_frames, ignore_index=True).sort_values(
    "mean_test_score",
    ascending=False
).reset_index(drop=True)

print("All models")
df_grid

All models


Unnamed: 0,params,mean_test_score,std_test_score,model_type
0,"{'dropout_rate': 0, 'n_neurons': 64}",-0.209054,0.047653,neural_net
1,"{'dropout_rate': 0, 'n_neurons': 32}",-0.229017,0.048251,neural_net
2,"{'dropout_rate': 0.3, 'n_neurons': 64}",-0.261277,0.048857,neural_net
3,"{'dropout_rate': 0.3, 'n_neurons': 32}",-0.286353,0.051733,neural_net
4,"{'bootstrap': True, 'max_depth': 12, 'n_estima...",-0.31739,0.074361,random_forest
5,"{'bootstrap': True, 'max_depth': 12, 'n_estima...",-0.321892,0.076026,random_forest
6,"{'bootstrap': False, 'max_depth': 12, 'n_estim...",-0.344578,0.084087,random_forest
7,"{'bootstrap': False, 'max_depth': 12, 'n_estim...",-0.344621,0.084135,random_forest
8,"{'max_depth': 12, 'splitter': 'best'}",-0.344641,0.084111,decision_tree_regressor
9,{},-0.388943,0.065095,linear_regressor


In [None]:
# df_grid is ordered by descending mean_test_score, so keeping the first row
# per model_type leaves the top-scoring configuration of each model.
print("Best models")
df_grid.drop_duplicates(subset=["model_type"], keep="first")

Best models


Unnamed: 0,params,mean_test_score,std_test_score,model_type
0,"{'dropout_rate': 0, 'n_neurons': 64}",-0.209054,0.047653,neural_net
4,"{'bootstrap': True, 'max_depth': 12, 'n_estima...",-0.31739,0.074361,random_forest
8,"{'max_depth': 12, 'splitter': 'best'}",-0.344641,0.084111,decision_tree_regressor
9,{},-0.388943,0.065095,linear_regressor
11,"{'learning_rate': 0.1, 'n_estimators': 50, 'su...",-0.450435,0.083229,gradient_boosting
20,"{'C': 1, 'gamma': 0.8}",-0.562424,0.025804,svm


In [None]:
# Restore the pickled feature and target scalers; they are used below to
# transform the test set.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("../scalers/X_scaler.pkl", mode="rb") as x_scaler_file:
    X_scaler = pickle.load(x_scaler_file)

with open("../scalers/y_scaler.pkl", mode="rb") as y_scaler_file:
    y_scaler = pickle.load(y_scaler_file)

In [None]:
# Read the test set and transform it with the loaded scalers.
df_test = pd.read_csv("../data/test.csv")

# Features: every column except "smiles" and the "homolumogap" target.
X_test = X_scaler.transform(
    df_test.drop(columns=["smiles", "homolumogap"]).to_numpy()
)
# Target: selected with a list so it stays 2-D for the scaler.
y_test = y_scaler.transform(df_test[["homolumogap"]].to_numpy())

In [None]:
# Score the best estimator of every grid search on the test set. Records are
# collected in a list and the frame is built once at the end, because
# DataFrame.append was removed in pandas 2.0 (and appending dicts to an empty
# frame yields object-typed metric columns).
# Both metrics are computed against y_test, i.e. on the scaled target.
test_records = []

pbar = tqdm(grids)

for grid in pbar:
    # "grid_search_<model_type>_<run>.pkl" -> "<model_type>"
    model_type = grid[grid.index("_", 8)+1:grid.rindex("_")]
    pbar.set_description(f"Predicting with {model_type}...")

    # NOTE(review): pickle.load can execute arbitrary code; only load grid
    # dumps that were produced by this project.
    with open(f"../grids/{grid}", mode="rb") as f:
        pkl = pickle.load(f)
    model = pkl.best_estimator_
    y_hat = model.predict(X_test)
    mse = mean_squared_error(y_test, y_hat)
    mae = mean_absolute_error(y_test, y_hat)
    test_records.append(
        {
            "model_type": model_type,
            "mse": mse,
            "mae": mae,
        }
    )

# Passing the columns explicitly keeps the layout even when no grid was found.
df_test_result = pd.DataFrame(
    test_records,
    columns=["model_type", "mse", "mae"]
)

  0%|          | 0/6 [00:00<?, ?it/s]



In [None]:
# Rank the models by test error: lowest MSE first, ties broken by MAE.
(
    df_test_result
    .sort_values(by=["mse", "mae"], ascending=True)
    .reset_index(drop=True)
)

Unnamed: 0,model_type,mse,mae
0,neural_net,0.198305,0.327192
1,random_forest,0.311386,0.418187
2,decision_tree_regressor,0.333308,0.430803
3,linear_regressor,0.385191,0.47756
4,gradient_boosting,0.438995,0.509825
5,svm,0.569116,0.551528
