#### Import required libs

In [None]:
import sys
sys.path.append('/home/jupyter/app')

import pickle

import numpy as np

from sklearn.preprocessing import StandardScaler

from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from src.io_utils import load_dataframe

#### Define constants

In [None]:
# gs:// location passed to load_dataframe as the directory of the feature CSV files.
GS_DIR_FEATURES = "gs://pcqm4mv2/data/features"

#### Load datasets as pandas DataFrames

In [None]:
# Load the training split. load_dataframe is a project helper (src.io_utils);
# presumably it reads "train.csv" from GS_DIR_FEATURES into a pandas DataFrame
# -- confirm against its implementation.
df_train = load_dataframe("train.csv", GS_DIR_FEATURES)

print("df_train shape:", df_train.shape)
# Last expression of the cell, so the notebook renders the first rows.
df_train.head()

In [None]:
# Load the validation split. load_dataframe is a project helper (src.io_utils);
# presumably it reads "valid.csv" from GS_DIR_FEATURES into a pandas DataFrame
# -- confirm against its implementation.
df_val = load_dataframe("valid.csv", GS_DIR_FEATURES)

print("df_val shape:", df_val.shape)
# Last expression of the cell, so the notebook renders the first rows.
df_val.head()

#### Replace NaN with 0

In [None]:
# Replace every missing value with 0 in both splits; fillna returns new frames,
# which are rebound to the original names.
df_train, df_val = df_train.fillna(0), df_val.fillna(0)

In [None]:
# Sanity check: count the NaN values remaining in each split (summed over all
# columns). Both counts are expected to be 0 once the frames have been filled.
print("Number of NaN in df_train:", df_train.isna().sum().sum())
print("Number of NaN in df_val:", df_val.isna().sum().sum())

#### Create X and y objects to train

In [None]:
# Features: every column except "smiles" and the "homolumogap" target.
X_train = df_train.drop(columns=["smiles", "homolumogap"]).to_numpy()
print("X_train shape:", X_train.shape)
print(X_train)

# Target: selected with a list of columns so the resulting array stays 2-D.
y_train = df_train[["homolumogap"]].to_numpy()
print("y_train shape:", y_train.shape)
print(y_train)

In [None]:
# Features: every column except "smiles" and the "homolumogap" target.
X_val = df_val.drop(columns=["smiles", "homolumogap"]).to_numpy()
print("X_val shape:", X_val.shape)
print(X_val)

# Target: selected with a list of columns so the resulting array stays 2-D.
y_val = df_val[["homolumogap"]].to_numpy()
print("y_val shape:", y_val.shape)
print(y_val)

#### Scale data with StandardScaler

In [None]:
# Independent scalers for the feature matrix and for the target column.
X_scaler, y_scaler = StandardScaler(), StandardScaler()

In [None]:
# Each scaler is fitted on the training split only; the validation split is
# transformed with the statistics learned from training data.
X_train = X_scaler.fit_transform(X_train)
X_val = X_scaler.transform(X_val)

y_train = y_scaler.fit_transform(y_train)
y_val = y_scaler.transform(y_val)

#### Save scalers locally and to GCS

In [None]:
# Serialise the fitted feature scaler to a local pickle file (presumably so the
# same scaling can be reapplied outside this notebook -- confirm with consumers).
# NOTE(review): the ../../artifacts directory must already exist.
with open("../../artifacts/sc_x_features.pkl", "wb") as f:
    pickle.dump(X_scaler, f)
    
# IPython shell escape: copy the local pickle to the gs:// bucket with gsutil.
!gsutil cp ../../artifacts/sc_x_features.pkl gs://pcqm4mv2/artifacts/sc_x_features.pkl

In [None]:
# Serialise the fitted target scaler to a local pickle file (presumably needed
# to map scaled predictions back to the original target scale -- confirm).
# NOTE(review): the ../../artifacts directory must already exist.
with open("../../artifacts/sc_y.pkl", "wb") as f:
    pickle.dump(y_scaler, f)

# IPython shell escape: copy the local pickle to the gs:// bucket with gsutil.
!gsutil cp ../../artifacts/sc_y.pkl gs://pcqm4mv2/artifacts/sc_y.pkl    

#### Train Model

##### Number of features to use in input shape

In [None]:
# Size of the second dimension of the scaled training matrix, i.e. the number
# of feature columns; used as the default input shape of create_model.
N_FEATURES = X_train.shape[1]
# Bare expression so the notebook displays the value.
N_FEATURES

##### Function to create model given a set of hyperparameters

In [None]:
def create_model(
    n_hidden_layers=1,
    n_neurons=32,
    dropout_rate=0,
    dropout_last_layer=False,
    activation_function="relu",
    input_shape=(N_FEATURES, )
):
    """Build and compile a fully connected network for single-target regression.

    The network is a stack of ``n_hidden_layers`` Dense layers of equal width,
    optionally separated by Dropout, followed by one linear output unit.

    Args:
        n_hidden_layers: Number of hidden Dense layers. The first hidden layer
            is always created, so values below 1 still yield one hidden layer.
        n_neurons: Number of units in every hidden layer.
        dropout_rate: Dropout rate applied between hidden layers; a falsy
            value (0) disables dropout entirely.
        dropout_last_layer: If true (and ``dropout_rate`` is non-zero), also
            apply dropout between the last hidden layer and the output layer.
        activation_function: Activation used by every hidden layer.
        input_shape: Shape of one input sample, excluding the batch dimension.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MAE loss, with
        MAE and MSE tracked as metrics).
    """
    model = Sequential()
    model.add(Dense(n_neurons, activation=activation_function, input_shape=input_shape))

    # The first hidden layer is added above, hence n_hidden_layers - 1 here.
    for _ in range(n_hidden_layers - 1):
        if dropout_rate:
            model.add(Dropout(dropout_rate))
        model.add(Dense(n_neurons, activation=activation_function))

    if dropout_last_layer and dropout_rate:
        model.add(Dropout(dropout_rate))

    # Output layer: one linear unit so predictions have the same shape as the
    # single-column target. Without it the model would emit n_neurons values
    # per sample and the "last layer" dropout would be applied to the output.
    model.add(Dense(1))

    model.compile(optimizer="adam", loss="mae", metrics=["mae", "mse"])

    return model

##### Create a KerasRegressor given a model function

In [None]:
# Wrap the create_model factory in a scikit-learn style estimator so it can be
# handed to GridSearchCV. epochs and batch_size are fixed here and are not part
# of the searched grid.
regressor = KerasRegressor(
    model=create_model,
    epochs=30,
    batch_size=512
)

##### Define params to search

In [None]:
# Hyperparameter grid. Each key is "model__" followed by the name of a
# create_model argument (the prefix is the SciKeras convention for routing a
# value to the model-building function).
# Size: 3 * 3 * 3 * 2 * 3 = 162 combinations.
# NOTE(review): when dropout_rate is 0, create_model ignores dropout_last_layer,
# so the True/False variants of those combinations build identical networks.
param_grid = {
    "model__n_neurons": [32, 64, 128],
    "model__n_hidden_layers": [2, 3, 4],
    "model__dropout_rate": [0, 0.1, 0.2],
    "model__dropout_last_layer": [True, False],
    "model__activation_function": ["relu", "sigmoid", "tanh"],
}

##### Instantiate a grid search to use all threads available

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, ranking
# candidates by negative mean absolute error (higher is better).
# n_jobs=-1 asks joblib to use every available processor, rather than a
# hard-coded worker count that only fits one particular machine.
grid_search = GridSearchCV(
    regressor,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    verbose=1,
    n_jobs=-1
)

##### Start training

In [None]:
# Seed both NumPy and TensorFlow: Keras weight initialisation and dropout draw
# from TensorFlow's random generator, which np.random.seed does not control.
# NOTE(review): fits dispatched to parallel worker processes probably do not
# inherit these global seeds; passing random_state to KerasRegressor is the
# SciKeras way to seed every fit -- confirm if full reproducibility is needed.
np.random.seed(2907)
tf.random.set_seed(2907)
grid_result = grid_search.fit(X_train, y_train)