#### Import required libs

In [1]:
import sys
sys.path.append('/home/jupyter/app')

import pickle

import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from scikeras.wrappers import KerasRegressor


from src.io_utils import load_dataframe

#### Define constants

In [2]:
GS_DIR_FEATURES = "gs://pcqm4mv2/data/features"

#### Load datasets as pandas DataFrame

In [3]:
df_train = load_dataframe("train.csv", GS_DIR_FEATURES)

print("df_train shape:", df_train.shape)
df_train.head()

  mask |= (ar1 == a)


df_train shape: (3378606, 29)


Unnamed: 0_level_0,smiles,homolumogap,number_of_atoms,number_of_heavy_atoms,number_of_bonds,number_of_heavy_bonds,number_of_conformations,exact_mol_weight,average_mol_weight,heavy_mol_weight,...,number_of_B_atoms,number_of_C_atoms,number_of_N_atoms,number_of_O_atoms,number_of_F_atoms,number_of_Si_atoms,number_of_P_atoms,number_of_S_atoms,number_of_Cl_atoms,number_of_Br_atoms
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C,3.047675,30,18,32,20,0,236.094963,236.274,224.178,...,0,15,2,1,0,0,0,0,0,0
1,COc1cc(OC)ccc1/C=C/N(C(=O)C)C,4.410966,34,17,34,17,0,235.120843,235.283,218.147,...,0,13,1,3,0,0,0,0,0,0
2,C=CCN(C(=O)C)/C=C/c1ccccc1C,4.639541,33,16,33,16,0,215.131014,215.296,198.16,...,0,14,1,1,0,0,0,0,0,0
3,C=CCN(C(=O)C)/C=C/c1ccccc1F,4.4926,30,16,30,16,0,219.105942,219.259,205.147,...,0,13,1,1,1,0,0,0,0,0
4,C=CCN(C(=O)C)/C=C/c1ccccc1Cl,4.61233,30,16,30,16,0,235.076392,235.714,221.602,...,0,13,1,1,0,0,0,0,1,0


In [4]:
df_val = load_dataframe("valid.csv", GS_DIR_FEATURES)

print("df_val shape:", df_val.shape)
df_val.head()

df_val shape: (73545, 29)


Unnamed: 0_level_0,smiles,homolumogap,number_of_atoms,number_of_heavy_atoms,number_of_bonds,number_of_heavy_bonds,number_of_conformations,exact_mol_weight,average_mol_weight,heavy_mol_weight,...,number_of_B_atoms,number_of_C_atoms,number_of_N_atoms,number_of_O_atoms,number_of_F_atoms,number_of_Si_atoms,number_of_P_atoms,number_of_S_atoms,number_of_Cl_atoms,number_of_Br_atoms
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3378606,COc1ccccc1N[C@H](/C(=N\C(=N)O)/O)C,4.58784,32,17,32,17,0,237.111341,237.259,222.139,...,0,11,3,3,0,0,0,0,0,0
3378607,COc1ccccc1N[C@H](/C(=N\C(=N)O)/O)C,4.97152,32,17,32,17,0,237.111341,237.259,222.139,...,0,11,3,3,0,0,0,0,0,0
3378613,CC(/N=C(\N/N=C/1\C[C@H]2[C@@H]1CC=C2)/S)C,5.4967,32,15,33,16,0,223.114319,223.345,206.209,...,0,11,3,0,0,0,0,1,0,0
3378614,CC(/N=C(\N/N=C/1\C[C@H]2[C@@H]1CC=C2)/S)C,5.485815,32,15,33,16,0,223.114319,223.345,206.209,...,0,11,3,0,0,0,0,1,0,0
3378624,C/N=C(\c1cc2c(s1)ccc(c2)F)/O,4.748387,22,14,23,15,0,209.031063,209.245,201.181,...,0,10,1,1,1,0,0,1,0,0


#### Replace NaN with 0

In [5]:
df_train = df_train.fillna(0)
df_val = df_val.fillna(0)

In [6]:
print("Number of NaN in df_train:", df_train.isna().sum().sum())
print("Number of NaN in df_valn:", df_val.isna().sum().sum())

Number of NaN in df_train: 0
Number of NaN in df_valn: 0


#### Create X and y objects to train

In [7]:
X_train = df_train.drop(["smiles", "homolumogap"], axis=1).to_numpy()
y_train = df_train[["homolumogap"]].to_numpy()

print("X_train shape:", X_train.shape)
print(X_train)

print("y_train shape:", y_train.shape)
print(y_train)

X_train shape: (3378606, 27)
[[30. 18. 32. ...  0.  0.  0.]
 [34. 17. 34. ...  0.  0.  0.]
 [33. 16. 33. ...  0.  0.  0.]
 ...
 [34. 17. 34. ...  0.  0.  0.]
 [34. 17. 34. ...  0.  0.  0.]
 [35. 17. 36. ...  0.  0.  0.]]
y_train shape: (3378606, 1)
[[3.04767513]
 [4.41096552]
 [4.63954115]
 ...
 [5.06403876]
 [5.33615261]
 [5.4205079 ]]


In [8]:
X_val = df_val.drop(["smiles", "homolumogap"], axis=1).to_numpy()
y_val = df_val[["homolumogap"]].to_numpy()

print("X_val shape:", X_val.shape)
print(X_val)

print("y_val shape:", y_val.shape)
print(y_val)

X_val shape: (73545, 27)
[[32. 17. 32. ...  0.  0.  0.]
 [32. 17. 32. ...  0.  0.  0.]
 [32. 15. 33. ...  1.  0.  0.]
 ...
 [23. 13. 23. ...  0.  0.  0.]
 [ 8.  4.  7. ...  0.  2.  0.]
 [57. 34. 59. ...  0.  0.  0.]]
y_val shape: (73545, 1)
[[4.58783952]
 [4.97152005]
 [5.49669978]
 ...
 [4.95519322]
 [8.17974235]
 [3.3143467 ]]


#### Scale data with StandardScaler

In [9]:
X_scaler = StandardScaler()
y_scaler = StandardScaler()

In [10]:
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)

X_val = X_scaler.transform(X_val)
y_val = y_scaler.transform(y_val)

#### Save scalers locally and to GCS

In [11]:
with open("../../artifacts/sc_x_features.pkl", "wb") as f:
    pickle.dump(X_scaler, f)
    
!gsutil cp ../../artifacts/sc_x_features.pkl gs://pcqm4mv2/artifacts/sc_x_features.pkl

Copying file://../../artifacts/sc_x_features.pkl [Content-Type=application/octet-stream]...
/ [1 files][  1.2 KiB/  1.2 KiB]                                                
Operation completed over 1 objects/1.2 KiB.                                      


In [12]:
with open("../../artifacts/sc_y.pkl", "wb") as f:
    pickle.dump(y_scaler, f)

!gsutil cp ../../artifacts/sc_y.pkl gs://pcqm4mv2/artifacts/sc_y.pkl    

Copying file://../../artifacts/sc_y.pkl [Content-Type=application/octet-stream]...
/ [1 files][  559.0 B/  559.0 B]                                                
Operation completed over 1 objects/559.0 B.                                      


#### Train Model

##### Number of features to use in input shape

In [13]:
N_FEATURES = X_train.shape[1]
N_FEATURES

27

##### Function to create model given a set of hyperparameters

In [41]:
def create_model(
    n_hidden_layers=2, 
    n_neurons=64, 
    dropout_rate=0.1,
    regularizer="l1_l2",
    input_shape=(N_FEATURES, )
):
    model = Sequential()
    model.add(Dense(n_neurons, activation="relu", input_shape=input_shape, kernel_regularizer=regularizer))
    
    for _ in range(n_hidden_layers-1):
        model.add(Dropout(dropout_rate))
        model.add(Dense(n_neurons, activation="relu", kernel_regularizer=regularizer))
    

    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation="linear"))
    model.compile(optimizer="adam", loss="mae")

    return model

##### Create a KerasRegressor given a model function

In [42]:
regressor = KerasRegressor(
    model=create_model,
    epochs=20,
    batch_size=512,
    verbose=0
)

##### Define params to search

In [43]:
param_grid = {
    "model__n_neurons": [64, 128],
    "model__n_hidden_layers": [2, 4],
    "model__regularizer": [None, "l1_l2"],
    "model__dropout_rate": [0.1, 0.2],
}

##### Instantiate a grid search to use all threads available

In [46]:
grid_search = GridSearchCV(
    regressor,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    verbose=0,
    n_jobs=40
)

##### Start training

In [47]:
np.random.seed(2912)
grid_result = grid_search.fit(X_train, y_train)

2022-08-07 16:46:03.537850: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 64. Tune using inter_op_parallelism_threads for best performance.
2022-08-07 16:46:03.588905: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 64. Tune using inter_op_parallelism_threads for best performance.
2022-08-07 16:46:03.684965: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 64. Tune using inter_op_parallelism_threads for best performance.
2022-08-07 16:46:03.724724: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 64. Tune using inter_op_parallelism_threads for best performance.
2022-08-07 16:46:03.734399: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 64. Tune using inter_op_parallelism_threads fo

#### Saving Grid Result to GCS

In [48]:
with open("../../artifacts/gs_result_3.pkl", "wb") as f:
    pickle.dump(grid_result, f)

!gsutil cp ../../artifacts/gs_result_3.pkl gs://pcqm4mv2/artifacts/gs_result_3.pkl   

INFO:tensorflow:Assets written to: ram:///tmp/tmphetf4bjb/assets
Copying file://../../artifacts/gs_result_3.pkl [Content-Type=application/octet-stream]...
/ [1 files][  1.2 MiB/  1.2 MiB]                                                
Operation completed over 1 objects/1.2 MiB.                                      


#### Evaluating best regressor

In [49]:
print("GridSearch results:")
grid_search.cv_results_

GridSearch results:


{'mean_fit_time': array([ 842.19102142,  805.4865042 , 1168.96053147, 1084.15826707,
        1417.26893034, 1287.35708599, 2156.14392848, 2039.04587224,
         850.29091148,  821.27375317, 1220.47196572, 1152.03239684,
        1389.88028336, 1255.14958854, 1530.55378506, 1447.71976225]),
 'std_fit_time': array([28.87153871, 21.50327974, 19.79048083, 25.76930084, 25.41154491,
        34.90940895, 21.46448606, 52.53145713, 22.00060757, 30.96187004,
         9.6577917 , 54.66892178, 22.87852965, 30.84251406, 12.27536237,
        14.29344787]),
 'mean_score_time': array([2.56732798, 2.52635987, 2.80017965, 2.86503687, 2.61958685,
        2.68547595, 4.59059575, 4.87168934, 2.5983675 , 2.52646902,
        2.6525651 , 2.61011147, 2.2585463 , 2.53188741, 2.01005821,
        1.86654816]),
 'std_score_time': array([0.43169454, 0.47565607, 0.19535465, 0.03375191, 0.32009729,
        0.24955483, 0.98353646, 0.82884232, 0.41768965, 0.41721163,
        0.24998725, 0.25952489, 0.44859321, 0.366663

In [50]:
grid_search.best_score_

-0.3146911222185909

In [51]:
grid_search.best_params_

{'model__dropout_rate': 0.1,
 'model__n_hidden_layers': 4,
 'model__n_neurons': 128,
 'model__regularizer': None}

In [52]:
best_regressor = grid_result.best_estimator_

In [53]:
print("Hyperparameters of best regressor:", best_regressor)

Hyperparameters of best regressor: KerasRegressor(
	model=<function create_model at 0x7fcd4f77e710>
	build_fn=None
	warm_start=False
	random_state=None
	optimizer=rmsprop
	loss=None
	metrics=None
	batch_size=512
	validation_batch_size=None
	verbose=0
	callbacks=None
	validation_split=0.0
	shuffle=True
	run_eagerly=False
	epochs=20
	model__dropout_rate=0.1
	model__n_hidden_layers=4
	model__n_neurons=128
	model__regularizer=None
)


In [54]:
best_regressor.model_.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               3584      
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 128)               16512     
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 128)              

#### Get MAE for validation set

In [55]:
y_val_pred = best_regressor.model_.predict(X_val)
y_val_pred

array([[-0.62006897],
       [-0.62006897],
       [-0.6222135 ],
       ...,
       [ 0.02568208],
       [ 1.7986993 ],
       [-1.3419119 ]], dtype=float32)

In [56]:
y_val_inv = y_scaler.inverse_transform(y_val)
y_val_pred_inv = y_scaler.inverse_transform(y_val_pred)

In [57]:
mae_result = mean_absolute_error(y_val_inv, y_val_pred_inv)
print("MAE Result:", mae_result)

MAE Result: 0.41502494412402685
