<a href="https://colab.research.google.com/github/jregio/CSE-5835-Final-Project/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keras-tuner --upgrade

In [41]:
# connect notebook to Google Drive
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/CSE5835

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CSE5835


In [42]:
import pandas as pd

# split dataframe into features (fingerprint) and target variable (potency)
def splitXY(fileName):
  X = pd.read_pickle(fileName)
  y = pd.to_numeric(X['standard_value'])
  X.drop('standard_value', axis=1, inplace=True)

  return X, y

In [43]:
X_train, y_train = splitXY('train_df.pkl')
X_dev, y_dev = splitXY('dev_df.pkl')
X_test, y_test = splitXY('test_df.pkl')

In [44]:
import tensorflow as tf
from tensorflow import keras

# wrapper function for model tuning
def createModel(hp):
  # hypperparameters for tuning
  hp_units = hp.Int('units', min_value=128, max_value=1024, step=128)
  hp_rate = hp.Choice('rate', [0.0, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 0.9])
  hp_layers = hp.Int('layers', min_value = 1, max_value = 9, step = 1)
  hp_l2 = hp.Choice('lambda', [0.0, 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0])

  # model architecture
  model = keras.Sequential()
  model.add(keras.layers.Dense(units=hp_units, input_shape=(1024,)))
  for i in range(hp_layers):
      model.add(keras.layers.Dense(units=hp_units, kernel_regularizer=keras.regularizers.L2(hp_l2))),
      model.add(keras.layers.Dropout(rate=hp_rate))
  model.add(keras.layers.Dense(units=1))

  # model optimization
  model.compile(optimizer='adam',
                loss=tf.losses.MeanSquaredError(),
                metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model

In [45]:
import keras_tuner

# use Bayesian Optimization to find the best hyperparameters
tuner = keras_tuner.BayesianOptimization(createModel, max_trials = 100,  objective='val_loss')
tuner.search(X_train, y_train, validation_data=(X_dev, y_dev))
bestModel = tuner.get_best_models()[0]

In [46]:
def evaluateModel(model, X, y):
  print("RMSE:\t ", model.evaluate(X, y, verbose=False)[1])
  print("STD:\t ", y_test.std())

In [47]:
evaluateModel(bestModel, X_test, y_test)

RMSE:	  2.4562740325927734
STD:	  2.456186910349606


In [48]:
# RESULTS:
# - model evaluation shows that RMSE is approximately the std dev of test data
# - thus, the model DID NOT learn anything meaningful
# - the best model had a very high dropout rate

In [49]:
best_hp = tuner.get_best_hyperparameters()[0]
print("Units:\t", best_hp.get('units'))
print("Rate:\t", best_hp.get('rate'))
print("Layers:\t", best_hp.get('layers'))
print("Lambda:\t", best_hp.get('lambda'))

Units:	 128
Rate:	 0.9
Layers:	 5
Lambda:	 0.0
