In [3]:
# === Mount Google Drive ===
from google.colab import drive
drive.mount('/content/drive')

# === Import libraries ===
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import tensorflow as tf
!pip install keras-tuner
import keras_tuner as kt

# === Load Data ===
# df = pd.read_csv("./../../Data/preprocessed_datasets/interactions_prepped_stage3.csv")
file_path = '/content/drive/My Drive/interactions_prepped_stage3.csv'
df = pd.read_csv(file_path)

# === Feature and Target Selection ===
target_col = "Energy"
y = df[target_col]

num_cols = ["R", "r", "Phi", "Theta", "Q", "P1/P2"]
cat_cols = ["Amino_acid", "Carbohydrate", "small_H_position"]

# Numeric data columns
X_num = df[num_cols]

# One-hot encode categorical columns
X_cat = pd.get_dummies(df[cat_cols], prefix=["aa", "carb", "small_h"], drop_first=True)

# Combine numeric and categorical columns
X_all = pd.concat([X_num, X_cat], axis=1)

# === Normalize Numeric Columns ===
scaler = StandardScaler()
X_all[num_cols] = scaler.fit_transform(X_all[num_cols])

# === Keras Model Builder with Tunable Parameters ===
def build_model(hp):
    inp = tf.keras.Input(shape=(X_all.shape[1],))
    x = tf.keras.layers.Flatten()(inp)

    for i in range(hp.Int("num_layers", 2, 5)):
        x = tf.keras.layers.Dense(
            hp.Int(f"units_{i}", 32, 96, step=16),
            activation=hp.Choice("activation", ["relu", "tanh"]),
            kernel_initializer="he_normal",
            kernel_regularizer=tf.keras.regularizers.l2(
                hp.Float("l2", 1e-6, 1e-2, sampling="log")
            ),
        )(x)
        if hp.Boolean("use_dropout"):
            x = tf.keras.layers.Dropout(hp.Float("dropout", 0.1, 0.5, step=0.1))(x)

    out = tf.keras.layers.Dense(1)(x)

    model = tf.keras.Model(inp, out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hp.Float("lr", 1e-4, 1e-2, sampling="log")
        ),
        loss="mse",
        metrics=["mae"],
    )
    return model


# === Hyperband Tuner Setup ===
tuner = kt.Hyperband(
    build_model,
    objective="val_loss",
    max_epochs=100,
    factor=3,
    directory="kt_tuning",
    project_name="energy_prediction",
    overwrite=True,
)


Mounted at /content/drive
Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [4]:
# === Early Stopping Callback ===
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=30)

# === K-Fold Cross-Validation Loop ===
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_all)):
    print(f"\nFold {fold+1} Training...")

    X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    tuner.search(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        callbacks=[early_stop],
        verbose=1,
    )

    best_hp = tuner.get_best_hyperparameters(1)[0]
    best_model = tuner.hypermodel.build(best_hp)
    best_model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        verbose=1,
    )

    # Evaluate
    y_pred = best_model.predict(X_val).flatten()
    fold_mse = mean_squared_error(y_val, y_pred)
    fold_mae = mean_absolute_error(y_val, y_pred)
    fold_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    fold_r2 = r2_score(y_val, y_pred)
    fold_results.append((fold_mse, fold_mae, fold_rmse, fold_r2))

# === Report Summary ===
print("\n=== Cross-Validated Results ===")
for i, (mse, mae, rmse, r2) in enumerate(fold_results):
    print(f"Fold {i+1}: MSE={mse:.3f}, MAE={mae:.3f}, RMSE={rmse:.3f}, R2={r2:.3f}")

avg_mse = np.mean([res[0] for res in fold_results])
avg_mae = np.mean([res[1] for res in fold_results])
avg_rmse = np.mean([res[2] for res in fold_results])
avg_r2 = np.mean([res[3] for res in fold_results])

print(
    f"\nFinal Avg Results: MSE={avg_mse:.3f}, MAE={avg_mae:.3f}, RMSE={avg_rmse:.3f}, R2={avg_r2:.3f}"
)

Trial 254 Complete [00h 00m 32s]
val_loss: 64.022705078125

Best val_loss So Far: 41.7783203125
Total elapsed time: 00h 32m 37s
Epoch 1/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - loss: 415.5855 - mae: 17.9067 - val_loss: 127.0455 - val_mae: 8.8940
Epoch 2/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 114.2130 - mae: 8.2124 - val_loss: 66.7731 - val_mae: 6.7951
Epoch 3/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 63.4304 - mae: 6.3857 - val_loss: 64.6079 - val_mae: 6.6259
Epoch 4/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 61.5301 - mae: 6.2737 - val_loss: 55.9606 - val_mae: 6.1229
Epoch 5/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 56.5274 - mae: 5.8860 - val_loss: 51.5982 - val_mae: 5.7711
Epoch 6/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 48.21



[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 54ms/step



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step

Fold 4 Training...
Epoch 1/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 418.2812 - mae: 18.0245 - val_loss: 138.0178 - val_mae: 9.1540
Epoch 2/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 100.0199 - mae: 7.8081 - val_loss: 71.8699 - val_mae: 6.8658
Epoch 3/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 66.8354 - mae: 6.4525 - val_loss: 61.1219 - val_mae: 6.2169
Epoch 4/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 66.4538 - mae: 6.5326 - val_loss: 53.2084 - val_mae: 5.7581
Epoch 5/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 56.8372 - mae: 5.9664 - val_loss: 47.8352 - val_mae: 5.4232
Epoch 6/100
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 52.9149 - mae: 5.5182 - val_loss: 44.4076

In [10]:
# Get top N best hyperparameter sets
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print all hyperparameters in the best configuration
print("\nBest Hyperparameters:")
for param in best_hps.values:
    print(f"{param}: {best_hps.get(param)}")


Best Hyperparameters:
num_layers: 2
units_0: 64
activation: tanh
l2: 1.075558357790729e-05
use_dropout: True
units_1: 96
lr: 0.005711014933026775
dropout: 0.2
units_2: 96
units_3: 80
units_4: 96
tuner/epochs: 34
tuner/initial_epoch: 12
tuner/bracket: 4
tuner/round: 3
tuner/trial_id: 0138


In [9]:
best_model = tuner.hypermodel.build(best_hps)

print("Best Model Summary:")
best_model.summary()

Best Model Summary:
