# Hello Hyperparameter optimization with Keras tuner

For more details on the bank customer churn binary classifier example, see `hello_bank_customer_churn_binary_classifier.ipynb`.

In [1]:
# Prerequisites
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Record the interpreter and library versions this notebook was run with.
for label, version in (
    ("Python Version: ", sys.version),
    ("Numpy Version: ", np.__version__),
    ("Pandas Version: ", pd.__version__),
):
    print(label, version)

Python Version:  3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
Numpy Version:  2.0.2
Pandas Version:  2.2.3


### Load Data

In [2]:
# Location of the churn dataset (a relative path).
data_path = "data/Churn_Modelling.csv"

# Load the CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Exploratory Data Analysis


In [3]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


Check possible values in Geography and Gender

In [4]:
# Frequency of each Gender category.
df["Gender"].value_counts()

Gender
Male      5457
Female    4543
Name: count, dtype: int64

In [5]:
# Frequency of each Geography category.
df["Geography"].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [6]:
# Class balance of the target column (Exited).
df["Exited"].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

Remove RowNumber, CustomerId and Surname columns from features.  Exited is the target/dependent variable.

In [7]:
# RowNumber, CustomerId and Surname are left out of the features; Exited is the target.
id_columns = ["RowNumber", "CustomerId", "Surname"]
target_column = "Exited"

X = df.drop(columns=id_columns + [target_column])
y = df[target_column]
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (10000, 10), y shape: (10000,)


### Feature Engineering

Separate categorical and numerical features

In [8]:
# Categorical features are listed by hand; every remaining feature column is treated as numerical.
categ_columns = ["Geography", "Gender"]
num_columns = list(X.drop(categ_columns, axis="columns").columns)

print(f"Categorical Columns: {categ_columns}")
print(f"Numerical Columns: {num_columns}")

Categorical Columns: ['Geography', 'Gender']
Numerical Columns: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']


Use `ColumnTransformer` to apply different transformations to the numerical and categorical columns

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numerical columns are standardised; categorical columns are one-hot encoded
# with the first level of each category dropped.
numeric_step = ('num', StandardScaler(), num_columns)
categorical_step = ('cat', OneHotEncoder(drop="first"), categ_columns)

ct = ColumnTransformer(transformers=[numeric_step, categorical_step])

### Split Dataset

60% training, 20% validation, 20% test

In [10]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 60% for training. Stage 2: halve the remaining 40% into validation and test.
split_seed = 0

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=0.6, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)
print(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}, X_test shape: {X_test.shape}")

X_train shape: (6000, 10), X_val shape: (2000, 10), X_test shape: (2000, 10)


### Feature Scaling

Fit the column transformer on the training data only, then use that fitted transformer to transform the training, validation and test sets.

In [11]:
# Fit on the training split only, then reuse the fitted transformer for the other two splits.
X_train_scaled = ct.fit_transform(X_train)
X_val_scaled, X_test_scaled = (ct.transform(split) for split in (X_val, X_test))

print(f"X_train shape: {X_train_scaled.shape}, X_val shape: {X_val_scaled.shape}, X_test shape: {X_test_scaled.shape}")

X_train shape: (6000, 11), X_val shape: (2000, 11), X_test shape: (2000, 11)


### Build the ANN

In [12]:
def build_model(hp):
    """
    Assemble and compile a binary classifier with two hidden layers for the tuner.

    Search space (sampled in this order):
        units_0       - width of the first hidden layer: 32 to 128 in steps of 32
        activation    - 'relu', 'tanh' or 'sigmoid'; one value used by both hidden layers
        units_1       - width of the second hidden layer: 32 to 128 in steps of 32
        learning_rate - Adam learning rate: 1e-2, 1e-3 or 1e-4

    Inputs: hp - Hyperparameters

    Outputs: model - compiled Sequential model with one sigmoid output unit,
             binary cross-entropy loss and an accuracy metric
    """
    # Input width comes from the notebook-level X_train_scaled array.
    n_features = X_train_scaled.shape[1]

    # Sample every hyperparameter up front, in the order units_0, activation,
    # units_1, learning_rate.
    units_0 = hp.Int('units_0', min_value=32, max_value=128, step=32)
    activation = hp.Choice('activation', values=['relu','tanh','sigmoid'])
    units_1 = hp.Int('units_1', min_value=32, max_value=128, step=32)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model = Sequential([
        # Hidden Layer 0 (also declares the input shape)
        Dense(units=units_0, activation=activation, input_shape=(n_features,)),
        # Hidden Layer 1
        Dense(units=units_1, activation=activation),
        # Output Layer
        Dense(units=1, activation='sigmoid'),
    ])

    model.compile(
        optimizer=Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


def evaluate_model(model, X_test, y_test):
    """
    Score a trained ANN model on the given data and report its loss.

    Inputs: model - trained ANN model exposing evaluate()
            X_test - test features
            y_test - test labels

    Outputs: loss - first entry of the values returned by model.evaluate()
    """
    # evaluate() is called silently (verbose=0); index 0 of its result is taken as the loss.
    scores = model.evaluate(X_test, y_test, verbose=0)
    return scores[0]

# Number of trials and epochs
# max_trials is the trial cap passed to the Random Search and Bayesian Optimization tuners.
# max_epochs is the epochs-per-trial value passed to the Grid, Random and Bayesian searches.
# The Grid Search tuner sets its own max_trials and the Hyperband tuner its own max_epochs.
max_trials = 10
max_epochs = 3 

### Perform Grid Search

In [13]:
from keras_tuner import GridSearch

# Grid search over the build_model search space.
# The full grid has 4 x 3 x 4 x 3 = 144 combinations (units_0 x activation x units_1 x
# learning_rate), so max_trials=16 evaluates only a subset of it.
# NOTE(review): this cell uses its own trial cap (16) rather than the shared max_trials.
grid_tuner = GridSearch(
    build_model,
    objective='val_loss',
    max_trials=16,
    executions_per_trial=1,
    directory='hyperparam_tuning',
    project_name='grid_search',
    overwrite=True
)

# Time the whole search (wall-clock seconds).
time_start = time.time()
grid_tuner.search(X_train_scaled, y_train, epochs=max_epochs, validation_data=(X_val_scaled, y_val))
time_end = time.time()
grid_search_time = time_end - time_start
# Label fixed: this cell reports the Grid Search run, not Random Search.
print(f"Grid Search Time: {grid_search_time} seconds")

# Take the best trial by val_loss, score its model on the test set and keep its hyperparameters.
grid_search_best_model = grid_tuner.get_best_models(num_models=1)[0]
grid_search_test_loss = evaluate_model(grid_search_best_model, X_test_scaled, y_test)
grid_search_best_hyperparams = grid_tuner.get_best_hyperparameters(num_trials=1)[0].values
print(f"Grid Search Test Loss: {grid_search_test_loss}")


Trial 16 Complete [00h 00m 02s]
val_loss: 0.3421882688999176

Best val_loss So Far: 0.3346933424472809
Total elapsed time: 00h 00m 32s
Random Search Time: 31.83399248123169 seconds


  saveable.load_own_variables(weights_store.get(inner_path))


Random Search Test Loss: 0.34734299778938293


### Perform Random Search

In [14]:
from keras_tuner import RandomSearch

# Random search: up to max_trials configurations drawn from the build_model search space.
random_tuner = RandomSearch(
    hypermodel=build_model,
    objective='val_loss',
    max_trials=max_trials,
    directory='hyperparam_tuning',
    project_name='random_search',
    overwrite=True,
)

# Time the whole search (wall-clock seconds).
time_start = time.time()
random_tuner.search(
    X_train_scaled,
    y_train,
    epochs=max_epochs,
    validation_data=(X_val_scaled, y_val),
)
time_end = time.time()
random_search_time = time_end - time_start
print(f"Random Search Time: {random_search_time} seconds")

# Take the best trial by val_loss, score its model on the test set and keep its hyperparameters.
random_search_best_model = random_tuner.get_best_models(num_models=1)[0]
random_search_test_loss = evaluate_model(
    random_search_best_model, X_test_scaled, y_test
)
random_search_best_hyperparams = random_tuner.get_best_hyperparameters(num_trials=1)[0].values
print(f"Random Search Test Loss: {random_search_test_loss}")


Trial 10 Complete [00h 00m 02s]
val_loss: 0.4308781325817108

Best val_loss So Far: 0.34671905636787415
Total elapsed time: 00h 00m 20s
Random Search Time: 20.395037174224854 seconds
Random Search Test Loss: 0.3526424467563629


### Bayesian Optimization

In [15]:
from keras_tuner import BayesianOptimization

# Bayesian optimization over the build_model search space, capped at max_trials trials.
bayesian_tuner = BayesianOptimization(
    hypermodel=build_model,
    objective='val_loss',
    max_trials=max_trials,
    directory='hyperparam_tuning',
    project_name='bayesian_search',
    overwrite=True,
)

# Time the whole search (wall-clock seconds).
time_start = time.time()
bayesian_tuner.search(
    X_train_scaled,
    y_train,
    epochs=max_epochs,
    validation_data=(X_val_scaled, y_val),
)
time_end = time.time()
bayesian_search_time = time_end - time_start
print(f"Bayesian Search Time: {bayesian_search_time} seconds")

# Take the best trial by val_loss, score its model on the test set and keep its hyperparameters.
bayesian_search_best_model = bayesian_tuner.get_best_models(num_models=1)[0]
bayesian_search_test_loss = evaluate_model(
    bayesian_search_best_model, X_test_scaled, y_test
)
bayesian_search_best_hyperparams = bayesian_tuner.get_best_hyperparameters(num_trials=1)[0].values
print(f"Bayesian Search Test Loss: {bayesian_search_test_loss}")

Trial 10 Complete [00h 00m 02s]
val_loss: 0.40245988965034485

Best val_loss So Far: 0.3413819670677185
Total elapsed time: 00h 00m 21s
Bayesian Search Time: 21.500250577926636 seconds
Bayesian Search Test Loss: 0.3549537658691406


### Hyperband Optimization

In [16]:
from keras_tuner import Hyperband

# Hyperband search over the build_model search space.
# The tuner is given its own epoch budget (max_epochs=16) and search() is called without
# an `epochs` argument, so the notebook-level max_epochs constant is not used here.
hyperband_tuner = Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=16,
    factor=4,
    seed=0,
    directory='hyperparam_tuning',
    project_name='hyperband_search',
    overwrite=True
)

# Time the whole search (wall-clock seconds).
time_start = time.time()
hyperband_tuner.search(X_train_scaled, y_train, validation_data=(X_val_scaled, y_val))
time_end = time.time()
hyperband_search_time = time_end - time_start
# Label fixed: this cell reports the Hyperband run, not the Bayesian one.
print(f"Hyperband Search Time: {hyperband_search_time} seconds")

# Take the best trial by val_loss, score its model on the test set and keep its hyperparameters.
hyperband_search_best_model = hyperband_tuner.get_best_models(num_models=1)[0]
hyperband_search_test_loss = evaluate_model(hyperband_search_best_model, X_test_scaled, y_test)
hyperband_search_best_hyperparams = hyperband_tuner.get_best_hyperparameters(num_trials=1)[0].values
print(f"Hyperband Search Test Loss: {hyperband_search_test_loss}")

Trial 32 Complete [00h 00m 06s]
val_loss: 0.4487195909023285

Best val_loss So Far: 0.3302502930164337
Total elapsed time: 00h 01m 24s
Bayesian Search Time: 84.47210788726807 seconds
Bayesian Search Test Loss: 0.3480880558490753


### Summarize Results

In [21]:
# Show the hyperparameter dictionaries in full rather than truncated.
pd.set_option('display.max_colwidth', None)

# One row per search method: (name, test loss, search time, best hyperparameters).
summary_rows = [
    ("Grid Search", grid_search_test_loss, grid_search_time, grid_search_best_hyperparams),
    ("Random Search", random_search_test_loss, random_search_time, random_search_best_hyperparams),
    ("Bayesian Optimization", bayesian_search_test_loss, bayesian_search_time, bayesian_search_best_hyperparams),
    ("Hyperband", hyperband_search_test_loss, hyperband_search_time, hyperband_search_best_hyperparams),
]

# Transpose the rows into the column-oriented dictionary the DataFrame is built from.
methods, test_losses, search_times, best_hyperparams = map(list, zip(*summary_rows))
results = {
    "Method": methods,
    "Test Loss": test_losses,
    "Time (seconds)": search_times,
    "Best Hyperparameters": best_hyperparams,
}

# Display results in DataFrame
df_results = pd.DataFrame(results)
print(df_results)

                  Method  Test Loss  Time (seconds)  \
0            Grid Search   0.350044       40.798893   
1          Random Search   0.352852       26.680031   
2  Bayesian Optimization   0.352843       28.433676   
3              Hyperband   0.347856      111.483534   

                                                                                                                                                                         Best Hyperparameters  
0                                                                                                                {'units_0': 32, 'activation': 'relu', 'units_1': 128, 'learning_rate': 0.01}  
1                                                                                                                {'units_0': 128, 'activation': 'relu', 'units_1': 64, 'learning_rate': 0.01}  
2                                                                                                                 {'units_0': 32, 'activation': 'tan

In [17]:
# Print the best Grid Search trials with their hyperparameters and val_loss scores.
grid_tuner.results_summary()

Results summary
Results in hyperparam_tuning\grid_search
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 09 summary
Hyperparameters:
units_0: 32
activation: relu
units_1: 128
learning_rate: 0.01
Score: 0.3346933424472809

Trial 06 summary
Hyperparameters:
units_0: 32
activation: relu
units_1: 96
learning_rate: 0.01
Score: 0.33923086524009705

Trial 15 summary
Hyperparameters:
units_0: 32
activation: tanh
units_1: 64
learning_rate: 0.01
Score: 0.3421882688999176

Trial 07 summary
Hyperparameters:
units_0: 32
activation: relu
units_1: 96
learning_rate: 0.001
Score: 0.34663891792297363

Trial 03 summary
Hyperparameters:
units_0: 32
activation: relu
units_1: 64
learning_rate: 0.01
Score: 0.34777432680130005

Trial 00 summary
Hyperparameters:
units_0: 32
activation: relu
units_1: 32
learning_rate: 0.01
Score: 0.347786545753479

Trial 12 summary
Hyperparameters:
units_0: 32
activation: tanh
units_1: 32
learning_rate: 0.01
Score: 0.35033971071243286

Trial 10 summary

In [19]:
# Print the best Random Search trials with their hyperparameters and val_loss scores.
random_tuner.results_summary()

Results summary
Results in hyperparam_tuning\random_search
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 01 summary
Hyperparameters:
units_0: 128
activation: relu
units_1: 96
learning_rate: 0.01
Score: 0.34671905636787415

Trial 00 summary
Hyperparameters:
units_0: 96
activation: tanh
units_1: 64
learning_rate: 0.01
Score: 0.3521117866039276

Trial 03 summary
Hyperparameters:
units_0: 32
activation: relu
units_1: 64
learning_rate: 0.001
Score: 0.3555898070335388

Trial 05 summary
Hyperparameters:
units_0: 96
activation: sigmoid
units_1: 96
learning_rate: 0.001
Score: 0.4240589737892151

Trial 07 summary
Hyperparameters:
units_0: 64
activation: sigmoid
units_1: 64
learning_rate: 0.001
Score: 0.42578431963920593

Trial 08 summary
Hyperparameters:
units_0: 128
activation: sigmoid
units_1: 64
learning_rate: 0.001
Score: 0.4258114695549011

Trial 09 summary
Hyperparameters:
units_0: 32
activation: sigmoid
units_1: 96
learning_rate: 0.001
Score: 0.4308781325817108

In [18]:
# Print the best Bayesian Optimization trials with their hyperparameters and val_loss scores.
bayesian_tuner.results_summary()

Results summary
Results in hyperparam_tuning\bayesian_search
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 01 summary
Hyperparameters:
units_0: 64
activation: relu
units_1: 128
learning_rate: 0.001
Score: 0.3413819670677185

Trial 04 summary
Hyperparameters:
units_0: 128
activation: relu
units_1: 32
learning_rate: 0.001
Score: 0.34141698479652405

Trial 00 summary
Hyperparameters:
units_0: 96
activation: relu
units_1: 32
learning_rate: 0.001
Score: 0.34678053855895996

Trial 03 summary
Hyperparameters:
units_0: 32
activation: sigmoid
units_1: 64
learning_rate: 0.01
Score: 0.36614981293678284

Trial 05 summary
Hyperparameters:
units_0: 32
activation: tanh
units_1: 64
learning_rate: 0.001
Score: 0.3743700087070465

Trial 07 summary
Hyperparameters:
units_0: 128
activation: sigmoid
units_1: 32
learning_rate: 0.01
Score: 0.3888043463230133

Trial 09 summary
Hyperparameters:
units_0: 96
activation: sigmoid
units_1: 64
learning_rate: 0.01
Score: 0.4024598896503448

In [20]:
# Print the best Hyperband trials with their hyperparameters, tuner bookkeeping and val_loss scores.
hyperband_tuner.results_summary()

Results summary
Results in hyperparam_tuning\hyperband_search
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 0028 summary
Hyperparameters:
units_0: 128
activation: sigmoid
units_1: 96
learning_rate: 0.01
tuner/epochs: 16
tuner/initial_epoch: 4
tuner/bracket: 1
tuner/round: 1
tuner/trial_id: 0025
Score: 0.3302502930164337

Trial 0020 summary
Hyperparameters:
units_0: 64
activation: relu
units_1: 96
learning_rate: 0.01
tuner/epochs: 16
tuner/initial_epoch: 4
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 0017
Score: 0.33236467838287354

Trial 0017 summary
Hyperparameters:
units_0: 64
activation: relu
units_1: 96
learning_rate: 0.01
tuner/epochs: 4
tuner/initial_epoch: 1
tuner/bracket: 2
tuner/round: 1
tuner/trial_id: 0011
Score: 0.33634331822395325

Trial 0027 summary
Hyperparameters:
units_0: 128
activation: relu
units_1: 32
learning_rate: 0.001
tuner/epochs: 16
tuner/initial_epoch: 4
tuner/bracket: 1
tuner/round: 1
tuner/trial_id: 0023
Score: 0.3378597795963

In [27]:
# Dump every Hyperband trial recorded by the tuner's oracle, not just the best ones.
hyperband_all_trials = hyperband_tuner.oracle.trials
for trial_id, trial in hyperband_all_trials.items():
    # One print per trial; sep="\n" puts each field on its own line.
    print(
        f"Trial ID: {trial_id}",
        f"Hyperparameters: {trial.hyperparameters.values}",
        f"Score (Objective): {trial.score}",
        f"Trial Status: {trial.status}",
        "-" * 50,
        sep="\n",
    )

Trial ID: 0000
Hyperparameters: {'units_0': 64, 'activation': 'tanh', 'units_1': 128, 'learning_rate': 0.0001, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0}
Score (Objective): 0.43843182921409607
Trial Status: COMPLETED
--------------------------------------------------
Trial ID: 0001
Hyperparameters: {'units_0': 128, 'activation': 'relu', 'units_1': 32, 'learning_rate': 0.01, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0}
Score (Objective): 0.3627810776233673
Trial Status: COMPLETED
--------------------------------------------------
Trial ID: 0002
Hyperparameters: {'units_0': 64, 'activation': 'relu', 'units_1': 32, 'learning_rate': 0.001, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0}
Score (Objective): 0.3681027889251709
Trial Status: COMPLETED
--------------------------------------------------
Trial ID: 0003
Hyperparameters: {'units_0': 32, 'activation': 'relu', 'units_1': 6