<center>

# **PREDICTIVE METHOD**<br>
# **NEURAL NETWORK**<br>

by: Ly Nguyen

</center>


In [1]:
# Import necessary libraries for this notebook: 

# Read from SQLite database and load to a pandas dataframe
import os
import sqlite3
import pandas as pd

# For using arrays 
import numpy as np

# For ML work (data preprocessing, hyperparameter tuning, Random Forest Classifier, training & testing sets, and stratified sampling)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight



# For model evaluation, including explainability:  
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, balanced_accuracy_score, make_scorer
from sklearn.utils.class_weight import compute_class_weight
import statsmodels.api as sm
import shap

# For data visualization 
import matplotlib.pyplot as plt
import seaborn as sns

# For saving the model into a pkl file
import joblib



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import keras_tuner as kt

In [3]:
# Import necessary libraries
import random
import numpy as np
import tensorflow as tf

# Set seed for reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)


# Load file

In [4]:
# Load the saved df_prelim parquet file: 
relative_path = os.path.join("..", "src", "df_reduced.parquet")
df_reduced = pd.read_parquet(relative_path)

# Split training & test sets

In [5]:
# Define X and y:
X = df_reduced.drop(columns=['delayType'])  # Use parentheses with the 'columns' argument
y = df_reduced['delayType']


In [6]:
# Perform stratified split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)


In [7]:
# One-hot encode the target for neural network
y_train_encoded = to_categorical(y_train - 1)  # Zero-indexed for NN
y_val_encoded = to_categorical(y_val - 1)
y_test_encoded = to_categorical(y_test - 1)

# Hyperparameter tuning

In [8]:
# Compute class weights
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)
class_weights = dict(enumerate(class_weights))
print("Class weights:", class_weights)

Class weights: {0: 3.1258878275777615, 1: 0.4759294477383749, 2: 1.7272973338746787}


In [9]:
def build_model(hp):
    model = Sequential()
    model.add(Dense(hp.Int('units', min_value=64, max_value=256, step=64),
                    activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(hp.Float('dropout', 0.1, 0.5, step=0.1)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', [0.001, 0.0001])),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model



In [10]:
tuner = kt.Hyperband(build_model, 
                     objective='val_accuracy', 
                     max_epochs=30, 
                     factor=3)

# Hyperparameter search
tuner.search(X_train, y_train_encoded, 
             validation_data=(X_val, y_val_encoded), 
             class_weight=class_weights)





Reloading Tuner from .\untitled_project\tuner0.json


In [11]:
# Display the best hyperparameters 
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)   

{'units': 256, 'dropout': 0.2, 'learning_rate': 0.001, 'tuner/epochs': 10, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}


In [12]:
# Print the best model's architecture 
best_model = tuner.hypermodel.build(best_hps)
best_model.summary()  # This will print the architecture


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               13056     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                16448     
                                                                 
 dense_2 (Dense)             (None, 3)                 195       
                                                                 
Total params: 29,699
Trainable params: 29,699
Non-trainable params: 0
_________________________________________________________________


# Fitting the model

In [13]:
history = best_model.fit(X_train, y_train_encoded,
                         validation_data=(X_val, y_val_encoded),
                         class_weight=class_weights,
                         epochs=30,
                         batch_size=32,
                         verbose=1)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [14]:
y_pred_proba = best_model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1) + 1  # Convert back to original labels




# Model Evaluation

In [15]:
train_score = best_model.evaluate(X_train, y_train_encoded, verbose=0)
test_score = best_model.evaluate(X_test, y_test_encoded, verbose=0)
print(f"Training Data Score: {train_score[1]:.2f}")
print(f"Testing Data Score: {test_score[1]:.2f}")


Training Data Score: 0.66
Testing Data Score: 0.55


*Observation:*
- The Neural Network model performs worse on the testing set than the training set, suggesting there may be some overfitting during training.

In [19]:
# Evaluation scores
print(classification_report(y_test, y_pred))

balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy: {balanced_acc:.2f}")


              precision    recall  f1-score   support

           1       0.30      0.65      0.41       292
           2       0.84      0.50      0.63      1916
           3       0.36      0.65      0.46       528

    accuracy                           0.55      2736
   macro avg       0.50      0.60      0.50      2736
weighted avg       0.69      0.55      0.57      2736

Balanced Accuracy: 0.60


# **Conclusion:**
- This 5th model performs worse than the 3rd model across the scores. 
- The 3rd model is the optimal one so far.

---
---