# Neural Networks

In [1]:
import pandas as pd
from joblib import dump
from pathlib import Path
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score

# Folder (under the user's home directory) holding the prepared CSV and saved models
data_path = Path.home().joinpath(
    'OneDrive', 'Kirstin', 'Uni', 'Year4', 'MSciProject', 'data_files', 'saved_files'
)

### Initialising

In [2]:
# initialising WandB
import os
import wandb

# SECURITY: never store the W&B API key in the notebook. Authenticate outside
# the notebook instead, either by exporting WANDB_API_KEY in the shell that
# launches Jupyter or by running `wandb login` once on this machine.
# A key that was previously committed in this cell should be treated as
# compromised and revoked in the W&B account settings.

# Tells WandB which notebook the runs originate from
os.environ["WANDB_NOTEBOOK_NAME"] = "NN_models.ipynb"


# Syntax for using WandB:

# wandb.init(project="MSciProject", name="name", notebook="your-notebook-name")
# code here
# wandb.finish()

In [3]:
# Read the prepared dataset; the 'time' column is parsed to datetimes on load
csv_file = data_path / 'for_model.csv'
data = pd.read_csv(csv_file, parse_dates=['time'])

# Preview the first rows
data.head()

Unnamed: 0,time,flag,u10_0,u10_1,u10_2,u10_3,u10_4,u10_5,u10_6,u10_7,...,v500_2_past,v500_3_past,v500_4_past,v500_5_past,v500_6_past,v500_7_past,v500_8_past,v500_13_past,v500_14_past,v500_15_past
0,1998-01-02 07:50:00,1.0,14.852083,9.32453,1.611875,7.808569,18.99734,19.074768,17.866888,21.336483,...,-1.01051,-3.850661,-18.958298,-12.515166,-1.018695,13.810657,17.628082,-19.750593,-15.995375,16.693369
1,1998-01-02 15:52:00,0.0,14.592901,14.594532,0.362429,10.770398,14.584751,13.906645,10.283009,12.536574,...,17.780321,13.497995,-6.243918,1.610286,-9.154458,5.615963,22.95971,-21.638025,-12.137025,-1.25933
2,1998-01-04 16:37:00,1.0,12.714251,10.323761,1.472504,15.92548,20.108229,18.527065,14.082691,16.077892,...,16.608246,8.840803,3.836572,-9.439291,-40.350285,-2.100735,1.187947,-20.215494,-7.206458,-8.54714
3,1998-01-04 20:38:00,1.0,12.595257,9.802139,1.850679,13.43963,17.74219,14.301935,15.184616,11.609882,...,11.294628,-0.735498,-11.130286,-38.63146,-19.680204,-11.981513,-1.061256,-15.533747,-6.54512,-23.261904
4,1998-01-05 00:39:00,1.0,9.671734,7.063629,1.205173,11.726432,13.065531,16.226229,17.666391,3.23868,...,6.038302,-3.164769,-24.450348,-10.958404,-2.891393,10.613646,4.468444,-15.044292,-1.735689,20.71378


In [4]:
# 'time' is already a datetime column (parsed by read_csv), so no conversion is needed here.

# Chronological split by calendar year (both bounds inclusive):
#   train 2016-2018, validation 2019, test 2020-2023
year = data['time'].dt.year
train_data = data[(year >= 2016) & (year <= 2018)]
val_data = data[(year >= 2019) & (year <= 2019)]
test_data = data[(year >= 2020) & (year <= 2023)]

# Report the time span and size of each split (sizes are before rows with a missing flag are removed)
for label, subset in (("Train", train_data), ("Val", val_data), ("Test", test_data)):
    print(f"{label} range: {subset['time'].min()} -> {subset['time'].max()}. Length: {len(subset)}")

# 'time' is only needed for splitting; it is not a model input
train_data = train_data.drop(columns=['time'])
val_data = val_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# Target (y): the 'flag' column, keeping only rows where it is present
y_train = train_data['flag'].dropna()
y_val = val_data['flag'].dropna()
y_test = test_data['flag'].dropna()

# Features (X): every remaining column, restricted to the rows that still have a target
X_train = train_data.drop(columns=['flag']).loc[y_train.index]
X_val = val_data.drop(columns=['flag']).loc[y_val.index]
X_test = test_data.drop(columns=['flag']).loc[y_test.index]

Train range: 2016-01-01 02:09:00 -> 2018-12-31 16:10:00. Length: 3118
Val range: 2019-01-01 13:50:00 -> 2019-12-29 18:29:00. Length: 642
Test range: 2020-01-03 00:50:00 -> 2022-12-31 08:37:00. Length: 2978


### Models

#### Default Parameters

Neural network (MLP classifier) with scikit-learn's default hyperparameters, evaluated on the validation set.

In [5]:
# Baseline MLP: library defaults apart from the iteration cap and a fixed seed
nn_model = MLPClassifier(max_iter=1000, random_state=42)
nn_model.fit(X_train, y_train)

# Predictions on the data the model was fitted on and on the validation year
y_pred_train = nn_model.predict(X_train)
y_pred_val = nn_model.predict(X_val)

# Precision, recall and F1 for each of the two sets
precision_train, recall_train, f1_train = (
    scorer(y_train, y_pred_train) for scorer in (precision_score, recall_score, f1_score)
)
precision_val, recall_val, f1_val = (
    scorer(y_val, y_pred_val) for scorer in (precision_score, recall_score, f1_score)
)

# Training and validation values are printed side by side for each metric
for metric_name, on_train, on_val in (
    ("Precision", precision_train, precision_val),
    ("Recall", recall_train, recall_val),
    ("F1 Score", f1_train, f1_val),
):
    print(f"{metric_name} on Training Set = {on_train:.3f}")
    print(f"{metric_name} on Validation Set = {on_val:.3f}")

Precision on Training Set = 0.907
Precision on Validation Set = 0.870
Recall on Training Set = 0.918
Recall on Validation Set = 0.874
F1 Score on Training Set = 0.913
F1 Score on Validation Set = 0.872


#### Optimising Model

##### Grid Search for Hyperparameter Tuning

In [38]:
# Base estimator for the search; the seed is fixed so every candidate is reproducible
model = MLPClassifier(random_state=42)

# hyperparameters to explore (3*2*2*2*2*3*2*2 = 576 combinations, each fitted on 5 folds)
param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
    'batch_size': [100, 200, 300],
    'max_iter': [1000, 2000],
    'early_stopping': [True, False]
}

# Exhaustive 5-fold cross-validated search, ranking candidates by F1; n_jobs=-1 uses all cores
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, scoring='f1', cv=5)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Best candidate found by the search
results = grid_search.best_estimator_

# NOTE: MLPClassifier.score() returns mean accuracy, not F1. Compute the F1
# explicitly from validation predictions so the printed number is really the
# metric named in its label (and matches the search's scoring='f1').
validation_f1 = f1_score(y_val, results.predict(X_val))

print(f'Validation F1 Score: {validation_f1:.3f}')
print(f'Best Parameters: {grid_search.best_params_}')

Best parameters: {'activation': 'relu', 'alpha': 0.05, 'batch_size': 100, 'early_stopping': True, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 1000, 'solver': 'adam'}
Best F1 score: 0.898


#### Exploring Optimised Hyperparameters

In [6]:
# wandb.init(project="NeuralNetworks")

# Tuned MLP: hidden_layer_sizes, activation, solver, alpha, learning_rate,
# batch_size, early_stopping and max_iter are taken from the grid search;
# shuffle, learning_rate_init and beta_2 were not in the grid and are set by hand.
nn_model = MLPClassifier(random_state=42,
                         max_iter=1000, 
                         hidden_layer_sizes=(100,), 
                         shuffle=False,
                         activation='relu', 
                         solver='adam', 
                         alpha=0.05, 
                         learning_rate='constant', 
                         batch_size=100, 
                         early_stopping=True,
                         learning_rate_init=0.001,
                         beta_2=0.9,)

nn_model.fit(X_train, y_train)

# Predictions
y_pred_val = nn_model.predict(X_val)
y_pred_train = nn_model.predict(X_train)

# calculating scores
precision_val = precision_score(y_val, y_pred_val)
precision_train = precision_score(y_train, y_pred_train)
recall_val = recall_score(y_val, y_pred_val)
recall_train = recall_score(y_train, y_pred_train)
f1_val = f1_score(y_val, y_pred_val)
f1_train = f1_score(y_train, y_pred_train)

# These figures come from X_val / y_val, so they are labelled as validation
# results; the held-out test set is only scored in the following cell.
print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")

# Optional WandB logging (uses the validation metrics computed in this cell).
# To log date ranges as well, define the date-range variables in the split cell first.
# wandb.log({"model_name":"Neural Network", "training_precision":precision_train, "validation_precision":precision_val,
#            "training_recall":recall_train, "validation_recall":recall_val, "training_f1":f1_train, "validation_f1":f1_val})

# wandb.finish()

Precision on Training Set = 0.883
Precision on Testing Set = 0.833
Recall on Training Set = 0.962
Recall on Testing Set = 0.932
F1 Score on Training Set = 0.921
F1 Score on Testing Set = 0.880


##### Evaluating Model on Test Data

In [7]:
# Score the fitted model on the held-out test split
y_pred_test = nn_model.predict(X_test)

precision_test, recall_test, f1_test = (
    scorer(y_test, y_pred_test) for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, value in (
    ("Precision", precision_test),
    ("Recall", recall_test),
    ("F1 Score", f1_test),
):
    print(f"{metric_name} on Testing Set = {value:.3f}")

Precision on Testing Set = 0.891
Recall on Testing Set = 0.954
F1 Score on Testing Set = 0.921


In [46]:
# Persist the fitted classifier alongside the data files
model_file = data_path / 'nn_model.joblib'
dump(nn_model, model_file)

['C:\\Users\\kirst\\OneDrive\\Kirstin\\Uni\\Year4\\MSciProject\\data_files\\saved_files\\nn_model.joblib']