# Neural Networks

In [1]:
import pandas as pd
from joblib import dump
from pathlib import Path
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score

import sys
sys.path.append('../')
import config

data_path = Path.home()/'OneDrive'/'Kirstin'/'Uni'/'Year4'/'MSciProject'/'data_files'/'saved_files'

In [2]:
site = config.site
site_name = config.site_dict[site]
compound = config.compound

print(f"Exploring {compound} at {site_name}.")

Exploring sf6 at Gosan, South Korea.


### Initialising

In [3]:
# Load data
data = pd.read_csv(data_path/f'for_model_{compound}_{site}.csv', parse_dates=['time'])

data.head()

Unnamed: 0,time,flag,u10_0,u10_1,u10_2,u10_3,u10_4,u10_5,u10_6,u10_7,...,v500_2_past,v500_3_past,v500_4_past,v500_5_past,v500_6_past,v500_7_past,v500_8_past,v500_13_past,v500_14_past,v500_15_past
0,2007-11-12 09:00:00,1.0,-0.206935,2.064226,1.21283,1.286915,1.535215,-3.18769,-2.538291,-2.120406,...,-19.086115,-22.730377,-7.834412,-8.643053,-2.242004,-5.636626,4.728797,-1.143111,4.00974,0.372644
1,2007-11-12 11:00:00,1.0,-0.428611,1.530584,2.35825,1.200097,-0.030984,-2.226324,-1.164828,-1.339043,...,-13.783958,-26.73178,-8.481802,-8.374302,-1.887253,-5.352347,2.065177,-0.155302,4.334629,1.973205
2,2007-11-12 13:00:00,1.0,-0.483596,1.543896,2.329311,1.201254,-1.418916,-2.85836,-0.039087,-2.138927,...,-9.76464,-26.44989,-10.537448,-8.391025,-2.325615,-2.430727,-0.247275,-0.39897,3.844906,2.299289
3,2007-11-12 15:00:00,1.0,-0.732474,1.833869,1.425824,0.902021,-2.454367,-2.814372,0.606261,-2.136612,...,-9.038415,-23.884214,-11.67934,-7.968189,-2.319643,-0.253247,-0.155302,-0.631887,2.570429,3.105542
4,2007-11-12 17:00:00,1.0,-0.446553,1.407882,0.664718,1.079709,-3.140808,-2.384333,0.771794,-1.429334,...,-9.517389,-21.478596,-10.475337,-6.289989,-1.311528,2.177455,-1.407084,-1.211195,2.343484,2.292123


In [4]:
if site == "MHD":
    train_data = data[(data['time'].dt.year >= 2014) & (data['time'].dt.year <= 2018)]
    val_data = data[(data['time'].dt.year >= 2019) & (data['time'].dt.year <= 2019)]
    test_data = data[(data['time'].dt.year >= 2020) & (data['time'].dt.year <= 2023)]

if site == "GSN":
    train_data = data[(data['time'].dt.year >= 2009) & (data['time'].dt.year <= 2013)]
    val_data = data[(data['time'].dt.year >= 2014) & (data['time'].dt.year <= 2014)]
    test_data = data[(data['time'].dt.year >= 2015) & (data['time'].dt.year <= 2017)]

print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}. Length: {len(train_data)}")
print(f"Val range: {val_data['time'].min()} -> {val_data['time'].max()}. Length: {len(val_data)}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}. Length: {len(test_data)}")


# Drop the "time" column as it won't be used in the model
train_data = train_data.drop(columns=['time'])
val_data = val_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# Define the features (X) and the target (y)
X_train = train_data.drop(columns=['flag'])
y_train = train_data['flag']
X_val = val_data.drop(columns=['flag'])
y_val = val_data['flag']
X_test = test_data.drop(columns=['flag'])
y_test = test_data['flag']

# Balanced Data - removing NaN values and associated data
y_train = y_train.dropna()
y_val = y_val.dropna()
y_test = y_test.dropna()

X_train = X_train.loc[y_train.index]
X_val = X_val.loc[y_val.index]
X_test = X_test.loc[y_test.index]

Train range: 2009-01-01 01:00:00 -> 2013-12-30 09:00:00. Length: 5763
Val range: 2014-01-03 19:00:00 -> 2014-12-31 23:00:00. Length: 933
Test range: 2015-01-01 01:00:00 -> 2017-12-31 23:00:00. Length: 2824


### Models

#### Default Parameters

NN model with original parameters, evaluating based on performance on validation set.

In [5]:
# setting up a neural network model with default parameters
nn_model = MLPClassifier(max_iter=1000, random_state=42)

nn_model.fit(X_train, y_train)

# Predictions
y_pred_val = nn_model.predict(X_val)
y_pred_train = nn_model.predict(X_train)

# calculating scores
precision_val = precision_score(y_val, y_pred_val)
precision_train = precision_score(y_train, y_pred_train)
recall_val = recall_score(y_val, y_pred_val)
recall_train = recall_score(y_train, y_pred_train)
f1_val = f1_score(y_val, y_pred_val)
f1_train = f1_score(y_train, y_pred_train)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")

Precision on Training Set = 0.850
Precision on Validation Set = 0.778
Recall on Training Set = 0.988
Recall on Validation Set = 0.996
F1 Score on Training Set = 0.914
F1 Score on Validation Set = 0.873


In [6]:
# evaluating the model on the test set
y_pred_test = nn_model.predict(X_test)

precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

print(f"Precision on Test Set = {precision_test:.3f}")
print(f"Recall on Test Set = {recall_test:.3f}")
print(f"F1 Score on Test Set = {f1_test:.3f}")

Precision on Test Set = 0.797
Recall on Test Set = 0.992
F1 Score on Test Set = 0.884


#### Optimising Model

##### Grid Search for Hyperparameter Tuning

In [7]:
model = MLPClassifier(random_state=42)

# hyperparameters to explore
param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
    'batch_size': [100, 200, 300],
    'max_iter': [1000, 2000],
    'early_stopping': [True, False]
}

grid_search = GridSearchCV(model, param_grid, n_jobs=-1, scoring='precision', cv=5)

# Fit the grid search
grid_search.fit(X_train, y_train)

# extracting best parameters and score
results = grid_search.best_estimator_

validation_f1 = results.score(X_val, y_val)

print(f'Validation F1 Score: {validation_f1:.3f}')
print(f'Best Parameters: {grid_search.best_params_}')

Validation F1 Score: 0.405
Best Parameters: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 100, 'early_stopping': False, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 1000, 'solver': 'adam'}


#### Exploring Optimised Hyperparameters

In [8]:
nn_model = MLPClassifier(random_state=42,
                         max_iter=1000, 
                         hidden_layer_sizes=(100,), 
                         shuffle=False,
                         activation='relu', 
                         solver='adam', 
                         alpha=0.0001, 
                         learning_rate='constant', 
                         batch_size=100, 
                         early_stopping=False,
                         learning_rate_init=0.0001,
                         beta_2=0.9,)

nn_model.fit(X_train, y_train)

# Predictions
y_pred_val = nn_model.predict(X_val)
y_pred_train = nn_model.predict(X_train)

# calculating scores
precision_val = precision_score(y_val, y_pred_val)
precision_train = precision_score(y_train, y_pred_train)
recall_val = recall_score(y_val, y_pred_val)
recall_train = recall_score(y_train, y_pred_train)
f1_val = f1_score(y_val, y_pred_val)
f1_train = f1_score(y_train, y_pred_train)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Testing Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Testing Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Testing Set = {f1_val:.3f}")

Precision on Training Set = 0.905
Precision on Testing Set = 0.879
Recall on Training Set = 0.715
Recall on Testing Set = 0.773
F1 Score on Training Set = 0.799
F1 Score on Testing Set = 0.823


##### Evaluating Model on Test Data

In [9]:
y_pred_test = nn_model.predict(X_test)

precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

print(f"Precision on Testing Set = {precision_test:.3f}")
print(f"Recall on Testing Set = {recall_test:.3f}")
print(f"F1 Score on Testing Set = {f1_test:.3f}")

Precision on Testing Set = 0.890
Recall on Testing Set = 0.770
F1 Score on Testing Set = 0.826


In [46]:
# saving the model
dump(nn_model, data_path/'nn_model.joblib')

['C:\\Users\\kirst\\OneDrive\\Kirstin\\Uni\\Year4\\MSciProject\\data_files\\saved_files\\nn_model.joblib']