# Finetuning of Neural Network Model for a New Site

In [1]:
import pandas as pd
import numpy as np
from joblib import load, dump
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.inspection import permutation_importance

import sys
sys.path.append('../')
import config

# directory holding the saved models and prepared CSV files, located under the current user's home folder
data_path = Path.home().joinpath('OneDrive', 'Kirstin', 'Uni', 'Year4', 'MSciProject', 'data_files', 'saved_files')

In [2]:
# site codes: `site` is the site the existing model is based on,
# `transferred_site` is the site the model is being adapted to
site, transferred_site = 'MHD', 'GSN'

# display names for both codes, looked up in the project config
site_name, transferred_site_name = (config.site_dict[code] for code in (site, transferred_site))

# compound identifier taken from the project config (used to build data file names)
compound = config.compound

print(f"Finetuning a neural network model based on \033[1m{site_name}\033[0;0m to be applicable at \033[1m{transferred_site_name}\033[0;0m.")

Finetuning a neural network model based on [1mMace Head, Ireland[0;0m to be applicable at [1mGosan, South Korea[0;0m.


### Loading in Model & Initialising Data

In [3]:
# loading in model
original_model = load(data_path/f'nn_model_{site}.joblib')

# loading in training data
original_data = pd.read_csv(data_path/f'for_model_{compound}_{site}.csv', parse_dates=['time'])

# original-site training window: rows whose timestamp falls in 2016-2018 (inclusive);
# 'time' is dropped afterwards so it is not used as a feature
training = original_data[original_data['time'].dt.year.between(2016, 2018)]
training = training.drop(columns=['time'])

# target = 'flag' with missing values removed; features = all other columns for the same rows.
# These use an `_orig` suffix on purpose: the full-retuning cell later defines X_train / y_train
# for the transferred site, and sharing those names would let a re-run of this cell silently
# swap the original site's data into the retraining step.
y_train_orig = training['flag'].dropna()
X_train_orig = training.drop(columns=['flag']).loc[y_train_orig.index]

# training model on original site training data
original_model.fit(X_train_orig, y_train_orig)

In [4]:
# read the prepared table for the transferred site, parsing the 'time' column as datetimes
transferred_file = data_path/f'for_model_{compound}_{transferred_site}.csv'
data = pd.read_csv(transferred_file, parse_dates=['time'])

# preview five randomly drawn rows
data.sample(n=5)

Unnamed: 0,time,flag,u10_0,u10_1,u10_2,u10_3,u10_4,u10_5,u10_6,u10_7,...,v500_2_past,v500_3_past,v500_4_past,v500_5_past,v500_6_past,v500_7_past,v500_8_past,v500_13_past,v500_14_past,v500_15_past
6387,2013-03-17 03:00:00,1.0,-2.534051,-0.383627,3.186323,0.65929,-3.891838,-1.558571,0.771371,-5.957251,...,3.116105,-3.509091,-0.834657,2.346402,-0.641657,-1.307968,6.003069,-1.114968,-2.421166,-0.016704
6304,2013-02-23 05:00:00,1.0,2.615645,6.656462,9.826373,4.090222,2.26304,-1.67927,-3.596373,-1.309489,...,-11.102421,-8.887543,-7.811907,-14.584892,-10.239186,-15.207329,-14.961988,-1.848773,1.722293,-16.225037
10065,2017-02-10 15:00:00,1.0,6.233916,3.567819,3.81769,3.398999,8.864523,7.446825,-0.613816,5.819541,...,-9.765146,-14.710276,-1.067909,-9.062006,-7.778808,-24.811352,-20.20256,-3.255318,-11.194578,-21.482105
5453,2012-03-29 23:00:00,1.0,1.126471,0.854506,6.679174,0.501291,-5.972256,-1.068994,0.272773,2.581086,...,6.064084,3.146364,1.281326,-3.964563,2.267668,3.212704,3.294065,-5.834608,0.357569,2.516757
2307,2009-11-03 07:00:00,1.0,0.855831,2.172136,9.000013,1.054592,-6.778291,-6.584008,-2.748751,1.178888,...,-25.025118,-19.689665,3.85775,-6.332026,-9.478883,-6.238,-1.566728,3.056531,-2.046859,-6.744138


In [5]:
# setting up data for finetuning
# calendar year of every row, used to carve out chronological train / val / test splits
years_ft = data['time'].dt.year

train_data_ft = data[years_ft.between(2013, 2014)]  # 2013-2014 inclusive
val_data_ft = data[years_ft == 2015]                # 2015 only
test_data_ft = data[years_ft.between(2016, 2017)]   # 2016-2017 inclusive

# report the time span and row count of each split (counted before rows without a flag are removed)
print(f"Train range: {train_data_ft['time'].min()} -> {train_data_ft['time'].max()}. Length: {len(train_data_ft)}")
print(f"Val range: {val_data_ft['time'].min()} -> {val_data_ft['time'].max()}. Length: {len(val_data_ft)}")
print(f"Test range: {test_data_ft['time'].min()} -> {test_data_ft['time'].max()}. Length: {len(test_data_ft)}")

# 'time' was only needed to define the splits; remove it so it is not used as a feature
train_data_ft, val_data_ft, test_data_ft = (
    split.drop(columns=['time']) for split in (train_data_ft, val_data_ft, test_data_ft)
)

# targets: the 'flag' column with missing values removed
y_train_ft = train_data_ft['flag'].dropna()
y_val_ft = val_data_ft['flag'].dropna()
y_test_ft = test_data_ft['flag'].dropna()

# features: every remaining column, restricted to the rows that still have a target
X_train_ft = train_data_ft.drop(columns=['flag']).loc[y_train_ft.index]
X_val_ft = val_data_ft.drop(columns=['flag']).loc[y_val_ft.index]
X_test_ft = test_data_ft.drop(columns=['flag']).loc[y_test_ft.index]

Train range: 2013-01-01 01:00:00 -> 2014-12-31 23:00:00. Length: 2004
Val range: 2015-01-01 01:00:00 -> 2015-12-31 19:00:00. Length: 812
Test range: 2016-01-01 21:00:00 -> 2017-12-31 23:00:00. Length: 2012


In [6]:
# setting up data for full retuning
# calendar year of every row, used to carve out chronological train / val / test splits
years_full = data['time'].dt.year

train_data = data[years_full.between(2009, 2013)]  # 2009-2013 inclusive
val_data = data[years_full == 2014]                # 2014 only
test_data = data[years_full.between(2015, 2017)]   # 2015-2017 inclusive

# report the time span and row count of each split (counted before rows without a flag are removed)
print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}. Length: {len(train_data)}")
print(f"Val range: {val_data['time'].min()} -> {val_data['time'].max()}. Length: {len(val_data)}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}. Length: {len(test_data)}")

# 'time' was only needed to define the splits; remove it so it is not used as a feature
train_data, val_data, test_data = (
    split.drop(columns=['time']) for split in (train_data, val_data, test_data)
)

# targets: the 'flag' column with missing values removed
y_train = train_data['flag'].dropna()
y_val = val_data['flag'].dropna()
y_test = test_data['flag'].dropna()

# features: every remaining column, restricted to the rows that still have a target
X_train = train_data.drop(columns=['flag']).loc[y_train.index]
X_val = val_data.drop(columns=['flag']).loc[y_val.index]
X_test = test_data.drop(columns=['flag']).loc[y_test.index]

Train range: 2009-01-01 01:00:00 -> 2013-12-30 09:00:00. Length: 5763
Val range: 2014-01-03 19:00:00 -> 2014-12-31 23:00:00. Length: 933
Test range: 2015-01-01 01:00:00 -> 2017-12-31 23:00:00. Length: 2824


### Testing Existing Model on Data

In [7]:
# predictions of the model, before any finetuning, on the transferred site's validation split
y_val_pred = original_model.predict(X_val)

# score the predictions against the true validation flags with each metric in turn
precision, recall, f1 = (
    metric(y_val, y_val_pred) for metric in (precision_score, recall_score, f1_score)
)

print(f'{site} Model Precision: {precision:.2f}')
print(f'{site} Model Recall: {recall:.2f}')
print(f'{site} Model F1: {f1:.2f}')

MHD Model Precision: 0.87
MHD Model Recall: 0.80
MHD Model F1: 0.83


### Finetuning Model

In [8]:
# allowing warm start (& therefore fine-tuning) and raising the iteration limit by 250
# NOTE(review): with warm_start the solver may treat max_iter as the budget for this fit call
# rather than as a running total across fits - confirm against the scikit-learn docs.
# NOTE: re-running this cell raises max_iter by another 250 and trains the model further.
original_model.set_params(warm_start=True, max_iter=original_model.max_iter + 250)

# fitting the model to the transferred site's finetuning training split (years 2013-2014)
original_model.fit(X_train_ft, y_train_ft)

In [9]:
# evaluating model on validation set
# predictions of the finetuned model on the finetuning validation split
y_val_pred = original_model.predict(X_val_ft)

# score the predictions against the true validation flags with each metric in turn
precision, recall, f1 = (
    metric(y_val_ft, y_val_pred) for metric in (precision_score, recall_score, f1_score)
)

print(f'Finetuned Model Precision: {precision:.2f}')
print(f'Finetuned Model Recall: {recall:.2f}')
print(f'Finetuned Model F1: {f1:.2f}')

Finetuned Model Precision: 0.94
Finetuned Model Recall: 0.21
Finetuned Model F1: 0.35


In [10]:
# exploring the distribution of the predictions
# bincount needs non-negative integers, so the predicted labels are cast to int first
y_val_pred_int = y_val_pred.astype(int)

# counts[0] = number of predictions of class 0 (non-baseline), counts[1] = class 1 (baseline).
# minlength=2 guarantees both slots exist: without it, a prediction vector containing only 0s
# yields a length-1 array and counts[1] below raises IndexError instead of reporting 0.
counts = np.bincount(y_val_pred_int, minlength=2)

print("Number of non-baselines:", counts[0])
print("Number of baselines:", counts[1])

# sanity check: stop the notebook if every sample was predicted as baseline
assert counts[0] > 0, "Model has predicted no non-baselines. This is likely due to the model being overfit."

Number of non-baselines: 681
Number of baselines: 131


In [11]:
# saving model
# the finetuned model is written next to the other saved files, named after the transferred site
finetuned_model_file = data_path/f'nn_model_{transferred_site}_finetuned.joblib'
dump(original_model, finetuned_model_file)

['C:\\Users\\kirst\\OneDrive\\Kirstin\\Uni\\Year4\\MSciProject\\data_files\\saved_files\\nn_model_GSN_finetuned.joblib']

### Retuning Model Completely

In [12]:
# a fresh network trained only on the transferred site's data (no weights carried over)
new_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    alpha=0.05,
    batch_size=100,
    learning_rate='constant',
    learning_rate_init=0.001,
    beta_2=0.9,
    max_iter=1000,
    shuffle=False,
    early_stopping=True,
    random_state=42,  # fixed seed
)

new_model.fit(X_train, y_train)

# predictions on the data the model was fitted on and on the held-out validation split
y_train_pred = new_model.predict(X_train)
y_val_pred = new_model.predict(X_val)

# precision / recall / F1 for each split, so training and validation scores can be compared
precision_train, recall_train, f1_train = (
    metric(y_train, y_train_pred) for metric in (precision_score, recall_score, f1_score)
)
precision_val, recall_val, f1_val = (
    metric(y_val, y_val_pred) for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")

Precision on Training Set = 0.863
Precision on Validation Set = 0.802
Recall on Training Set = 0.977
Recall on Validation Set = 0.988
F1 Score on Training Set = 0.916
F1 Score on Validation Set = 0.885


In [13]:
# evaluating model on test set
# predictions of the newly trained model on the held-out test split
y_test_pred = new_model.predict(X_test)

# score the predictions against the true test flags with each metric in turn
precision, recall, f1 = (
    metric(y_test, y_test_pred) for metric in (precision_score, recall_score, f1_score)
)

print(f"Test Precision: {precision:.3f}")
print(f"Test Recall: {recall:.3f}")
print(f"Test F1 Score: {f1:.3f}")

Test Precision: 0.813
Test Recall: 0.983
Test F1 Score: 0.890
