# Finetuning of Neural Network Model for a New Site

In [63]:
import pandas as pd
import numpy as np
from joblib import load, dump
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.inspection import permutation_importance

import sys
sys.path.append('../')
import config

# directory (under the user's home) that the saved models and model-input CSVs are read from / written to
data_path = Path.home().joinpath(
    'OneDrive', 'Kirstin', 'Uni', 'Year4', 'MSciProject', 'data_files', 'saved_files'
)

In [64]:
# site codes: `site` is the site the existing model is based on,
# `transferred_site` is the site the model is being adapted to
site, transferred_site = 'MHD', 'GSN'

# display names for both sites are looked up in the project config
site_name = config.site_dict[site]
transferred_site_name = config.site_dict[transferred_site]

# compound identifier, used below when building the model-input CSV filenames
compound = config.compound

print(f"Finetuning a neural network model based on \033[1m{site_name}\033[0;0m to be applicable at \033[1m{transferred_site_name}\033[0;0m.")

Finetuning a neural network model based on Mace Head, Ireland to be applicable at Gosan, South Korea.


### Loading in Model & Initialising Data

In [65]:
# restore the saved network for the source site
original_model = load(data_path / f'nn_model_{site}.joblib')

# read the source site's model-input table, parsing the timestamp column
original_data = pd.read_csv(data_path / f'for_model_{compound}_{site}.csv', parse_dates=['time'])

# training window is 2016-2018 inclusive; the timestamp is not a model input
in_training_years = original_data['time'].dt.year.between(2016, 2018)
training = original_data[in_training_years].drop(columns=['time'])

# only rows that actually carry a flag can be used for supervised fitting
has_flag = training['flag'].notna()
X_train = training.loc[has_flag].drop(columns=['flag'])
y_train = training.loc[has_flag, 'flag']

# (re)fit the loaded model on the source-site training window
original_model.fit(X_train, y_train)

In [66]:
# read the model-input table for the site the model is being transferred to
transferred_csv = data_path / f'for_model_{compound}_{transferred_site}.csv'
data = pd.read_csv(transferred_csv, parse_dates=['time'])

# quick look at a handful of random rows
data.sample(5)

Unnamed: 0,time,flag,u10_0,u10_1,u10_2,u10_3,u10_4,u10_5,u10_6,u10_7,...,v500_7_past,v500_8_past,v500_9_past,v500_10_past,v500_11_past,v500_12_past,v500_13_past,v500_14_past,v500_15_past,v500_16_past
4701,2011-08-13 03:00:00,1.0,-0.881042,-1.119953,-0.294964,0.66068,4.564447,2.332123,1.443673,0.984516,...,7.644339,4.067718,-0.310674,-4.241988,-1.066252,4.635225,2.467476,0.331072,-6.82547,-1.123993
6052,2012-12-18 05:00:00,1.0,4.048647,3.938265,-4.374005,3.932207,7.54855,2.019376,0.362979,3.888459,...,-1.294484,-16.213642,-17.856808,1.448437,14.860284,0.110616,3.191226,0.177895,-1.784846,-16.18259
9791,2016-12-30 13:00:00,1.0,1.05682,1.522341,8.544664,0.634604,-5.384145,-4.480169,0.148513,-2.79834,...,-1.742884,0.673661,-8.972642,-16.93361,-1.572505,-1.270082,4.801098,3.920806,0.134127,-1.85363
371,2008-01-23 17:00:00,1.0,5.752649,4.386697,3.51168,4.954211,9.705658,3.22911,-0.050717,2.818316,...,8.931255,6.070133,5.884708,-6.165054,-3.736271,-7.964544,-0.486044,1.930249,3.528942,2.904453
10709,2017-11-11 09:00:00,1.0,-0.853929,1.777461,0.458568,0.083797,2.077065,-3.0679,-2.061937,-2.960747,...,-2.262307,-3.014819,-14.346397,-6.866427,2.280358,2.266561,7.707224,2.73939,0.005263,1.126506


In [67]:
# setting up data for finetuning: calendar-year train / validation / test partitions
ft_year = data['time'].dt.year
train_data_ft = data[ft_year.between(2011, 2014)]
val_data_ft = data[ft_year == 2015]
test_data_ft = data[ft_year.between(2016, 2017)]

# report the span and size of each partition (counted before unflagged rows are removed)
print(f"Train range: {train_data_ft['time'].min()} -> {train_data_ft['time'].max()}. Length: {len(train_data_ft)}")
print(f"Val range: {val_data_ft['time'].min()} -> {val_data_ft['time'].max()}. Length: {len(val_data_ft)}")
print(f"Test range: {test_data_ft['time'].min()} -> {test_data_ft['time'].max()}. Length: {len(test_data_ft)}")

# the timestamp was only needed for splitting and reporting
train_data_ft = train_data_ft.drop(columns=['time'])
val_data_ft = val_data_ft.drop(columns=['time'])
test_data_ft = test_data_ft.drop(columns=['time'])

# keep only rows that have a flag, then separate the target from the predictors
train_flagged_ft = train_data_ft.dropna(subset=['flag'])
val_flagged_ft = val_data_ft.dropna(subset=['flag'])
test_flagged_ft = test_data_ft.dropna(subset=['flag'])

X_train_ft, y_train_ft = train_flagged_ft.drop(columns=['flag']), train_flagged_ft['flag']
X_val_ft, y_val_ft = val_flagged_ft.drop(columns=['flag']), val_flagged_ft['flag']
X_test_ft, y_test_ft = test_flagged_ft.drop(columns=['flag']), test_flagged_ft['flag']

Train range: 2011-01-01 11:00:00 -> 2014-12-31 23:00:00. Length: 4119
Val range: 2015-01-01 01:00:00 -> 2015-12-31 19:00:00. Length: 812
Test range: 2016-01-01 21:00:00 -> 2017-12-31 23:00:00. Length: 2012


In [68]:
# setting up data for full retuning: calendar-year train / validation / test partitions
retune_year = data['time'].dt.year
train_data = data[retune_year.between(2009, 2013)]
val_data = data[retune_year == 2014]
test_data = data[retune_year.between(2015, 2017)]

# report the span and size of each partition (counted before unflagged rows are removed)
print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}. Length: {len(train_data)}")
print(f"Val range: {val_data['time'].min()} -> {val_data['time'].max()}. Length: {len(val_data)}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}. Length: {len(test_data)}")

# the timestamp was only needed for splitting and reporting
train_data = train_data.drop(columns=['time'])
val_data = val_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# keep only rows that have a flag, then separate the target from the predictors
# (X_train / y_train from the source-site cell above are replaced here)
train_flagged = train_data.dropna(subset=['flag'])
val_flagged = val_data.dropna(subset=['flag'])
test_flagged = test_data.dropna(subset=['flag'])

X_train, y_train = train_flagged.drop(columns=['flag']), train_flagged['flag']
X_val, y_val = val_flagged.drop(columns=['flag']), val_flagged['flag']
X_test, y_test = test_flagged.drop(columns=['flag']), test_flagged['flag']

Train range: 2009-01-01 01:00:00 -> 2013-12-30 09:00:00. Length: 5763
Val range: 2014-01-03 19:00:00 -> 2014-12-31 23:00:00. Length: 933
Test range: 2015-01-01 01:00:00 -> 2017-12-31 23:00:00. Length: 2824


### Testing Existing Model on Data

In [69]:
# baseline: score the source-site model, before any finetuning, on the X_val / y_val split
# NOTE(review): X_val is the 2014 validation year of the full-retraining split, whereas the
# finetuned model below is validated on X_val_ft (2015) - confirm this comparison is intended
y_val_pred = original_model.predict(X_val)

precision, recall, f1 = (
    precision_score(y_val, y_val_pred),
    recall_score(y_val, y_val_pred),
    f1_score(y_val, y_val_pred),
)

print(f'{site} Model Precision: {precision:.2f}')
print(f'{site} Model Recall: {recall:.2f}')
print(f'{site} Model F1: {f1:.2f}')

MHD Model Precision: 0.81
MHD Model Recall: 0.89
MHD Model F1: 0.84


### Finetuning Model

In [70]:
# switch on warm start so the next fit continues from the current weights (fine-tuning),
# and raise the iteration cap by 500 for that additional training
# NOTE: re-running this cell raises max_iter again and trains the model further
original_model.set_params(warm_start=True, max_iter=original_model.max_iter + 500)

# continue training on the fine-tuning training split (X_train_ft / y_train_ft, years 2011-2014)
original_model.fit(X_train_ft, y_train_ft)

In [76]:
# score the finetuned model on the fine-tuning validation split
y_val_pred = original_model.predict(X_val_ft)

precision, recall, f1 = (
    precision_score(y_val_ft, y_val_pred),
    recall_score(y_val_ft, y_val_pred),
    f1_score(y_val_ft, y_val_pred),
)

print(f'Finetuned Model Precision: {precision:.3f}')
print(f'Finetuned Model Recall: {recall:.3f}')
print(f'Finetuned Model F1: {f1:.3f}')

Finetuned Model Precision: 0.800
Finetuned Model Recall: 0.922
Finetuned Model F1: 0.856


In [77]:
# score the finetuned model on the fine-tuning test split
y_test_pred = original_model.predict(X_test_ft)

precision, recall, f1 = (
    precision_score(y_test_ft, y_test_pred),
    recall_score(y_test_ft, y_test_pred),
    f1_score(y_test_ft, y_test_pred),
)

print(f'Finetuned Model Precision: {precision:.3f}')
print(f'Finetuned Model Recall: {recall:.3f}')
print(f'Finetuned Model F1: {f1:.3f}')

Finetuned Model Precision: 0.848
Finetuned Model Recall: 0.897
Finetuned Model F1: 0.872


In [72]:
# exploring the distribution of the predictions - avoiding overfitting
# uses whatever y_val_pred currently holds (the most recently run validation prediction)
y_val_pred_int = y_val_pred.astype(int)

# minlength=2 guarantees one slot per class (0 and 1); without it, a model that
# predicts only class 0 yields a length-1 array and counts[1] raises IndexError
counts = np.bincount(y_val_pred_int, minlength=2)

print("Number of non-baselines:", counts[0])
print("Number of baselines:", counts[1])

# a model that labels every point as class 1 is treated as a failure
assert counts[0] > 0, "Model has predicted no non-baselines. This is likely due to the model being overfit."

Number of non-baselines: 148
Number of baselines: 664


In [73]:
# persist the finetuned model next to the other saved files, named after the target site
finetuned_model_path = data_path / f'nn_model_{transferred_site}_finetuned.joblib'
dump(original_model, finetuned_model_path)

['C:\\Users\\kirst\\OneDrive\\Kirstin\\Uni\\Year4\\MSciProject\\data_files\\saved_files\\nn_model_GSN_finetuned.joblib']

### Retuning Model Completely

In [74]:
# fresh network fitted from scratch on the X_train / y_train split (no weights reused)
new_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    alpha=0.05,
    batch_size=100,
    learning_rate='constant',
    learning_rate_init=0.001,
    beta_2=0.9,
    max_iter=1000,
    shuffle=False,
    early_stopping=True,
    random_state=42,
)
new_model.fit(X_train, y_train)

# predictions on both splits so training and validation scores can be compared
y_train_pred = new_model.predict(X_train)
y_val_pred = new_model.predict(X_val)

precision_train, precision_val = precision_score(y_train, y_train_pred), precision_score(y_val, y_val_pred)
recall_train, recall_val = recall_score(y_train, y_train_pred), recall_score(y_val, y_val_pred)
f1_train, f1_val = f1_score(y_train, y_train_pred), f1_score(y_val, y_val_pred)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")

Precision on Training Set = 0.856
Precision on Validation Set = 0.794
Recall on Training Set = 0.984
Recall on Validation Set = 0.993
F1 Score on Training Set = 0.916
F1 Score on Validation Set = 0.882


In [75]:
# score the fully retrained model on the held-out X_test / y_test split
y_test_pred = new_model.predict(X_test)

precision, recall, f1 = (
    precision_score(y_test, y_test_pred),
    recall_score(y_test, y_test_pred),
    f1_score(y_test, y_test_pred),
)

print(f"Test Precision: {precision:.3f}")
print(f"Test Recall: {recall:.3f}")
print(f"Test F1 Score: {f1:.3f}")

Test Precision: 0.799
Test Recall: 0.985
Test F1 Score: 0.882
