# Finetuning of Random Forest Model for a New Site

In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from joblib import load, dump
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import sys
sys.path.append('../')
import config

# folder (under the user's home directory) holding the saved models and PCA feature files
data_path = Path.home().joinpath(
    'OneDrive', 'Kirstin', 'Uni', 'Year4', 'MSciProject', 'data_files', 'saved_files'
)

In [15]:
# site code of the existing model, and site code of the station it is transferred to
site, transferred_site = 'MHD', 'GSN'

# display names and the compound identifier are read from the project config module
site_name = config.site_dict[site]
transferred_site_name = config.site_dict[transferred_site]
compound = config.compound

print(f"Finetuning a random forest model based on \033[1m{site_name}\033[0;0m to be applicable at \033[1m{transferred_site_name}\033[0;0m.")

Finetuning a random forest model based on [1mMace Head, Ireland[0;0m to be applicable at [1mGosan, South Korea[0;0m.


### Loading in Model & Initialising Data

In [16]:
# loading in the previously saved model for the original site
mhd_model = load(data_path/f'rf_model_{site}.joblib')

# loading in the original site's feature table, with 'time' parsed as datetimes
mhd_data = pd.read_csv(data_path/f'for_model_pca_{compound}_{site}.csv', parse_dates=['time'])

# keeping 2016-2018 (both ends inclusive) and discarding the timestamp column
in_training_years = mhd_data['time'].dt.year.between(2016, 2018)
train_data = mhd_data[in_training_years].drop(columns=['time'])

# labels with missing flags removed, then only the feature rows that still have a label
y_train_MHD = train_data['flag'].dropna()
X_train_MHD = train_data.drop(columns=['flag']).loc[y_train_MHD.index]

# training the model on MHD training data
mhd_model.fit(X_train_MHD, y_train_MHD)

In [17]:
# loading in the feature table for the transferred site, with 'time' parsed as datetimes
transferred_csv_path = data_path/f'for_model_pca_{compound}_{transferred_site}.csv'
data = pd.read_csv(transferred_csv_path, parse_dates=['time'])

# quick look at five randomly chosen rows
data.sample(5)

Unnamed: 0,time,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,flag
3799,2010-12-04 17:00:00,-3.294094,-2.31851,-9.387593,-0.874306,-1.532845,1.221644,-0.130776,1.854754,-0.312494,...,1.92788,-0.268347,-1.867585,0.050442,2.041996,-0.162285,0.92887,1.04939,-1.218149,1.0
10624,2017-11-01 21:00:00,-5.243195,0.887696,-3.058068,2.86575,-0.905483,-0.50194,-2.684658,0.06042,-0.064407,...,0.730701,2.041881,0.125418,1.154295,1.82041,-0.928375,0.360848,-1.538721,-1.011875,1.0
5979,2012-12-05 23:00:00,10.616668,4.73662,0.006338,0.966348,-2.935776,-5.1237,0.646602,3.28263,-0.675827,...,-1.924348,1.698157,4.766678,-1.839353,1.082081,-3.90598,-0.35975,0.447073,-1.559378,1.0
2149,2009-09-02 13:00:00,-10.136958,-6.36342,3.072996,4.6795,-3.62037,-3.216948,0.034568,-4.401476,-0.870358,...,-1.021462,-1.389418,1.048921,-0.705923,-1.581204,-2.443585,0.416531,-1.347377,-1.883893,1.0
7027,2013-11-27 11:00:00,5.931783,10.676172,-4.343182,2.794374,-0.972233,0.367602,-1.429255,-1.820862,0.330571,...,2.093504,0.556513,0.055047,-1.380812,-0.801746,0.773093,-0.588358,1.002584,0.300985,1.0


In [18]:
# setting up data for finetuning: 2014 to train, 2015 to validate, 2016-2017 to test
years_ft = data['time'].dt.year
train_data_ft = data[years_ft == 2014]
val_data_ft = data[years_ft == 2015]
test_data_ft = data[years_ft.between(2016, 2017)]

print(f"Train range: {train_data_ft['time'].min()} -> {train_data_ft['time'].max()}. Length: {len(train_data_ft)}")
print(f"Val range: {val_data_ft['time'].min()} -> {val_data_ft['time'].max()}. Length: {len(val_data_ft)}")
print(f"Test range: {test_data_ft['time'].min()} -> {test_data_ft['time'].max()}. Length: {len(test_data_ft)}")

# the timestamp was only needed for the split above
train_data_ft = train_data_ft.drop(columns=['time'])
val_data_ft = val_data_ft.drop(columns=['time'])
test_data_ft = test_data_ft.drop(columns=['time'])

# labels, with rows whose flag is missing removed
y_train_ft = train_data_ft['flag'].dropna()
y_val_ft = val_data_ft['flag'].dropna()
y_test_ft = test_data_ft['flag'].dropna()

# features, restricted to the rows that still have a label
X_train_ft = train_data_ft.drop(columns=['flag']).loc[y_train_ft.index]
X_val_ft = val_data_ft.drop(columns=['flag']).loc[y_val_ft.index]
X_test_ft = test_data_ft.drop(columns=['flag']).loc[y_test_ft.index]

Train range: 2014-01-02 09:00:00 -> 2014-12-31 23:00:00. Length: 915
Val range: 2015-01-01 01:00:00 -> 2015-12-31 19:00:00. Length: 813
Test range: 2016-01-01 11:00:00 -> 2017-12-31 23:00:00. Length: 1995


In [19]:
# setting up data for full retuning: 2011-2013 to train, 2014 to validate, 2015-2017 to test
years_full = data['time'].dt.year
train_data = data[years_full.between(2011, 2013)]
val_data = data[years_full == 2014]
test_data = data[years_full.between(2015, 2017)]

print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}. Length: {len(train_data)}")
print(f"Val range: {val_data['time'].min()} -> {val_data['time'].max()}. Length: {len(val_data)}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}. Length: {len(test_data)}")

# the timestamp was only needed for the split above
train_data = train_data.drop(columns=['time'])
val_data = val_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# labels, with rows whose flag is missing removed
y_train = train_data['flag'].dropna()
y_val = val_data['flag'].dropna()
y_test = test_data['flag'].dropna()

# features, restricted to the rows that still have a label
X_train = train_data.drop(columns=['flag']).loc[y_train.index]
X_val = val_data.drop(columns=['flag']).loc[y_val.index]
X_test = test_data.drop(columns=['flag']).loc[y_test.index]

Train range: 2011-01-02 17:00:00 -> 2013-12-30 11:00:00. Length: 3215
Val range: 2014-01-02 09:00:00 -> 2014-12-31 23:00:00. Length: 915
Test range: 2015-01-01 01:00:00 -> 2017-12-31 23:00:00. Length: 2808


### Testing Existing Model on Data

In [20]:
# decision threshold shared by the evaluation cells, read from the project config
confidence_threshold = config.confidence_threshold

# scoring the not-yet-finetuned model on the full-retuning validation split (X_val)
class_probabilities_val = mhd_model.predict_proba(X_val)

# a sample is predicted True when column 1 of predict_proba is strictly above the threshold
second_class_proba_val = class_probabilities_val[:, 1]
y_val_pred = second_class_proba_val > confidence_threshold

In [21]:
# precision, recall and F1 (in that order) of the existing model on the validation split
precision, recall, f1 = (
    metric(y_val, y_val_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"{site} Model Precision: {precision:.3f}")
print(f"{site} Model Recall: {recall:.3f}")
print(f"{site} Model F1 Score: {f1:.3f}")

MHD Model Precision: 0.844
MHD Model Recall: 0.433
MHD Model F1 Score: 0.573


### Finetuning Model

In [22]:
# allowing warm start (so the next fit adds to the existing forest instead of
# rebuilding it) and asking for 50 more trees than the model currently has
# NOTE: this cell is not idempotent - every re-run raises n_estimators by another 50
mhd_model.set_params(warm_start=True, n_estimators=mhd_model.n_estimators + 50)

# fitting the model to the new data (one year's worth)
mhd_model.fit(X_train_ft, y_train_ft)

In [23]:
# evaluating the finetuned model on the finetuning validation split
confidence_threshold = config.confidence_threshold
class_probabilities_val = mhd_model.predict_proba(X_val_ft)

# predicted True when column 1 of predict_proba is strictly above the threshold
y_val_pred_ft = class_probabilities_val[:, 1] > confidence_threshold

# precision, recall and F1, computed in that order
precision, recall, f1 = (
    metric(y_val_ft, y_val_pred_ft)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Finetuned Model Precision: {precision:.3f}")
print(f"Finetuned Model Recall: {recall:.3f}")
print(f"Finetuned Model F1 Score: {f1:.3f}")

Finetuned Model Precision: 0.857
Finetuned Model Recall: 0.530
Finetuned Model F1 Score: 0.655


In [24]:
# evaluating the finetuned model on the finetuning test split
confidence_threshold = config.confidence_threshold
class_probabilities_test = mhd_model.predict_proba(X_test_ft)

# predicted True when column 1 of predict_proba is strictly above the threshold
y_test_pred_ft = class_probabilities_test[:, 1] > confidence_threshold

# precision, recall and F1, computed in that order
precision, recall, f1 = (
    metric(y_test_ft, y_test_pred_ft)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Finetuned Model Precision: {precision:.3f}")
print(f"Finetuned Model Recall: {recall:.3f}")
print(f"Finetuned Model F1 Score: {f1:.3f}")

Finetuned Model Precision: 0.920
Finetuned Model Recall: 0.572
Finetuned Model F1 Score: 0.705


In [25]:
# saving the finetuned model next to the other saved files
finetuned_model_path = data_path/f'rf_model_{transferred_site}_finetuned.joblib'
dump(mhd_model, finetuned_model_path)

['C:\\Users\\kirst\\OneDrive\\Kirstin\\Uni\\Year4\\MSciProject\\data_files\\saved_files\\rf_model_GSN_finetuned.joblib']

### Retuning Model Completely

In [26]:
# retuning model completely using same hyperparameters as original model
gsn_model = RandomForestClassifier(random_state=42,
                                  n_estimators=100,
                                  max_depth=5,
                                  criterion='entropy',
                                  bootstrap=False,)

# fitting from scratch on the full-retuning training split of the new site
gsn_model.fit(X_train, y_train)

# class probabilities on the validation split and, for an overfitting check, on the training split
class_probabilities_val = gsn_model.predict_proba(X_val)
class_probabilites_train = gsn_model.predict_proba(X_train)

confidence_threshold = config.confidence_threshold

# strict '>' so this cell applies the same decision rule as every other
# evaluation cell in the notebook (they all use '>', not '>=');
# the int cast is kept so the prediction arrays stay 0/1 valued
y_pred_val = (class_probabilities_val[:, 1] > confidence_threshold).astype(int)
y_pred_train = (class_probabilites_train[:, 1] > confidence_threshold).astype(int)

precision_val = precision_score(y_val, y_pred_val)
precision_train = precision_score(y_train, y_pred_train)
recall_val = recall_score(y_val, y_pred_val)
recall_train = recall_score(y_train, y_pred_train)
f1_val = f1_score(y_val, y_pred_val)
f1_train = f1_score(y_train, y_pred_train)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")

Precision on Training Set = 0.975
Precision on Validation Set = 0.920
Recall on Training Set = 0.692
Recall on Validation Set = 0.555
F1 Score on Training Set = 0.810
F1 Score on Validation Set = 0.693


In [27]:
# evaluating the fully retrained model on the test split
# (confidence_threshold is the value assigned in an earlier cell)
class_probabilities_test = gsn_model.predict_proba(X_test)

# predicted True when column 1 of predict_proba is strictly above the threshold
y_pred_test = class_probabilities_test[:, 1] > confidence_threshold

# precision, recall and F1, computed in that order
precision, recall, f1 = (
    metric(y_test, y_pred_test)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Test Precision: {precision:.3f}")
print(f"Test Recall: {recall:.3f}")
print(f"Test F1 Score: {f1:.3f}")

Test Precision: 0.934
Test Recall: 0.674
Test F1 Score: 0.783
