# Finetuning of Random Forest Model for a New Site

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from joblib import load, dump
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import sys
sys.path.append('../')
import config

# directory (under the user's home folder) holding the saved models and the pre-processed CSV files
data_path = Path.home().joinpath('OneDrive', 'Kirstin', 'Uni', 'Year4', 'MSciProject', 'data_files', 'saved_files')

In [2]:
# `site` is the site the model is based on; `transferred_site` is the site it is being adapted to
site, transferred_site = 'MHD', 'GSN'

# full site names and the compound are looked up in the shared project config
site_name, transferred_site_name = (config.site_dict[code] for code in (site, transferred_site))
compound = config.compound

print(f"Finetuning a random forest model based on \033[1m{site_name}\033[0;0m to be applicable at \033[1m{transferred_site_name}\033[0;0m.")

Finetuning a random forest model based on [1mMace Head, Ireland[0;0m to be applicable at [1mGosan, South Korea[0;0m.


### Loading in Model & Initialising Data

In [4]:
# loading the previously saved random forest for the source site
# NOTE(review): joblib.load unpickles the file, so it should only ever be pointed at trusted model files
mhd_model = load(data_path/f'rf_model_{site}.joblib')

# loading the source-site dataset and keeping 2016-2018 (inclusive) as the training period;
# the timestamp is only needed for this selection, so it is dropped straight afterwards
mhd_data = pd.read_csv(data_path/f'for_model_pca_{compound}_{site}.csv', parse_dates=['time'])
train_data = mhd_data[mhd_data['time'].dt.year.between(2016, 2018)].drop(columns=['time'])

# labels are the 'flag' column with missing values removed;
# features are all remaining columns, restricted to the rows that still have a label
y_train_MHD = train_data['flag'].dropna()
X_train_MHD = train_data.drop(columns=['flag']).loc[y_train_MHD.index]

# fitting the loaded model on the source-site training data
mhd_model.fit(X_train_MHD, y_train_MHD)

In [5]:
# reading the dataset for the transferred site, with the 'time' column parsed as datetimes
data = pd.read_csv(
    data_path.joinpath(f'for_model_pca_{compound}_{transferred_site}.csv'),
    parse_dates=['time'],
)

# displaying five randomly chosen rows as a quick sanity check
data.sample(5)

Unnamed: 0,time,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,flag
304,2008-01-05 21:00:00,-2.385125,-0.111241,-6.937173,0.273405,-1.433098,-0.506298,1.154096,1.180557,1.846979,...,0.109169,0.215815,-1.118216,1.268819,2.346961,3.523459,-0.530195,0.665794,1.020864,0.0
2664,2010-01-18 07:00:00,-2.451658,-1.604447,-5.694979,-3.085729,-0.927221,0.224731,0.509939,0.835328,-1.122412,...,0.025882,-0.297745,3.244814,1.429753,0.610964,-1.085532,-0.534003,-1.038684,-0.362459,0.0
9375,2016-07-07 03:00:00,9.850216,-3.620169,-0.025611,2.740479,4.00255,3.529634,-0.258169,-0.237107,0.112322,...,-0.000716,0.16253,2.425427,0.311691,-1.427594,-1.890885,-0.858708,0.000766,0.894686,1.0
6616,2013-07-10 17:00:00,8.457865,0.697555,-5.36558,-1.660387,3.079716,2.761038,1.145323,0.028995,-0.346681,...,-1.927914,-2.485185,0.000841,-1.277625,1.12852,0.322084,0.4165,-0.058984,2.009921,1.0
1805,2009-03-26 03:00:00,-4.472624,0.360981,-4.23153,-1.498872,-3.185457,-0.234404,2.624072,0.48824,0.458722,...,0.227694,-0.025541,0.8967,0.935309,0.112457,-1.831346,0.191114,-0.391952,0.439655,1.0


In [15]:
# setting up data for finetuning: 2014 for training, 2015 for validation, 2016-2017 for testing
train_data_ft = data[data['time'].dt.year == 2014]
val_data_ft = data[data['time'].dt.year == 2015]
test_data_ft = data[data['time'].dt.year.between(2016, 2017)]

# reporting the time span and size of each split (sizes are counted before rows with a missing flag are removed)
print(f"Train range: {train_data_ft['time'].min()} -> {train_data_ft['time'].max()}. Length: {len(train_data_ft)}")
print(f"Val range: {val_data_ft['time'].min()} -> {val_data_ft['time'].max()}. Length: {len(val_data_ft)}")
print(f"Test range: {test_data_ft['time'].min()} -> {test_data_ft['time'].max()}. Length: {len(test_data_ft)}")

# the timestamp was only needed for splitting, so it is removed from every split
train_data_ft, val_data_ft, test_data_ft = (
    split.drop(columns=['time']) for split in (train_data_ft, val_data_ft, test_data_ft)
)

# labels: the 'flag' column with missing values dropped
y_train_ft = train_data_ft['flag'].dropna()
y_val_ft = val_data_ft['flag'].dropna()
y_test_ft = test_data_ft['flag'].dropna()

# features: every other column, restricted to the rows that still have a label
X_train_ft = train_data_ft.drop(columns=['flag']).loc[y_train_ft.index]
X_val_ft = val_data_ft.drop(columns=['flag']).loc[y_val_ft.index]
X_test_ft = test_data_ft.drop(columns=['flag']).loc[y_test_ft.index]

Train range: 2014-01-02 09:00:00 -> 2014-12-31 23:00:00. Length: 915
Val range: 2015-01-01 01:00:00 -> 2015-12-31 19:00:00. Length: 813
Test range: 2016-01-01 11:00:00 -> 2017-12-31 23:00:00. Length: 1995


In [16]:
# setting up data for full retuning: 2011-2013 for training, 2014 for validation, 2015-2017 for testing
# NOTE(review): `train_data` here replaces the source-site frame of the same name created when the model was loaded
train_data = data[data['time'].dt.year.between(2011, 2013)]
val_data = data[data['time'].dt.year == 2014]
test_data = data[data['time'].dt.year.between(2015, 2017)]

# reporting the time span and size of each split (sizes are counted before rows with a missing flag are removed)
print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}. Length: {len(train_data)}")
print(f"Val range: {val_data['time'].min()} -> {val_data['time'].max()}. Length: {len(val_data)}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}. Length: {len(test_data)}")

# the timestamp was only needed for splitting, so it is removed from every split
train_data, val_data, test_data = (
    split.drop(columns=['time']) for split in (train_data, val_data, test_data)
)

# labels: the 'flag' column with missing values dropped
y_train = train_data['flag'].dropna()
y_val = val_data['flag'].dropna()
y_test = test_data['flag'].dropna()

# features: every other column, restricted to the rows that still have a label
X_train = train_data.drop(columns=['flag']).loc[y_train.index]
X_val = val_data.drop(columns=['flag']).loc[y_val.index]
X_test = test_data.drop(columns=['flag']).loc[y_test.index]

Train range: 2011-01-02 17:00:00 -> 2013-12-30 11:00:00. Length: 3215
Val range: 2014-01-02 09:00:00 -> 2014-12-31 23:00:00. Length: 915
Test range: 2015-01-01 01:00:00 -> 2017-12-31 23:00:00. Length: 2808


### Testing Existing Model on Data

In [17]:
# scoring the source-site model, before any finetuning, on the transferred site's 2014 validation split
# NOTE(review): 2014 is also the year used as the finetuning training set, while the finetuned model is
# validated on 2015 - so the baseline and finetuned scores are not computed on the same data
confidence_threshold = config.confidence_threshold
class_probabilities_val = mhd_model.predict_proba(X_val)

# a sample is predicted positive when the second predict_proba column strictly exceeds the threshold
y_val_pred = class_probabilities_val[:, 1] > confidence_threshold

In [18]:
# precision, recall and F1 of the un-finetuned model's thresholded predictions against the validation labels
precision, recall, f1 = (
    metric(y_val, y_val_pred) for metric in (precision_score, recall_score, f1_score)
)

print(f"{site} Model Precision: {precision:.3f}")
print(f"{site} Model Recall: {recall:.3f}")
print(f"{site} Model F1 Score: {f1:.3f}")

MHD Model Precision: 0.770
MHD Model Recall: 0.510
MHD Model F1 Score: 0.614


### Finetuning Model

In [19]:
# growing the forest by 50 trees; with warm start enabled the already-fitted trees are kept
# and only the additional ones are fitted on the new data
# NOTE(review): n_estimators is incremented in place, so re-running this cell adds a further 50 trees each time
mhd_model.n_estimators = mhd_model.n_estimators + 50
mhd_model.warm_start = True

# fitting the model to the new data (one year's worth)
mhd_model.fit(X_train_ft, y_train_ft)

In [20]:
# evaluating the finetuned model on the finetuning validation split (2015)
confidence_threshold = config.confidence_threshold
class_probabilities_val = mhd_model.predict_proba(X_val_ft)

# a sample is predicted positive when the second predict_proba column strictly exceeds the threshold
y_val_pred_ft = class_probabilities_val[:, 1] > confidence_threshold

precision = precision_score(y_val_ft, y_val_pred_ft)
recall = recall_score(y_val_ft, y_val_pred_ft)
f1 = f1_score(y_val_ft, y_val_pred_ft)

# reported to three decimals, like every other evaluation cell, so the scores can be compared directly
print(f"Finetuned Model Precision: {precision:.3f}")
print(f"Finetuned Model Recall: {recall:.3f}")
print(f"Finetuned Model F1 Score: {f1:.3f}")

Finetuned Model Precision: 0.89
Finetuned Model Recall: 0.39
Finetuned Model F1 Score: 0.55


In [21]:
# persisting the finetuned forest next to the other saved files, named after the transferred site
dump(mhd_model, data_path.joinpath(f'rf_model_{transferred_site}_finetuned.joblib'))

['C:\\Users\\kirst\\OneDrive\\Kirstin\\Uni\\Year4\\MSciProject\\data_files\\saved_files\\rf_model_GSN_finetuned.joblib']

### Retuning Model Completely

In [20]:
# retraining a fresh model on the transferred site's data only
# NOTE(review): the hyperparameters are hard-coded and intended to match the original model -
# confirm against the saved source-site model's parameters
gsn_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=5,
    criterion='entropy',
    bootstrap=False,
)

gsn_model.fit(X_train, y_train)

class_probabilities_val = gsn_model.predict_proba(X_val)
class_probabilities_train = gsn_model.predict_proba(X_train)

confidence_threshold = config.confidence_threshold

# strict '>' so the decision rule is the same one used by the other evaluation cells in this notebook
# (including the test-set evaluation of this very model); '>=' would classify samples sitting exactly
# on the threshold differently between validation and test
y_pred_val = (class_probabilities_val[:, 1] > confidence_threshold).astype(int)
y_pred_train = (class_probabilities_train[:, 1] > confidence_threshold).astype(int)

# training-set scores are computed alongside validation scores to expose over-fitting
precision_val = precision_score(y_val, y_pred_val)
precision_train = precision_score(y_train, y_pred_train)
recall_val = recall_score(y_val, y_pred_val)
recall_train = recall_score(y_train, y_pred_train)
f1_val = f1_score(y_val, y_pred_val)
f1_train = f1_score(y_train, y_pred_train)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")

Precision on Training Set = 0.970
Precision on Validation Set = 0.925
Recall on Training Set = 0.768
Recall on Validation Set = 0.589
F1 Score on Training Set = 0.857
F1 Score on Validation Set = 0.719


In [21]:
# evaluating the fully retrained model on the held-out test split,
# re-using the confidence threshold set in the retraining cell
class_probabilities_test = gsn_model.predict_proba(X_test)
y_pred_test = class_probabilities_test[:, 1] > confidence_threshold

precision, recall, f1 = (
    metric(y_test, y_pred_test) for metric in (precision_score, recall_score, f1_score)
)

print(f"Test Precision: {precision:.3f}")
print(f"Test Recall: {recall:.3f}")
print(f"Test F1 Score: {f1:.3f}")

Test Precision: 0.923
Test Recall: 0.678
Test F1 Score: 0.782
