# Random Forest

In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from joblib import dump
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from pathlib import Path

import sys
sys.path.append('../')
import config

# Folder containing the saved model inputs/outputs, resolved under the current user's home directory
data_path = Path.home().joinpath(
    'OneDrive', 'Kirstin', 'Uni', 'Year4', 'MSciProject', 'data_files', 'saved_files'
)

In [11]:
# Read the compound and measurement site being analysed from the shared config module;
# site_dict maps the site value from config to its display name.
compound = config.compound
site = config.site
site_name = config.site_dict[site]

print(f"Exploring {compound} at {site_name}.")

Exploring ch2cl2 at Mace Head, Ireland.


### Initialising

In [12]:
# initialising WandB
import wandb, os

# SECURITY: never hard-code the W&B API key in the notebook - anything saved here is
# readable by everyone with access to the file or its version history. Supply the key
# from outside the notebook instead (export WANDB_API_KEY in the shell that launches
# Jupyter, or run `wandb login` once), and revoke any key that was previously committed.
if not os.environ.get("WANDB_API_KEY"):
    print("WANDB_API_KEY is not set; W&B will rely on credentials saved by `wandb login`, if any.")

# Tells W&B which notebook the runs originate from
os.environ["WANDB_NOTEBOOK_NAME"] = "randomforest_models.ipynb"

In [13]:
# Read the model input table from the saved-files folder, parsing 'time' as datetimes
pca_csv = data_path / 'for_model_pca.csv'
data = pd.read_csv(pca_csv, parse_dates=['time'])

# Display five random rows as a quick sanity check of the loaded columns
data.sample(5)

Unnamed: 0,time,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC12,PC13,PC14,PC15,PC16,PC17,PC18,flag,time_of_day,day_of_year
15464,2015-03-31 00:52:00,2.867043,-11.976351,0.684946,2.444204,0.093187,-4.050718,3.252139,0.187214,1.383329,...,-0.54523,-0.955277,-3.793374,-1.053283,-0.419364,1.496104,0.73041,0.0,0,90
6452,2006-11-15 05:32:00,7.05243,3.000826,0.266842,-0.801191,-5.75618,0.927695,-3.621805,1.136709,0.104617,...,0.969464,-1.565086,-1.297604,0.292161,0.067798,0.185669,0.59663,1.0,5,319
8622,2008-09-28 12:00:00,-5.826482,1.338654,-6.105812,-3.404951,-2.371607,-0.525008,-1.159893,-0.567076,0.384976,...,0.095836,-0.232711,-2.678796,0.81718,1.298305,0.1812,0.318669,1.0,12,272
7248,2007-07-02 19:19:00,2.121472,0.338158,6.109806,1.325609,-0.974237,-1.554441,-0.09753,-0.247522,-1.361754,...,0.269012,-0.023794,0.899301,-0.457674,0.856986,-1.208764,-1.187257,0.0,19,183
5420,2005-10-02 13:05:00,-7.313638,-1.263743,-7.535617,-0.393748,1.706816,-0.068069,0.026535,-0.050604,0.026414,...,0.194476,-0.556628,-0.059205,1.238237,-0.954051,0.555665,-0.963319,1.0,13,275


In [14]:
# Inclusive (first_year, last_year) windows used for each split.
# ch2cl2 uses an earlier period than every other compound.
if compound == 'ch2cl2':
    split_years = {'train': (2010, 2012), 'val': (2013, 2013), 'test': (2014, 2016)}
else:
    split_years = {'train': (2016, 2018), 'val': (2019, 2019), 'test': (2020, 2023)}

years = data['time'].dt.year


def _rows_in_years(first_year, last_year):
    """Return the rows of `data` whose timestamp year lies in [first_year, last_year]."""
    return data[(years >= first_year) & (years <= last_year)]


train_data = _rows_in_years(*split_years['train'])
val_data = _rows_in_years(*split_years['val'])
test_data = _rows_in_years(*split_years['test'])

# Report the time span and row count of each split (counted before rows with a missing flag are removed)
print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}. Length: {len(train_data)}")
print(f"Val range: {val_data['time'].min()} -> {val_data['time'].max()}. Length: {len(val_data)}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}. Length: {len(test_data)}")

# saving the date ranges for WandB tracking
# training_date_range = "2016"
# validation_date_range = "2020-01-01 to 2020-12-31"
# testing_date_range = "2020-01-01 to 2022-12-31"

# The timestamp is only needed for splitting; it is not a model feature
train_data = train_data.drop(columns=['time'])
val_data = val_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# Target (y): the 'flag' column with missing labels removed
y_train = train_data['flag'].dropna()
y_val = val_data['flag'].dropna()
y_test = test_data['flag'].dropna()

# Features (X): every remaining column, restricted to the rows that still have a label
X_train = train_data.drop(columns=['flag']).loc[y_train.index]
X_val = val_data.drop(columns=['flag']).loc[y_val.index]
X_test = test_data.drop(columns=['flag']).loc[y_test.index]

Train range: 2010-01-02 04:02:00 -> 2012-12-31 22:29:00. Length: 3295
Val range: 2013-01-01 00:29:00 -> 2013-12-30 09:25:00. Length: 847
Test range: 2014-01-02 20:09:00 -> 2016-12-31 21:51:00. Length: 3285


## Models

### Basic Model with Default Parameters

Shows a very basic set-up of the model, and how the train/validation/test process works.

In [None]:
# wandb.init(project="Random Forest 2.0")

# Baseline: a forest with default hyperparameters (seeded for reproducibility), fitted on the training split
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions for the data the model was fitted on and for the held-out validation year(s)
y_pred_train = rf_model.predict(X_train)
y_pred_val = rf_model.predict(X_val)

# Training-set metrics
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_score_train = f1_score(y_train, y_pred_train)

# Validation-set metrics
precision_val = precision_score(y_val, y_pred_val)
recall_val = recall_score(y_val, y_pred_val)
f1_score_val = f1_score(y_val, y_pred_val)

# Print each metric as a train/validation pair so the gap between them is easy to read
print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_score_train:.3f}")
print(f"F1 Score on Validation Set = {f1_score_val:.3f}")

# wandb.log({"model_name":"Basic model, default parameters", "training_precision":precision_train, "testing_precision":precision_test, 
            # "training_recall":recall_train, "testing_recall":recall_test, "training_f1":f1_score_train, "testing_f1":f1_score_test,
            # "training date range": training_date_range, "testing date range": testing_date_range})

# wandb.finish()

In [None]:
# Score the currently fitted rf_model on the held-out test split
y_pred_test = rf_model.predict(X_test)

# Same three metrics as for the training/validation sets
f1_score_test = f1_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)

print(f"Precision on Test Set = {precision_test:.3f}")
print(f"Recall on Test Set = {recall_test:.3f}")
print(f"F1 Score on Test Set = {f1_score_test:.3f}")

### Class Probabilities Model

Focusing on a model that makes predictions based on class probabilities in order to optimise precision.

#### Initial Model

In [None]:
# wandb.init(project="Random Forest 2.0")

# Forest with 200 trees, seeded for reproducibility
rf_model = RandomForestClassifier(random_state=42, n_estimators=200)

rf_model.fit(X_train, y_train)

# finding class probabilities for each sample - validation and training sets
class_probabilities_val = rf_model.predict_proba(X_val)
class_probabilities_train = rf_model.predict_proba(X_train)

confidence_threshold = config.confidence_threshold

# make predictions based on threshold: a sample is predicted positive only when the
# probability in column 1 (the second entry of rf_model.classes_) reaches the threshold
y_pred_val = (class_probabilities_val[:, 1] >= confidence_threshold).astype(int)
y_pred_train = (class_probabilities_train[:, 1] >= confidence_threshold).astype(int)


# Calculate scores
precision_train = precision_score(y_train, y_pred_train)
precision_val = precision_score(y_val, y_pred_val)
recall_train = recall_score(y_train, y_pred_train)
recall_val = recall_score(y_val, y_pred_val)
f1_train = f1_score(y_train, y_pred_train)
f1_val = f1_score(y_val, y_pred_val)

# These metrics come from the validation split, so label them as such
# (they were previously printed as "Testing Set", which misreports what was measured)
print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")


# wandb.log({"model_name":"Class probabilities model", "training_precision":precision_train_threshold, "testing_precision":precision_test_threshold, 
            # "training_recall":recall_train_threshold, "testing_recall":recall_test_threshold, "training_f1":f1_train_threshold, "testing_f1":f1_test_threshold,
            # "training date range": training_date_range, "testing date range": testing_date_range})

# wandb.finish()

#### Optimising Model Using Validation Set

##### Optimising Threshold

In [None]:
# Candidate probability cut-offs for predicting the positive class
confidence_threshold_vals = [0.5, 0.6, 0.7, 0.75, 0.8, 0.9]

train_precisions_values = []
val_precisions_values = []
train_recalls_values = []
val_recalls_values = []

directory = './results'
file_path = os.path.join(directory, 'threshold_results.csv')

# make sure the output folder exists so open() below cannot fail on a fresh checkout
os.makedirs(directory, exist_ok=True)

# The fitted forest does not depend on the threshold (same data, same seed), so it is
# trained once and its class probabilities are reused for every cut-off
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

class_probabilities_val = rf_model.predict_proba(X_val)
class_probabilities_train = rf_model.predict_proba(X_train)

# creating csv file to store results - allows for easy plotting/analysis
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    # adding header
    writer.writerow(['Confidence Threshold', 'Training Precision', 'Validation Precision', 'Training Recall', 'Validation Recall'])

    for threshold in confidence_threshold_vals:
        # make predictions based on threshold
        y_pred_val = (class_probabilities_val[:, 1] >= threshold).astype(int)
        y_pred_train = (class_probabilities_train[:, 1] >= threshold).astype(int)

        # calculating precision & recall of validation/training sets
        val_precision = precision_score(y_val, y_pred_val)
        train_precision = precision_score(y_train, y_pred_train)
        val_recall = recall_score(y_val, y_pred_val)
        train_recall = recall_score(y_train, y_pred_train)

        train_precisions_values.append(train_precision)
        val_precisions_values.append(val_precision)
        train_recalls_values.append(train_recall)
        val_recalls_values.append(val_recall)

        writer.writerow([threshold, train_precision, val_precision, train_recall, val_recall])

# reading in results from the file that was just written
results = pd.read_csv(file_path)

# plotting - the seaborn theme/palette must be set BEFORE the axes are created,
# otherwise it only affects figures drawn later in the session
sns.set(style='darkgrid')
sns.set_palette("colorblind")
fig, axs = plt.subplots(2,1, figsize=(10, 8))

# one panel per metric: training and validation curves vs. confidence threshold
for ax, metric in zip(axs, ('Precision', 'Recall')):
    ax.plot(results['Confidence Threshold'], results[f'Training {metric}'], label=f'Training {metric}', marker='x')
    ax.plot(results['Confidence Threshold'], results[f'Validation {metric}'], label=f'Validation {metric}', marker='x')
    ax.set_xlabel('confidence threshold')
    ax.set_xticks(results['Confidence Threshold'])
    ax.set_xticklabels(results['Confidence Threshold'])
    ax.set_ylabel(metric)
    ax.legend()
    ax.set_title(f'Training and Validation {metric} vs. Confidence Threshold', fontstyle='italic', fontsize=14)

fig.tight_layout()

##### Exploring max_depth

This hyperparameter has been shown to help significantly with overfitting, so it is explored first.

In [None]:
# List of max_depth values to try
max_depth_values = [3, 5, 6, 7, 8, 9, 10, 12, 15, 20]

train_precisions_values = []
val_precisions_values = []
difference_values = []

directory = './results'
file_path = os.path.join(directory, 'max_depth_results.csv')

# make sure the output folder exists so open() below cannot fail on a fresh checkout
os.makedirs(directory, exist_ok=True)

# the decision threshold is the same for every depth, so read it once
confidence_threshold = config.confidence_threshold

# creating csv file to store results - allows for easy plotting/analysis
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    # adding header
    writer.writerow(['Max Depth', 'Training Precision', 'Validation Precision', 'Difference'])

    for max_depth in max_depth_values:
        # a fresh forest per depth; everything except max_depth is held fixed
        rf_model = RandomForestClassifier(random_state=42, max_depth=max_depth, n_estimators=100, criterion='entropy')
        rf_model.fit(X_train, y_train)

        # finding class probabilities for each sample
        class_probabilities_val = rf_model.predict_proba(X_val)
        class_probabilities_train = rf_model.predict_proba(X_train)

        # make predictions based on threshold
        y_pred_val = (class_probabilities_val[:, 1] >= confidence_threshold).astype(int)
        y_pred_train = (class_probabilities_train[:, 1] >= confidence_threshold).astype(int)

        # calculating precision of validation/training sets; the absolute gap between
        # them is recorded as a simple indicator of overfitting
        val_precision = precision_score(y_val, y_pred_val)
        train_precision = precision_score(y_train, y_pred_train)
        difference = abs(train_precision-val_precision)

        train_precisions_values.append(train_precision)
        val_precisions_values.append(val_precision)
        difference_values.append(difference)

        writer.writerow([max_depth, train_precision, val_precision, difference])


# reading in results from the file that was just written
results = pd.read_csv(file_path)

# plotting - the seaborn theme/palette must be set BEFORE the axes are created,
# otherwise it only affects figures drawn later in the session
sns.set(style='darkgrid')
sns.set_palette("colorblind")
fig, axs = plt.subplots(2,1, figsize=(10, 8))

# line plot of precision vs. max_depth (training and validation)
axs[0].plot(results['Max Depth'], results['Training Precision'], label='Training Precision', marker='x')
axs[0].plot(results['Max Depth'], results['Validation Precision'], label='Validation Precision', marker='x')
axs[0].set_xlabel('max_depth')
axs[0].set_xticks(results['Max Depth'])
axs[0].set_xticklabels(results['Max Depth'].astype(int))
axs[0].set_ylabel('Precision')
axs[0].legend()
axs[0].set_title('Training and Validation Precision vs. max_depth', fontstyle='italic', fontsize=14)

# bar chart of difference between training and validation precision
axs[1].bar(results['Max Depth'], results['Difference'])

# Add data labels to the bar chart
for i, v in zip(results['Max Depth'], results['Difference']):
    axs[1].text(i, v, f"{v:.2f}", fontsize=8, ha='center', va='bottom')

axs[1].set_xlabel('max_depth')
axs[1].set_xticks(results['Max Depth'])
axs[1].set_xticklabels(results['Max Depth'].astype(int))

axs[1].set_ylabel('Difference')
axs[1].set_title('Difference between Training and Validation Precision vs. max_depth', fontstyle='italic', fontsize=14)

fig.tight_layout()

##### Grid Search

In [None]:
model = RandomForestClassifier(random_state=42)

# Define the hyperparameters to search over
# INITIAL GRID (kept for reference):
# param_grid = {'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'class_weight': [None, 'balanced']}
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500],
    # 'auto' is deliberately left out: for classifiers it was an alias of 'sqrt',
    # and recent scikit-learn releases reject it as an invalid value
    'max_features': ['sqrt', 'log2', None],
    'bootstrap' : [True, False]
}

# Initialize the grid search with 5-fold cross-validation, ranking candidates by F1
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=5, scoring='f1')

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# extracting the best model found by the search
best_model = grid_search.best_estimator_

# A classifier's .score() returns mean accuracy, not F1, so the F1 score is
# computed explicitly from the validation predictions to match the printed label
validation_f1 = f1_score(y_val, best_model.predict(X_val))

print(f'Validation F1 Score: {validation_f1:.3f}')
print(f'Best Parameters: {grid_search.best_params_}')

#### Applying Optimisations to Model

In [15]:
# wandb.init(project="Random Forest 2.0")

# Final configuration: shallow entropy-based forest without bootstrapping, seeded for reproducibility
rf_model = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=5,
    bootstrap=False,
    random_state=42,
).fit(X_train, y_train)

# Probability cut-off for predicting the positive class, taken from the project config
confidence_threshold = config.confidence_threshold

# Per-class probabilities; column 1 is compared against the threshold below
class_probabilites_train = rf_model.predict_proba(X_train)
class_probabilities_val = rf_model.predict_proba(X_val)

y_pred_train = (class_probabilites_train[:, 1] >= confidence_threshold).astype(int)
y_pred_val = (class_probabilities_val[:, 1] >= confidence_threshold).astype(int)

# Precision, recall and F1 for each split, computed in that order
precision_train, recall_train, f1_train = (
    metric(y_train, y_pred_train) for metric in (precision_score, recall_score, f1_score)
)
precision_val, recall_val, f1_val = (
    metric(y_val, y_pred_val) for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")


# wandb.log({"model_name":"Optimising hyperparameters", "training_precision":precision_train, "testing_precision":precision_test, 
            # "training_recall":recall_train, "testing_recall":recall_test, "training_f1":f1_train, "testing_f1":f1_test,
            # "training date range": training_date_range, "testing date range": testing_date_range})

# wandb.finish()

Precision on Training Set = 0.958
Precision on Validation Set = 0.946
Recall on Training Set = 0.805
Recall on Validation Set = 0.773
F1 Score on Training Set = 0.875
F1 Score on Validation Set = 0.850


##### Evaluating Model on Test Data

In [16]:
# Apply the fitted model and the same confidence threshold to the held-out test split
class_probabilities_test = rf_model.predict_proba(X_test)
positive_probability_test = class_probabilities_test[:, 1]

y_pred_test = (positive_probability_test >= confidence_threshold).astype(int)

# Test-set metrics
f1_test = f1_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)

print(f"Precision on Test Set = {precision_test:.3f}")
print(f"Recall on Test Set = {recall_test:.3f}")
print(f"F1 Score on Test Set = {f1_test:.3f}")

Precision on Test Set = 0.933
Recall on Test Set = 0.739
F1 Score on Test Set = 0.825


In [17]:
# Persist the fitted forest with joblib next to the other saved files;
# the call's return value is left as the cell output
model_file = data_path / 'rf_model.joblib'
dump(rf_model, model_file)

['C:\\Users\\kirst\\OneDrive\\Kirstin\\Uni\\Year4\\MSciProject\\data_files\\saved_files\\rf_model.joblib']