# Random Forest

In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix
from joblib import dump
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import numpy as np
from pathlib import Path

# Directory holding the project's saved files, resolved relative to the current
# user's home directory. Later cells read 'for_model.csv' from here and write
# the fitted model ('rf_model.joblib') back to it.
data_path = Path.home()/'OneDrive'/'Kirstin'/'Uni'/'Year4'/'MSciProject'/'data_files'/'saved_files'

### Initialising

In [7]:
# initialising WandB
import wandb, os

# SECURITY: the API key must never be stored in the notebook (it ends up in version
# control and in every shared copy). It is taken from the WANDB_API_KEY environment
# variable when that is set; otherwise wandb.login() reuses locally cached
# credentials or prompts for the key interactively.
# NOTE(review): the key that was previously committed here should be revoked and rotated.
if not os.environ.get("WANDB_API_KEY"):
    wandb.login()

# tells W&B which notebook the runs originate from
os.environ["WANDB_NOTEBOOK_NAME"] = "randomforest_models.ipynb"

In [8]:
# Read the modelling dataset; the 'time' column is converted to datetimes while loading
for_model_csv = data_path / 'for_model.csv'
data = pd.read_csv(for_model_csv, parse_dates=['time'])

# Display five randomly selected rows as a quick look at the loaded frame
data.sample(n=5)

Unnamed: 0,time,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC101,PC102,PC103,PC104,PC105,PC106,PC107,PC108,PC109,flag
72078,2021-09-16 07:13:00,1.063864,-0.187245,-2.875592,1.361285,3.812803,-0.161873,-1.498231,0.196169,1.587617,...,0.11947,-0.075823,0.174193,0.096638,0.244177,0.495921,-0.154696,0.068816,-0.052675,0.0
67923,2020-06-29 17:34:00,-1.916093,4.427363,4.146447,3.574501,-1.685173,-2.358644,-0.526342,-0.102479,2.286268,...,-0.387911,0.515129,0.262087,-0.083945,0.114399,-0.021913,0.3663,0.119621,-0.215011,0.0
68653,2020-09-11 07:53:00,4.046572,6.234843,-6.258175,-4.84666,-1.020094,-1.671646,0.065773,-0.803114,-0.611638,...,-0.227297,0.092327,-0.03639,0.162701,-0.024108,0.215849,0.05649,0.162248,0.040704,1.0
25072,2008-02-17 06:56:00,-3.184012,-6.275028,-6.925078,-0.066497,-3.11925,0.095619,2.198851,-0.294625,-1.257038,...,-0.314491,-0.397673,-0.203067,-0.005911,0.07317,-0.073312,-0.254194,0.046585,-0.270059,0.0
39591,2012-01-06 06:46:00,-1.658394,6.253611,-4.817703,3.450677,3.623123,0.538994,-1.781246,2.405096,-2.869001,...,-0.094553,0.225202,0.379528,-0.155136,-0.194717,-0.101408,-0.058377,0.240821,0.288948,0.0


In [9]:
# Inclusive (first_year, last_year) ranges defining the chronological train/test split.
# Both the row filter and the date-range labels logged to WandB are derived from
# these, so the logged ranges cannot drift from the split that is actually applied.
train_years = (2017, 2019)
test_years = (2020, 2022)

# Split the data into training and testing sets based on the date
# ('time' is already a datetime column: it is parsed when the CSV is read)
sample_years = data['time'].dt.year
train_data = data[sample_years.between(*train_years)]
test_data = data[sample_years.between(*test_years)]

print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}. Length: {len(train_data)}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}. Length: {len(test_data)}")

# saving the date ranges for WandB tracking
training_date_range = f"{train_years[0]}-01-01 to {train_years[1]}-12-31"
testing_date_range = f"{test_years[0]}-01-01 to {test_years[1]}-12-31"

# Drop the "time" column as it won't be used in the model
train_data = train_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# Define the target (y): rows whose flag is NaN are removed
y_train = train_data['flag'].dropna()
y_test = test_data['flag'].dropna()

# Define the features (X): every remaining column, restricted to the rows
# that still have a flag so that X and y stay aligned
X_train = train_data.drop(columns=['flag']).loc[y_train.index]
X_test = test_data.drop(columns=['flag']).loc[y_test.index]

Train range: 2017-01-01 00:01:00 -> 2019-12-31 22:31:00. Length: 9942
Test range: 2020-01-01 00:41:00 -> 2022-12-31 21:57:00. Length: 9561


### Basic Model with Default Parameters

In [10]:
# Baseline random forest with scikit-learn's default hyperparameters.
#
# NOTE: no W&B run is started in this cell. Call wandb.init(project="Random Forest 2.0")
# beforehand to record the metrics; without an active run the logging step is skipped.

rf_model = RandomForestClassifier(random_state=42)

# fitting model and making predictions
rf_model.fit(X_train, y_train)

y_pred_rf_test = rf_model.predict(X_test)
y_pred_rf_train = rf_model.predict(X_train)

# calculating metrics
precision_test = precision_score(y_test, y_pred_rf_test)
precision_train = precision_score(y_train, y_pred_rf_train)
recall_test = recall_score(y_test, y_pred_rf_test)
recall_train = recall_score(y_train, y_pred_rf_train)
f1_score_test = f1_score(y_test, y_pred_rf_test)
f1_score_train = f1_score(y_train, y_pred_rf_train)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Testing Set = {precision_test:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Testing Set = {recall_test:.3f}")
print(f"F1 Score on Training Set = {f1_score_train:.3f}")
print(f"F1 Score on Testing Set = {f1_score_test:.3f}")


# wandb.log() raises when no run has been initialised, so only log (and close
# the run) when one is actually active
if wandb.run is not None:
    wandb.log({"model_name":"Basic model, default parameters", "training_precision":precision_train, "testing_precision":precision_test, 
                "training_recall":recall_train, "testing_recall":recall_test, "training_f1":f1_score_train, "testing_f1":f1_score_test,
                "training date range": training_date_range, "testing date range": testing_date_range})

    wandb.finish()

Precision on Training Set = 1.000
Precision on Testing Set = 0.895
Recall on Training Set = 0.589
Recall on Testing Set = 0.043
F1 Score on Training Set = 0.741
F1 Score on Testing Set = 0.082
Model precision = 0.895
Model recall = 0.043
Model f1 score = 0.082


### Class Probabilities Model

#### Exploring Thresholds

In [None]:
# exploring confidence threshold parameter

# List of confidence threshold values to try
confidence_threshold_vals = [0.5, 0.6, 0.7, 0.75, 0.8, 0.9]

train_precisions_values = []
test_precisions_values = []
difference_values = []

directory = './results'
file_path = os.path.join(directory, 'threshold_results.csv')

# create the output directory if it is missing, otherwise open() below fails
os.makedirs(directory, exist_ok=True)

# The forest does not depend on the threshold, so it is fitted (and its class
# probabilities are computed) once instead of refitting an identical model for
# every threshold value
rf_model = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=42, bootstrap=False)
rf_model.fit(X_train, y_train)

class_probabilities_test = rf_model.predict_proba(X_test)
class_probabilites_train = rf_model.predict_proba(X_train)

# storing results in a csv file - allows for easy plotting/analysis
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    # adding header
    writer.writerow(['Confidence Threshold', 'Training Precision', 'Testing Precision', 'Training Recall', 'Testing Recall'])

    for threshold in confidence_threshold_vals:
        # make predictions based on threshold
        y_pred_rf_test_threshold = (class_probabilities_test[:, 1] >= threshold).astype(int)
        y_pred_rf_train_threshold = (class_probabilites_train[:, 1] >= threshold).astype(int)

        # calculating precision of testing/training sets
        test_precision = precision_score(y_test, y_pred_rf_test_threshold)
        train_precision = precision_score(y_train, y_pred_rf_train_threshold)

        # gap between training and testing precision; this was previously appended
        # without being computed in this cell (NameError on a fresh kernel)
        difference = abs(train_precision - test_precision)

        # calculating recall
        test_recall = recall_score(y_test, y_pred_rf_test_threshold)
        train_recall = recall_score(y_train, y_pred_rf_train_threshold)

        train_precisions_values.append(train_precision)
        test_precisions_values.append(test_precision)
        difference_values.append(difference)

        writer.writerow([threshold, train_precision, test_precision, train_recall, test_recall])

# reading in results (same file that was just written)
results = pd.read_csv(file_path)

# plotting
# the theme is set before the figure is created so that it applies to these axes
sns.set(style='darkgrid')
sns.set_palette("colorblind")
fig, axs = plt.subplots(2,1, figsize=(10, 8))

# line plot of precision vs. confidence threshold (training and testing)
axs[0].plot(results['Confidence Threshold'], results['Training Precision'], label='Training Precision', marker='x')
axs[0].plot(results['Confidence Threshold'], results['Testing Precision'], label='Testing Precision', marker='x')
axs[0].set_xlabel('confidence threshold')
axs[0].set_xticks(results['Confidence Threshold'])
axs[0].set_xticklabels(results['Confidence Threshold'])
axs[0].set_ylabel('Precision')
axs[0].legend()
axs[0].set_title('Training and Testing Precision vs. Confidence Threshold', fontstyle='italic', fontsize=14)

# line plot of recall vs. confidence threshold (training and testing)
axs[1].plot(results['Confidence Threshold'], results['Training Recall'], label='Training Recall', marker='x')
axs[1].plot(results['Confidence Threshold'], results['Testing Recall'], label='Testing Recall', marker='x')
axs[1].set_xlabel('confidence threshold')
axs[1].set_xticks(results['Confidence Threshold'])
axs[1].set_xticklabels(results['Confidence Threshold'])
axs[1].set_ylabel('Recall')
axs[1].legend()
axs[1].set_title('Training and Testing Recall vs. Confidence Threshold', fontstyle='italic', fontsize=14)

fig.tight_layout()

In [None]:
# exploring max_depth parameter (predictions made with a fixed confidence threshold)

# List of max_depth values to try
max_depth_values = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]

# confidence threshold applied to the positive-class probability; it is the same
# for every depth, so it is defined once here (also used in the plot annotation)
threshold = 0.8

train_precisions_values = []
test_precisions_values = []
difference_values = []

directory = './results'
file_path = os.path.join(directory, 'max_depth_threshold.csv')

# create the output directory if it is missing, otherwise open() below fails
os.makedirs(directory, exist_ok=True)

# Open the file and write the results
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    # adding header
    writer.writerow(['Max Depth', 'Training Precision', 'Testing Precision', 'Difference'])

    for max_depth in max_depth_values:
        rf_model = RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=42, bootstrap=False)
        rf_model.fit(X_train, y_train)

        class_probabilities_test = rf_model.predict_proba(X_test)
        class_probabilites_train = rf_model.predict_proba(X_train)

        # make predictions based on threshold
        y_pred_rf_test_threshold = (class_probabilities_test[:, 1] >= threshold).astype(int)
        y_pred_rf_train_threshold = (class_probabilites_train[:, 1] >= threshold).astype(int)

        # calculating precision of testing/training sets
        test_precision = precision_score(y_test, y_pred_rf_test_threshold)
        train_precision = precision_score(y_train, y_pred_rf_train_threshold)
        difference = abs(train_precision-test_precision)

        train_precisions_values.append(train_precision)
        test_precisions_values.append(test_precision)
        difference_values.append(difference)

        writer.writerow([max_depth, train_precision, test_precision, difference])


# reading in results (same file that was just written)
results = pd.read_csv(file_path)

# plotting
# the theme is set before the figure is created so that it applies to these axes
sns.set(style='darkgrid')
sns.set_palette("colorblind")
fig, axs = plt.subplots(2,1, figsize=(10, 8))

# line plot of precision vs. max_depth (training and testing)
axs[0].plot(results['Max Depth'], results['Training Precision'], label='Training Precision', marker='x')
axs[0].plot(results['Max Depth'], results['Testing Precision'], label='Testing Precision', marker='x')
axs[0].set_xlabel('max_depth')
axs[0].set_xticks(results['Max Depth'])
axs[0].set_xticklabels(results['Max Depth'].astype(int))
axs[0].set_ylabel('Precision')
axs[0].legend()
axs[0].set_title('Training and Testing Precision vs. max_depth', fontstyle='italic', fontsize=14)

# bar chart of difference between training and testing precision
axs[1].bar(results['Max Depth'], results['Difference'])

# Add data labels to the bar chart
for i, v in zip(results['Max Depth'], results['Difference']):
    axs[1].text(i, v, f"{v:.2f}", fontsize=8, ha='center', va='bottom')

axs[1].set_xlabel('max_depth')
axs[1].set_xticks(results['Max Depth'])
axs[1].set_xticklabels(results['Max Depth'].astype(int))

axs[1].set_ylabel('Difference')
axs[1].set_title('Difference between Training and Testing Precision vs. max_depth', fontstyle='italic', fontsize=14)


# adding confidence threshold to plot
text_box = f'Confidence Threshold: {threshold}'

axs[0].text(0.75, 0.65, text_box, transform=axs[0].transAxes,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))


fig.tight_layout()

#### Model

In [None]:
# CLASS PROBABILITIES MODEL
# A sample is labelled 1 only when the forest's probability in column 1 of
# predict_proba reaches `confidence_threshold`.
wandb.init(project="Random Forest 2.0")

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    min_samples_split=5,
    bootstrap=False,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# class probabilities for every sample of each set
class_probabilites_train = rf_model.predict_proba(X_train)
class_probabilities_test = rf_model.predict_proba(X_test)

confidence_threshold = 0.8

# thresholded predictions (boolean mask converted to 0/1 integers)
y_pred_rf_train_threshold = (class_probabilites_train[:, 1] >= confidence_threshold).astype(int)
y_pred_rf_test_threshold = (class_probabilities_test[:, 1] >= confidence_threshold).astype(int)

# scores, one metric at a time: training set first, testing set second
precision_train_threshold, precision_test_threshold = (
    precision_score(y_train, y_pred_rf_train_threshold),
    precision_score(y_test, y_pred_rf_test_threshold),
)
recall_train_threshold, recall_test_threshold = (
    recall_score(y_train, y_pred_rf_train_threshold),
    recall_score(y_test, y_pred_rf_test_threshold),
)
f1_train_threshold, f1_test_threshold = (
    f1_score(y_train, y_pred_rf_train_threshold),
    f1_score(y_test, y_pred_rf_test_threshold),
)

print(f"Precision on Training Set = {precision_train_threshold:.3f}")
print(f"Precision on Testing Set = {precision_test_threshold:.3f}")
print(f"Recall on Training Set = {recall_train_threshold:.3f}")
print(f"Recall on Testing Set = {recall_test_threshold:.3f}")
print(f"F1 Score on Training Set = {f1_train_threshold:.3f}")
print(f"F1 Score on Testing Set = {f1_test_threshold:.3f}")

# everything recorded for this run on W&B
threshold_run_metrics = {
    "model_name": "Class probabilities model",
    "training_precision": precision_train_threshold,
    "testing_precision": precision_test_threshold,
    "training_recall": recall_train_threshold,
    "testing_recall": recall_test_threshold,
    "training_f1": f1_train_threshold,
    "testing_f1": f1_test_threshold,
    "training date range": training_date_range,
    "testing date range": testing_date_range,
}
wandb.log(threshold_run_metrics)

wandb.finish()

In [None]:
# Persist the fitted forest with joblib inside the project data directory
rf_model_file = data_path / 'rf_model.joblib'
dump(rf_model, rf_model_file)

### Optimising Model

#### Exploring max_depth

In [None]:
# exploring max_depth parameter

# List of max_depth values to try
max_depth_values = [1, 3, 5, 6, 7, 8, 9, 10, 12, 15, 20]

train_precisions_values = []
test_precisions_values = []
difference_values = []

directory = './results'
file_path = os.path.join(directory, 'max_depth_results.csv')

# create the output directory if it is missing, otherwise open() below fails
os.makedirs(directory, exist_ok=True)

# creating csv file to store results - allows for easy plotting/analysis
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    
    # adding header
    writer.writerow(['Max Depth', 'Training Precision', 'Testing Precision', 'Difference'])

    for max_depth in max_depth_values:
        # random_state is fixed (42, as for every other model in this notebook) so
        # that the sweep gives the same numbers on every run; without it the
        # results change between executions
        rf_model = RandomForestClassifier(n_estimators=100, max_depth=max_depth, bootstrap=False,
                                          min_samples_split=5, random_state=42)
        rf_model.fit(X_train, y_train)

        # making predictions on test/train sets
        y_pred_test = rf_model.predict(X_test)
        y_pred_train = rf_model.predict(X_train)

        # calculating precision of testing/training sets
        test_precision = precision_score(y_test, y_pred_test)
        train_precision = precision_score(y_train, y_pred_train)
        difference = abs(train_precision-test_precision)

        train_precisions_values.append(train_precision)
        test_precisions_values.append(test_precision)
        difference_values.append(difference)

        writer.writerow([max_depth, train_precision, test_precision, difference])

In [None]:
# reading in results
results = pd.read_csv('results/max_depth_results.csv')

# plotting
# The theme is set before the figure is created: style settings are applied to
# axes when they are created, so setting it afterwards leaves this figure
# unstyled if the cell is the first plotting cell run in the kernel.
sns.set(style='darkgrid')
sns.set_palette("colorblind")
fig, axs = plt.subplots(2,1, figsize=(10, 8))

# line plot of precision vs. max_depth (training and testing)
axs[0].plot(results['Max Depth'], results['Training Precision'], label='Training Precision', marker='x')
axs[0].plot(results['Max Depth'], results['Testing Precision'], label='Testing Precision', marker='x')
axs[0].set_xlabel('max_depth')
axs[0].set_xticks(results['Max Depth'])
axs[0].set_xticklabels(results['Max Depth'].astype(int))
axs[0].set_ylabel('Precision')
axs[0].legend()
axs[0].set_title('Training and Testing Precision vs. max_depth', fontstyle='italic', fontsize=14)

# bar chart of difference between training and testing precision
axs[1].bar(results['Max Depth'], results['Difference'])

# Add data labels to the bar chart
for i, v in zip(results['Max Depth'], results['Difference']):
    axs[1].text(i, v, f"{v:.2f}", fontsize=8, ha='center', va='bottom')

axs[1].set_xlabel('max_depth')
axs[1].set_xticks(results['Max Depth'])
axs[1].set_xticklabels(results['Max Depth'].astype(int))

axs[1].set_ylabel('Difference')
axs[1].set_title('Difference between Training and Testing Precision vs. max_depth', fontstyle='italic', fontsize=14)

fig.tight_layout()

#### Grid Search for Hyperparameter Tuning

In [None]:
# Instantiate an unconfigured forest purely to inspect which hyperparameters
# exist and what their default values are
rf_classifier = RandomForestClassifier()

# get_params() returns the estimator's parameter names mapped to their values
params = rf_classifier.get_params()
print(params)

In [None]:
model = RandomForestClassifier(random_state=42)

# Define the hyperparameters to search over
#
# INITIAL GRID (kept for reference):
# param_grid = {'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'class_weight': [None, 'balanced']}
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500],
    # 'auto' is intentionally not listed: for classifiers it was an alias of
    # 'sqrt' (so it only duplicated that candidate), and it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where every fit using it fails
    'max_features': ['sqrt', 'log2', None],
    'bootstrap' : [True, False]
}


# Initialize the grid search with 5-fold cross-validation, ranking candidates by F1
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=5, scoring='f1')

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best mean cross-validated F1 score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score

#### Exploring Optimised Parameters

In [None]:
# Forest refitted with the tuned hyperparameters, evaluated on both sets and logged to W&B
wandb.init(project="Random Forest 2.0")

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# predicted labels for each set
y_pred_rf_train = rf_model.predict(X_train)
y_pred_rf_test = rf_model.predict(X_test)

# performance metrics, one metric at a time: training set first, testing set second
precision_train, precision_test = (
    precision_score(y_train, y_pred_rf_train),
    precision_score(y_test, y_pred_rf_test),
)
recall_train, recall_test = (
    recall_score(y_train, y_pred_rf_train),
    recall_score(y_test, y_pred_rf_test),
)
f1_train, f1_test = (
    f1_score(y_train, y_pred_rf_train),
    f1_score(y_test, y_pred_rf_test),
)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Testing Set = {precision_test:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Testing Set = {recall_test:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Testing Set = {f1_test:.3f}")

# everything recorded for this run on W&B
optimised_run_metrics = {
    "model_name": "Optimising hyperparameters",
    "training_precision": precision_train,
    "testing_precision": precision_test,
    "training_recall": recall_train,
    "testing_recall": recall_test,
    "training_f1": f1_train,
    "testing_f1": f1_test,
    "training date range": training_date_range,
    "testing date range": testing_date_range,
}
wandb.log(optimised_run_metrics)

wandb.finish()

In [None]:
# Save the model
# The Downloads folder is resolved from the current user's home directory rather
# than a hard-coded '/Users/kirst' prefix, so the cell also runs on other
# machines/accounts.
# NOTE(review): assumes the original target was the author's own home Downloads folder.
dump(rf_model, Path.home()/'Downloads'/'rf_model.joblib')