# Gradient Boosted Tree

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix
from joblib import dump
import numpy as np
from pathlib import Path

# Folder holding the prepared CSV inputs and the saved model artefacts (resolved under the user's home directory)
data_path = Path.home().joinpath(
    'OneDrive', 'Kirstin', 'Uni', 'Year4', 'MSciProject', 'data_files', 'saved_files'
)

### Initialising

In [7]:
# initialising WandB
import os
import wandb

# Tell WandB which notebook the runs originate from.
os.environ["WANDB_NOTEBOOK_NAME"] = "gbt_models.ipynb"

# SECURITY: the API key must never be written into the notebook. Provide it through the
# WANDB_API_KEY environment variable before starting Jupyter; when it is not set,
# wandb.login() falls back to stored credentials or an interactive prompt.
# NOTE(review): a key was previously committed in this cell - treat it as compromised and
# revoke/rotate it in the WandB account settings.
if not os.environ.get("WANDB_API_KEY"):
    wandb.login()


# Syntax for using WandB:

# wandb.init(project="MSciProject", name="name", notebook="your-notebook-name")
# code here
# wandb.finish()

In [4]:
# Read the prepared modelling table; 'time' is parsed as datetime so it can be filtered by year later
csv_file = data_path / 'for_model.csv'
data = pd.read_csv(csv_file, parse_dates=['time'])

# Preview five random rows as a quick sanity check
data.sample(5)

Unnamed: 0,time,flag,u10_0,u10_1,u10_2,u10_3,u10_4,u10_5,u10_6,u10_7,...,v500_2_past,v500_3_past,v500_4_past,v500_5_past,v500_6_past,v500_7_past,v500_8_past,v500_13_past,v500_14_past,v500_15_past
40431,2012-03-24 23:07:00,0.0,-3.86414,-3.476183,-2.101937,-3.815259,-5.041623,-1.502368,0.217141,-0.826692,...,5.46829,1.952009,2.529173,15.111033,-2.246512,8.620608,10.601848,12.580041,5.421081,27.003036
63351,2019-01-04 15:31:00,0.0,-1.973831,3.367951,3.906042,1.590331,-6.07883,-6.78576,-4.714386,-3.498192,...,0.999656,8.354942,2.649332,6.596713,4.749667,5.585197,10.72666,-8.023397,12.220085,18.572079
54809,2016-07-19 20:57:00,0.0,2.642882,-4.230156,-2.129295,0.984587,2.490585,4.346284,5.13802,6.303338,...,8.515758,13.82387,15.025303,21.168,12.097638,22.203331,15.527683,21.698912,-9.775781,4.823821
10613,2003-12-12 03:25:00,0.0,-6.433543,-4.223824,-2.831567,-2.368298,4.978434,7.335835,5.974217,7.604237,...,-10.830469,-5.739624,-4.144861,-1.000995,10.095483,10.162339,10.75426,-2.263107,12.968663,14.535705
72,1998-01-15 18:12:00,0.0,13.008478,1.677077,1.304607,9.146038,9.406033,11.490072,13.821066,11.081741,...,6.771668,15.63261,4.758188,5.296753,6.562135,7.843885,1.251789,-3.636217,9.341717,-5.72827


In [8]:
# Split the data into training and testing sets based on the date.
# The (first_year, last_year) bounds are inclusive and defined once, so the row filters and
# the date ranges logged to WandB are always derived from the same values.
train_years = (2017, 2019)
test_years = (2020, 2022)

year = data['time'].dt.year
train_data = data[year.between(*train_years)]
test_data = data[year.between(*test_years)]

# Lengths printed here are counted before rows with a missing 'flag' are removed below.
print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}. Length: {len(train_data)}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}. Length: {len(test_data)}")

# saving the date ranges for WandB tracking
training_date_range = f"{train_years[0]}-01-01 to {train_years[1]}-12-31"
testing_date_range = f"{test_years[0]}-01-01 to {test_years[1]}-12-31"

# Drop the "time" column as it won't be used in the model
train_data = train_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# Rows whose target ('flag') is NaN cannot be used for fitting or scoring. Removing them
# before separating features from target keeps X and y aligned row-for-row.
train_labelled = train_data.dropna(subset=['flag'])
test_labelled = test_data.dropna(subset=['flag'])

# Define the features (X) and the target (y)
X_train = train_labelled.drop(columns=['flag'])
y_train = train_labelled['flag']
X_test = test_labelled.drop(columns=['flag'])
y_test = test_labelled['flag']

# Cheap invariants: features and labels must refer to exactly the same rows.
assert X_train.index.equals(y_train.index), "X_train / y_train are misaligned"
assert X_test.index.equals(y_test.index), "X_test / y_test are misaligned"

Train range: 2017-01-01 00:01:00 -> 2019-12-31 22:31:00. Length: 9942
Test range: 2020-01-01 00:41:00 -> 2022-12-31 21:57:00. Length: 9561


### Models

#### Default Hyperparameters

In [9]:
# Baseline: gradient boosted trees with scikit-learn's default settings (seeded for repeatability).
# fit() returns the estimator itself, so construction and training can be chained.
gbt_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict on both splits so training and testing scores can be compared for overfitting
y_pred_gbt_train = gbt_model.predict(X_train)
y_pred_gbt_test = gbt_model.predict(X_test)

# (true labels, predicted labels) for the training split first, then the testing split
splits = ((y_train, y_pred_gbt_train), (y_test, y_pred_gbt_test))

# performance metrics, each evaluated as a (train, test) pair
precision_train, precision_test = (precision_score(truth, pred) for truth, pred in splits)
recall_train, recall_test = (recall_score(truth, pred) for truth, pred in splits)
f1_train, f1_test = (f1_score(truth, pred) for truth, pred in splits)

# Report every metric for the training set, then the testing set
report = {
    "Precision": (precision_train, precision_test),
    "Recall": (recall_train, recall_test),
    "F1 Score": (f1_train, f1_test),
}
for metric_name, (train_value, test_value) in report.items():
    print(f"{metric_name} on Training Set = {train_value:.3f}")
    print(f"{metric_name} on Testing Set = {test_value:.3f}")

Precision on Training Set = 0.899
Precision on Testing Set = 0.706
Recall on Training Set = 0.627
Recall on Testing Set = 0.380
F1 Score on Training Set = 0.739
F1 Score on Testing Set = 0.494


#### Grid Search for Hyperparameter Tuning

In [6]:
# An untrained classifier is enough to inspect scikit-learn's defaults
gbt_classifier = GradientBoostingClassifier()

# Print the default hyperparameters as a reference point for the tuning search
params = gbt_classifier.get_params()
print(params)

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [None]:
model = GradientBoostingClassifier(random_state=42)

# Candidate values for the randomized search
param_dist = {
    'n_estimators': np.arange(50, 301, 50),
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
    'max_depth': np.arange(3, 15, 1),
    'min_samples_split': np.arange(2, 11, 1),
    'min_samples_leaf': np.arange(1, 11, 1),
    'subsample': [0.7, 0.8, 0.9, 1.0]
}

# The full grid above has 6 * 5 * 12 * 9 * 10 * 4 = 129,600 combinations, i.e. 648,000 model
# fits with 5-fold cross-validation, which is not feasible with an exhaustive GridSearchCV.
# A randomized search evaluates only `n_iter` sampled combinations instead; raise n_iter for a
# more thorough (and slower) search. random_state makes the sampled candidates repeatable.
# The variable keeps its original name `grid_search` so any later cell using it still works.
grid_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Perform the randomized search on the training data
grid_search.fit(X_train, y_train)

# Best sampled combination and its mean cross-validated F1 score
best_params = grid_search.best_params_
best_f1 = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best F1 score: {best_f1:.3f}")

#### Exploring Optimised Hyperparameters

In [None]:
# Train and evaluate a gradient boosted tree, logging the scores to WandB.
# Using the run as a context manager guarantees the run is closed even if training or
# scoring raises, instead of leaving a dangling run when wandb.finish() is never reached.
with wandb.init(project="GBT") as run:
    # Seeded so repeated runs are comparable, consistent with the other models in this notebook.
    # NOTE(review): no tuned hyperparameters are passed here although this section is titled
    # "Exploring Optimised Hyperparameters" - confirm whether the search results should be used.
    gbt_model = GradientBoostingClassifier(random_state=42)

    gbt_model.fit(X_train, y_train)

    # Predictions
    y_pred_gbt_test = gbt_model.predict(X_test)
    y_pred_gbt_train = gbt_model.predict(X_train)

    # calculating scores
    precision_test = precision_score(y_test, y_pred_gbt_test)
    precision_train = precision_score(y_train, y_pred_gbt_train)
    recall_test = recall_score(y_test, y_pred_gbt_test)
    recall_train = recall_score(y_train, y_pred_gbt_train)
    f1_test = f1_score(y_test, y_pred_gbt_test)
    f1_train = f1_score(y_train, y_pred_gbt_train)

    print(f"Precision on Training Set = {precision_train:.3f}")
    print(f"Precision on Testing Set = {precision_test:.3f}")
    print(f"Recall on Training Set = {recall_train:.3f}")
    print(f"Recall on Testing Set = {recall_test:.3f}")
    print(f"F1 Score on Training Set = {f1_train:.3f}")
    print(f"F1 Score on Testing Set = {f1_test:.3f}")

    # The model name must describe the model actually trained (previously mislabelled
    # as "Neural Network", which would corrupt comparisons between runs in the dashboard).
    run.log({"model_name":"Gradient Boosted Tree", "training_precision":precision_train, "testing_precision":precision_test,
             "training_recall":recall_train, "testing_recall":recall_test, "training_f1":f1_train, "testing_f1":f1_test,
             "training date range": training_date_range, "testing date range": testing_date_range})

In [None]:
# Persist the fitted classifier alongside the data files so it can be reloaded for later analysis
model_file = data_path / 'gbt_model.joblib'
dump(gbt_model, model_file)