## Gradient Boosted Tree

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix
from joblib import dump
import numpy as np

#### Initialising

In [None]:
# Initialising WandB
import os

import wandb

# SECURITY: the API key is a secret and must never be committed with the notebook.
# Provide it outside the source instead: export WANDB_API_KEY in the shell that
# launches Jupyter, or run `wandb login` once on this machine.
# Any key that was previously committed here should be revoked and regenerated.

# Tells WandB which notebook the runs originate from
os.environ["WANDB_NOTEBOOK_NAME"] = "gbt_models.ipynb"

# Syntax for using WandB:
#
#   wandb.init(project="MSciProject", name="name")
#   # code here
#   wandb.finish()

In [None]:
# Read the dataset from disk, parsing the "time" column into datetimes on load
data = pd.read_csv(
    '/Users/kirst/Downloads/uvf.csv',
    parse_dates=['time'],
)

# Sanity check: the file is expected to contain exactly 20 columns
assert data.shape[1] == 20

In [None]:
# "time" is parsed to datetimes by read_csv (parse_dates=['time']), so no
# explicit conversion is done here.
# NOTE(review): an earlier version parsed it with format='%d/%m/%Y %H:%M'; if the
# CSV is day-first, confirm that read_csv infers the dates correctly.

# Chronological split: train on everything up to the end of 2012 and test on
# 2014-2016. Rows from 2013 fall in neither set.
train_data = data[data['time'].dt.year <= 2012]
test_data = data[(data['time'].dt.year >= 2014) & (data['time'].dt.year <= 2016)]

print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}")

# Labels logged to WandB with every run; compare them with the ranges printed above
training_date_range = "2003-05-22 to 2012-12-31"
testing_date_range = "2014-01-01 to 2016-12-31"

# Drop the "time" column as it won't be used in the model
train_data = train_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# Report the size of each set. A bare expression is only displayed when it is
# the last line of a cell, so print explicitly.
print(f"Train rows: {len(train_data)}, test rows: {len(test_data)}")

# Define the features (X) and the target (y)
X_train = train_data.drop(columns=['flag'])
y_train = train_data['flag']
X_test = test_data.drop(columns=['flag'])
y_test = test_data['flag']

# Remove rows whose target is missing, keeping features and target aligned.
# A boolean mask is used rather than .loc[index] so that a non-unique index
# cannot duplicate rows.
train_labelled = y_train.notna()
test_labelled = y_test.notna()

X_train, y_train = X_train[train_labelled], y_train[train_labelled]
X_test, y_test = X_test[test_labelled], y_test[test_labelled]

# 20 columns minus "time" and "flag" leaves 18 features
assert len(X_train.columns) == 18

### Initial Model

In [None]:
wandb.init(project="MSciProject - GBT")

# Initialise the gradient boosted tree model.
# GradientBoostingClassifier has no `bootstrap` parameter (it belongs to the
# forest estimators) and passing it raises a TypeError, so it is not set here.
# Row subsampling for boosting is controlled by `subsample`.
gbt_model = GradientBoostingClassifier(random_state=42, subsample=0.8, n_estimators=50,
                                       min_samples_split=2, min_samples_leaf=10, max_depth=5,
                                       learning_rate=0.01)

# Train the model
gbt_model.fit(X_train, y_train)

# Hard class predictions for the threshold-based metrics
y_pred_gbt = gbt_model.predict(X_test)

# Predicted probability of the positive class (second column of predict_proba,
# following the order of gbt_model.classes_) for the ranking-based metrics
y_proba_gbt = gbt_model.predict_proba(X_test)[:, 1]

# Threshold-based performance metrics
accuracy = accuracy_score(y_test, y_pred_gbt)
precision = precision_score(y_test, y_pred_gbt)
recall = recall_score(y_test, y_pred_gbt)
f1 = f1_score(y_test, y_pred_gbt)

# ROC AUC must be computed from continuous scores; with hard labels the curve
# collapses to a single operating point.
auc_roc = roc_auc_score(y_test, y_proba_gbt)

# Area under the precision-recall curve, likewise computed from the scores
precision1, recall1, _ = precision_recall_curve(y_test, y_proba_gbt)
auc_pr = auc(recall1, precision1)

# Specificity
threshold = 0.5
# Convert predicted probabilities to binary predictions using the threshold
# (assumes the target is encoded as 0/1)
y_pred_binary = (y_proba_gbt > threshold).astype(int)

conf_matrix = confusion_matrix(y_test, y_pred_binary)
tn, fp, fn, tp = conf_matrix.ravel()

specificity = tn / (tn + fp)

# Show the metrics in the notebook as well as logging them
print(f"accuracy={accuracy:.4f}, precision={precision:.4f}, recall={recall:.4f}, "
      f"auc_roc={auc_roc:.4f}, f1={f1:.4f}, auc_pr={auc_pr:.4f}, specificity={specificity:.4f}")

wandb.log({"model_name":"Gradient boosted tree", "accuracy":accuracy, "precision":precision,
           "recall":recall, "auc_roc":auc_roc, "f1_score":f1, "auc_pr":auc_pr, "specificity":specificity,
           "training date range": training_date_range, "testing date range": testing_date_range})

wandb.finish()

### Optimising Hyperparameters

In [None]:
# Instantiate an untuned estimator purely to inspect its hyperparameters
gbt_classifier = GradientBoostingClassifier()

# get_params() returns the hyperparameter names and their current values
params = gbt_classifier.get_params()
print(params)

In [None]:
# Candidate values for each hyperparameter explored by the randomised search
param_dist = {
    'n_estimators': np.arange(50, 301, 50),
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
    'max_depth': np.arange(3, 15, 1),
    'min_samples_split': np.arange(2, 11, 1),
    'min_samples_leaf': np.arange(1, 11, 1),
    'subsample': [0.7, 0.8, 0.9, 1],
}

# Randomised search: 20 sampled configurations, each scored on recall
# with 5-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Fit every sampled configuration on the training data
random_search.fit(X_train, y_train)

# Keep the winning configuration and its mean cross-validated recall
best_params_random, best_recall_random = (
    random_search.best_params_,
    random_search.best_score_,
)

best_params_random, best_recall_random

### Optimised Model

In [None]:
wandb.init(project="MSciProject")

# Run model with best parameters
# try this against gradient boosted tree and compare?
# NOTE(review): this section of the gradient boosted tree notebook fits a random
# forest with hard-coded hyperparameters; best_params_random from the randomised
# search is not used. Confirm this is the model that should be logged and saved.

rf_model = RandomForestClassifier(random_state=42,
                                  max_depth=20,
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  n_estimators=200,
                                  bootstrap=False)


rf_model.fit(X_train, y_train)

# Hard class predictions for the threshold-based metrics
y_pred = rf_model.predict(X_test)

# Predicted probability of the positive class (second column of predict_proba,
# following the order of rf_model.classes_) for the ranking-based metrics
y_proba = rf_model.predict_proba(X_test)[:, 1]

# Threshold-based performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC AUC must be computed from continuous scores; with hard labels the curve
# collapses to a single operating point.
auc_roc = roc_auc_score(y_test, y_proba)

# Area under the precision-recall curve, likewise computed from the scores
precision1, recall1, _ = precision_recall_curve(y_test, y_proba)
auc_pr = auc(recall1, precision1)

# Specificity
threshold = 0.5
# Convert predicted probabilities to binary predictions using the threshold
# (assumes the target is encoded as 0/1)
y_pred_binary = (y_proba > threshold).astype(int)

conf_matrix = confusion_matrix(y_test, y_pred_binary)
tn, fp, fn, tp = conf_matrix.ravel()

specificity = tn / (tn + fp)

# Show the metrics in the notebook as well as logging them
print(f"accuracy={accuracy:.4f}, precision={precision:.4f}, recall={recall:.4f}, "
      f"auc_roc={auc_roc:.4f}, f1={f1:.4f}, auc_pr={auc_pr:.4f}, specificity={specificity:.4f}")

wandb.log({"model_name":"Final Random Forest, bootstrap=False", "accuracy":accuracy, "precision":precision,
           "recall":recall, "auc_roc":auc_roc, "f1_score":f1, "auc_pr":auc_pr, "specificity":specificity,
           "training date range": training_date_range, "testing date range": testing_date_range })

wandb.finish()

In [None]:
# Persist the fitted random forest to disk with joblib
model_path = '/Users/kirst/Downloads/rf_model.joblib'
dump(rf_model, model_path)