In [1]:
import math
import os
from datetime import datetime

# matplotlib / seaborn are required by the plotting cells further down.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, precision_recall_curve, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from xgboost import XGBClassifier

from python_scripts.model_performance import generate_df_summary, predict_max_f1, save_model

pd.set_option('display.max_columns', None)
# import feature data types
import json
# Read the feature data-type definitions from the JSON file kept next to the helper scripts.
with open('python_scripts/data_types.json') as types_file:
    data_types = json.load(types_file)

import joblib

## Baseline Model

In [2]:
# Features are read from parquet files, labels from CSV files.
X_train = pd.read_parquet('data/X_train.parquet')
X_val = pd.read_parquet('data/X_val.parquet')

y_train = pd.read_csv('data/y_train.csv')
y_val = pd.read_csv('data/y_val.csv')

In [None]:
# Baseline: XGBoost with default hyperparameters, trained on the original
# (non-resampled) training set. fit() returns the estimator itself.
baseline_model = XGBClassifier(eval_metric='auc', random_state=0).fit(X_train, y_train)

# Class predictions on the validation features.
y_pred = baseline_model.predict(X_val)

# NOTE(review): generate_df_summary lives in python_scripts.model_performance;
# presumably it also records the metrics for later comparison — confirm.
generate_df_summary(
    baseline_model,
    y_val,
    y_pred,
    'baseline model',
)

In [None]:
# predict_max_f1 returns the validation predictions together with the threshold
# it selected (by its name, the F1-maximising one — confirm in the helper).
y_pred, best_threshold = predict_max_f1(baseline_model, X_val, y_val)
generate_df_summary(
    baseline_model,
    y_val,
    y_pred,
    'baseline model threshold optimised',
    threshold=best_threshold,
)

## Oversampled datasets

In [3]:
# Pre-computed oversampled training sets, one (features, labels) pair per
# resampling level; the numeric suffix matches the file name.
X_train_resampled_10, y_train_resampled_10 = (
    pd.read_parquet('data/resampled/X_train_10.parquet'),
    pd.read_parquet('data/resampled/y_train_10.parquet'),
)
X_train_resampled_25, y_train_resampled_25 = (
    pd.read_parquet('data/resampled/X_train_25.parquet'),
    pd.read_parquet('data/resampled/y_train_25.parquet'),
)
X_train_resampled_50, y_train_resampled_50 = (
    pd.read_parquet('data/resampled/X_train_50.parquet'),
    pd.read_parquet('data/resampled/y_train_50.parquet'),
)
X_train_resampled_100, y_train_resampled_100 = (
    pd.read_parquet('data/resampled/X_train_100.parquet'),
    pd.read_parquet('data/resampled/y_train_100.parquet'),
)

### 10:1 oversampling ratio

In [None]:
# Same default-hyperparameter model, trained on the 10:1 oversampled data.
# fit() returns the estimator itself, so construction and training are chained.
baseline_model_10 = XGBClassifier(eval_metric='auc', random_state=0).fit(
    X_train_resampled_10,
    y_train_resampled_10,
)

# Evaluation is always on the untouched validation set.
y_pred = baseline_model_10.predict(X_val)
generate_df_summary(
    baseline_model_10,
    y_val,
    y_pred,
    'baseline model with 10:1 resampling',
)

In [None]:
# Repeat the evaluation using the threshold chosen by predict_max_f1.
y_pred, best_threshold = predict_max_f1(baseline_model_10, X_val, y_val)
generate_df_summary(
    baseline_model_10,
    y_val,
    y_pred,
    'baseline model with 10:1 resampling threshold optimised',
    threshold=best_threshold,
)

### 4:1 oversampling ratio

In [None]:
# Same default-hyperparameter model, trained on the 4:1 oversampled data.
# fit() returns the estimator itself, so construction and training are chained.
baseline_model_25 = XGBClassifier(eval_metric='auc', random_state=0).fit(
    X_train_resampled_25,
    y_train_resampled_25,
)

# Evaluation is always on the untouched validation set.
y_pred = baseline_model_25.predict(X_val)
generate_df_summary(
    baseline_model_25,
    y_val,
    y_pred,
    'baseline model with 4:1 resampling',
)

In [None]:
# Repeat the evaluation using the threshold chosen by predict_max_f1.
y_pred, best_threshold = predict_max_f1(baseline_model_25, X_val, y_val)
generate_df_summary(
    baseline_model_25,
    y_val,
    y_pred,
    'baseline model with 4:1 resampling threshold optimised',
    threshold=best_threshold,
)

### 2:1 oversampling ratio

In [None]:
# Same default-hyperparameter model, trained on the 2:1 oversampled data.
# fit() returns the estimator itself, so construction and training are chained.
baseline_model_50 = XGBClassifier(eval_metric='auc', random_state=0).fit(
    X_train_resampled_50,
    y_train_resampled_50,
)

# Evaluation is always on the untouched validation set.
y_pred = baseline_model_50.predict(X_val)
generate_df_summary(
    baseline_model_50,
    y_val,
    y_pred,
    'baseline model with 2:1 resampling',
)

In [None]:
# Repeat the evaluation using the threshold chosen by predict_max_f1.
y_pred, best_threshold = predict_max_f1(baseline_model_50, X_val, y_val)
generate_df_summary(
    baseline_model_50,
    y_val,
    y_pred,
    'baseline model with 2:1 resampling threshold optimised',
    threshold=best_threshold,
)

### 1:1 oversampling ratio

In [None]:
# Same default-hyperparameter model, trained on the 1:1 oversampled data.
# fit() returns the estimator itself, so construction and training are chained.
baseline_model_100 = XGBClassifier(eval_metric='auc', random_state=0).fit(
    X_train_resampled_100,
    y_train_resampled_100,
)

# Evaluation is always on the untouched validation set.
y_pred = baseline_model_100.predict(X_val)

# Summarise the model trained in this cell. This call previously passed
# baseline_model_50 (copy-paste slip), pairing the 2:1 model with the 1:1
# predictions and label.
generate_df_summary(
    baseline_model_100,
    y_val,
    y_pred,
    'baseline model with 1:1 resampling',
)

In [None]:
# Repeat the evaluation using the threshold chosen by predict_max_f1.
y_pred, best_threshold = predict_max_f1(baseline_model_100, X_val, y_val)
generate_df_summary(
    baseline_model_100,
    y_val,
    y_pred,
    'baseline model with 1:1 resampling threshold optimised',
    threshold=best_threshold,
)

### Comparing baseline model performance

In [None]:
# Recorded model metrics.
# NOTE(review): presumably written by generate_df_summary — confirm.
mp_df = pd.read_csv('output/model_performance.csv')

ratios = ['Original', '10:1', '4:1', '2:1', '1:1']

# Even rows feed the 'threshold=0.5' bars, odd rows the 'f1 optimised' bars;
# only the first five of each are used.
# NOTE(review): assumes the CSV rows alternate default / optimised in the order
# the baseline cells were run — verify before trusting the chart.
f1_column = mp_df['F1-Score']
f1s = f1_column[::2].values[:5]
f1s_optimised = f1_column[1::2].values[:5]

bar_width = 0.35
x = np.arange(len(ratios))

# Grouped bar chart: one pair of bars per resampling ratio.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x - bar_width/2, f1s, bar_width, label='threshold=0.5', color='blue')
ax.bar(x + bar_width/2, f1s_optimised, bar_width, label='f1 optimised', color='orange')
ax.set_xlabel('Resampling Ratio')
ax.set_ylabel('F1 Score')
ax.set_title('Baseline f1-scores on different resampling ratios.')
ax.set_xticks(x)
ax.set_xticklabels(ratios)
ax.legend(fontsize=8)
plt.show()


## Hyperparameter tuning

### Deep dive with more combinations and WITHOUT SMOTE, using XGBoost's built-in scale_pos_weight parameter

In [None]:
# Base estimator for the search; the tuned hyperparameters are injected by
# RandomizedSearchCV from param_dist below.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=0,  # explicit seed, consistent with the baseline models
)

# Initial (coarse) parameter search space
param_dist = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 8, 10],
    'min_child_weight': [1, 3, 5, 7],
    'reg_lambda': [0.1, 1, 10, 100],
    'reg_alpha': [0, 0.1, 1, 10],
    # Multiplier applied to the positive-class weight. 1 is XGBoost's default
    # (no re-weighting). The grid previously started at 0, which removes the
    # positive class's contribution to training and wastes search iterations.
    'scale_pos_weight': [1, 5, 10, 25, 50],
    'max_delta_step': [0, 1, 5, 10],
    'gamma': [0, 0.1, 0.5, 1, 5],
    'subsample': [0.5, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.7, 0.9, 1.0],

    # Fixed (single-value) settings: a high cap on boosting rounds that is cut
    # short by early stopping on the eval_set passed to fit().
    'n_estimators': [1000],
    'early_stopping_rounds': [15],
}


# Randomized search: 100 sampled configurations, 3-fold CV, ranked by
# average precision.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring='average_precision',
    cv=3,
    verbose=0,
    random_state=0
)

# Run the search on the original (non-resampled) training data; eval_set is
# forwarded to every XGBClassifier.fit call for early stopping.
# NOTE(review): X_val drives early stopping here and is also the set the
# metrics below are computed on, so those metrics are likely optimistic.
random_search.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)])

# Estimator refit with the best parameter combination found
best_model = random_search.best_estimator_

# Class predictions on the validation features
y_pred = best_model.predict(X_val)

In [None]:
# Evaluate the winning estimator at the threshold chosen by predict_max_f1,
# then persist the whole search object (not just best_model).
y_pred, best_threshold = predict_max_f1(best_model, X_val, y_val)
generate_df_summary(
    best_model,
    y_val,
    y_pred,
    'random_search_99:1',
    threshold=best_threshold,
)
# NOTE(review): the other random-search cells pass a path ending in '.pkl';
# this one does not — confirm which form save_model expects. Also note that
# ':' is not a valid file-name character on Windows.
save_model(random_search, 'saved_models/random_search_99:1')

### 10:1 oversampling ratio

In [None]:
# Fit the model (train and tune hyperparameters)
random_search.fit(X_train_resampled_10, y_train_resampled_10, 
                  eval_set=[(X_val, y_val)])

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_val)

In [None]:
# Evaluate the winning estimator at the threshold chosen by predict_max_f1,
# then persist the whole search object (not just best_model).
y_pred, best_threshold = predict_max_f1(best_model, X_val, y_val)
generate_df_summary(
    best_model,
    y_val,
    y_pred,
    'random_search_10:1',
    threshold=best_threshold,
)
save_model(random_search, "saved_models/random_search_10:1.pkl")

### 4:1 oversampling ratio

In [None]:
# Fit the model (train and tune hyperparameters)
random_search.fit(X_train_resampled_25, y_train_resampled_25, 
                  eval_set=[(X_val, y_val)])

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_val)

In [None]:
# Evaluate the winning estimator at the threshold chosen by predict_max_f1,
# then persist the whole search object (not just best_model).
y_pred, best_threshold = predict_max_f1(best_model, X_val, y_val)
generate_df_summary(
    best_model,
    y_val,
    y_pred,
    'random_search_4:1',
    threshold=best_threshold,
)
save_model(random_search, "saved_models/random_search_4:1.pkl")

### 2:1 oversampling ratio

In [None]:
# Fit the model (train and tune hyperparameters)
random_search.fit(X_train_resampled_50, y_train_resampled_50, 
                  eval_set=[(X_val, y_val)])

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_val)

In [None]:
# Evaluate the winning estimator at the threshold chosen by predict_max_f1,
# then persist the whole search object (not just best_model).
y_pred, best_threshold = predict_max_f1(best_model, X_val, y_val)
generate_df_summary(
    best_model,
    y_val,
    y_pred,
    'random_search_2:1',
    threshold=best_threshold,
)
save_model(random_search, "saved_models/random_search_2:1.pkl")

### 1:1 oversampling ratio

In [None]:
# Fit the model (train and tune hyperparameters)
random_search.fit(X_train_resampled_100, y_train_resampled_100, 
                  eval_set=[(X_val, y_val)])

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_val)

In [None]:
# Evaluate the winning estimator at the threshold chosen by predict_max_f1,
# then persist the whole search object (not just best_model).
y_pred, best_threshold = predict_max_f1(best_model, X_val, y_val)
generate_df_summary(
    best_model,
    y_val,
    y_pred,
    'random_search_1:1',
    threshold=best_threshold,
)
save_model(random_search, "saved_models/random_search_1:1.pkl")

### Compare the performance of the best performing models from each random search

In [None]:
# do best threshold test on each of the winning models

### View the impact of hyperparameters on the performance of the winning model

In [None]:
# Name of the saved search whose results should be inspected. Kept in its own
# variable so the fitted estimator held in `best_model` is not overwritten by
# a string.
best_model_name = 'REPLACE STRING HERE' # <-----------------
# NOTE(review): the 'param_*' / 'rank_test_score' columns look like an export
# of the search's cv_results_ — confirm against save_model.
results_df = pd.read_csv(f'saved_models/{best_model_name}/results.csv')
to_analyse = ['param_subsample', 'param_scale_pos_weight',
       'param_reg_lambda', 'param_reg_alpha',
       'param_min_child_weight', 'param_max_depth', 'param_max_delta_step',
       'param_learning_rate', 'param_gamma', 'param_colsample_bytree']

# Subplot grid: three panels per row, as many rows as needed
num_vars = len(to_analyse)
cols = 3
rows = math.ceil(num_vars / cols)

fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 4))
axes = axes.flatten()

# One panel per hyperparameter: value vs. CV rank with a quadratic fit
for ax, param in zip(axes, to_analyse):
    sns.regplot(x=results_df["rank_test_score"],
                y=results_df[param],
                order=2,
                scatter_kws={'alpha': 0.5},
                ax=ax)

    # Linear correlation between the parameter value and the rank
    correlation = round(results_df[param].corr(results_df["rank_test_score"]), 2)

    ax.set_xlabel("Model Rank (Lower is Better)")
    ax.set_ylabel(param)
    ax.set_title(f"{param} vs. Model Rank (corr: {correlation})")

# Remove the unused panels at the end of the grid (no reliance on the loop
# variable leaking out of the loop above)
for spare_ax in axes[num_vars:]:
    fig.delaxes(spare_ax)
fig.suptitle("Relationship Between Hyperparameters and Model Rank", fontsize=25, y=1)
plt.tight_layout()
plt.show()


### Final random search just on the best weighting of oversampling with adjusted grid

In [None]:
# adjusted grid
# best oversampling ratio
# save best model as the winning model

### adjust winning model weights using validation set

In [None]:
# remove early stopping
# change number of boosting rounds
# save final hyperparameters in a yaml file

### fit model to the test set

In [None]:
# save final model as pkl ready to go