In [27]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import xgboost as xgb

from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
# import feature data types
import json
with open('python_scripts/data_types.json') as f:
    data_types = json.load(f) 

### Generate validation set
- New test set is purely for evaluation purposes and will remain untouched
- New validation set will be used in hyperparameter tuning

In [34]:
X_test = pd.read_csv('data/X_test_encoded_transformed_scaled.csv')
y_test = pd.read_csv('data/y_test.csv')

# extract validation set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

In [37]:
y_test.fraud_bool.sum()

1086

In [38]:
y_val.fraud_bool.sum()

1113

### Baseline Model

In [32]:
X_train = pd.read_csv('data/X_train_encoded_transformed_scaled.csv')
y_train = pd.read_csv('data/y_train.csv')

In [39]:
# baseline model without SMOTE
baseline_model = XGBClassifier(
    eval_metric='logloss',
    random_state=0
)


baseline_model.fit(X_train, y_train)
y_pred = baseline_model.predict(X_test)

In [40]:
cm = confusion_matrix(y_test, y_pred,)
cm_df = pd.DataFrame(cm, index=['True Class 0', 'True Class 1'], columns=['Pred Class 0', 'Pred Class 1'])

precision = round(precision_score(y_test, y_pred),2)
recall = round(recall_score(y_test, y_pred), 2)
f1 = round(f1_score(y_test, y_pred), 2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

cm_df

Precision: 0.47
Recall: 0.03
F1-Score: 0.06


Unnamed: 0,Pred Class 0,Pred Class 1
True Class 0,98873,41
True Class 1,1049,37


### Balancing dataset

In [16]:
# from imblearn.over_sampling import SMOTENC

In [17]:
# # Handling class imbalance
# # define nominal and continuous variables
# numericals = data_types['numerical_continuous_bounded']+data_types['numerical_continuous_unbounded']+data_types['numerical_discrete']+data_types['ordinal']
# to_drop = ['prev_address_months_count', 'bank_months_count'] # variables not used anymore
# numericals = [i for i in numericals if i not in to_drop]
# nominals = [i for i in X_train.columns if i not in numericals]

# # specify categorical feature indices
# categorical_columns_idx = [X_train.columns.get_loc(i) for i in nominals]

# # apply smote nc (to handle nominal and categorical variables)
# smote_nc = SMOTENC(categorical_features=categorical_columns_idx, sampling_strategy=1, random_state=0)
# X_train_resampled, y_train_resampled = smote_nc.fit_resample(X_train, y_train)

# y_train_resampled.value_counts()

In [18]:
# # export balanced datasets

# import os
# data_folder = os.path.join(os.getcwd(), 'data')

# # X_train_resampled
# file_path = os.path.join(data_folder, 'X_train_resampled.csv')
# X_train_resampled.to_csv(file_path, index=False)

# # y_train_resampled
# file_path = os.path.join(data_folder, 'y_train_resampled.csv')
# y_train_resampled.to_csv(file_path, index=False)

In [11]:
X_train_resampled = pd.read_csv('data/X_train_resampled.csv')
y_train_resampled = pd.read_csv('data/y_train_resampled.csv')


In [41]:
# baseline model with SMOTE
baseline_model_smote = XGBClassifier(
    eval_metric='logloss',
    random_state=0
)


baseline_model_smote.fit(X_train_resampled, y_train_resampled)
y_pred = baseline_model_smote.predict(X_test)

In [42]:
cm = confusion_matrix(y_test, y_pred,)
cm_df = pd.DataFrame(cm, index=['True Class 0', 'True Class 1'], columns=['Pred Class 0', 'Pred Class 1'])

precision = round(precision_score(y_test, y_pred),2)
recall = round(recall_score(y_test, y_pred), 2)
f1 = round(f1_score(y_test, y_pred), 2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

cm_df

Precision: 0.19
Recall: 0.16
F1-Score: 0.17


Unnamed: 0,Pred Class 0,Pred Class 1
True Class 0,98138,776
True Class 1,909,177


## Hyper parameter tuning

not used in tuning, will add in if there appears to be over fitting, these use samples of the whole training data in each base tree:
- subsample (0-1)
- colsample_bytree (0-1)



#### initial light weight run:

In [43]:
# Define the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='logloss',
)

# Define the hyperparameters to search
param_dist = {
    'learning_rate': np.linspace(0.01, 0.1, 10),
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'reg_lambda': [0, 0.1, 1, 10], #l2
    'n_estimators': [500],
    'early_stopping_rounds': [10]

}

# RandomizedSearchCV setup for parameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=3, 
    scoring='f1',  
    cv=3,  
    verbose=1,  
    random_state=0
)

# Fit the model (train and tune hyperparameters)
random_search.fit(X_train_resampled, y_train_resampled, 
                  eval_set=[(X_val, y_val)],)

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 3 candidates, totalling 9 fits
[0]	validation_0-logloss:0.66169
[1]	validation_0-logloss:0.63346
[2]	validation_0-logloss:0.60762
[3]	validation_0-logloss:0.58488
[4]	validation_0-logloss:0.56351
[5]	validation_0-logloss:0.54404
[6]	validation_0-logloss:0.52639
[7]	validation_0-logloss:0.50992
[8]	validation_0-logloss:0.49450
[9]	validation_0-logloss:0.47981
[10]	validation_0-logloss:0.46664
[11]	validation_0-logloss:0.45415
[12]	validation_0-logloss:0.44219
[13]	validation_0-logloss:0.43142
[14]	validation_0-logloss:0.42133
[15]	validation_0-logloss:0.41091
[16]	validation_0-logloss:0.40200
[17]	validation_0-logloss:0.39366
[18]	validation_0-logloss:0.38585
[19]	validation_0-logloss:0.37788
[20]	validation_0-logloss:0.36998
[21]	validation_0-logloss:0.36266
[22]	validation_0-logloss:0.35541
[23]	validation_0-logloss:0.34897
[24]	validation_0-logloss:0.34244
[25]	validation_0-logloss:0.33648
[26]	validation_0-logloss:0.33089
[27]	validation_0-logloss:0.32434

In [44]:
best_model

In [45]:
cm = confusion_matrix(y_test, y_pred,)
cm_df = pd.DataFrame(cm, index=['True Class 0', 'True Class 1'], columns=['Pred Class 0', 'Pred Class 1'])

precision = round(precision_score(y_test, y_pred),2)
recall = round(recall_score(y_test, y_pred), 2)
f1 = round(f1_score(y_test, y_pred), 2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

cm_df

Precision: 0.2
Recall: 0.16
F1-Score: 0.18


Unnamed: 0,Pred Class 0,Pred Class 1
True Class 0,98214,700
True Class 1,912,174


In [46]:
results_df = pd.DataFrame(random_search.cv_results_)
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_lambda,param_n_estimators,param_min_child_weight,param_max_depth,param_learning_rate,param_early_stopping_rounds,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,107.897169,0.994795,5.439499,0.110374,0.0,500,3,7,0.05,10,"{'reg_lambda': 0, 'n_estimators': 500, 'min_ch...",0.978143,0.992826,0.992832,0.987933,0.006923,1
1,84.020977,4.148361,2.534475,0.248175,10.0,500,5,3,0.02,10,"{'reg_lambda': 10, 'n_estimators': 500, 'min_c...",0.934933,0.942212,0.942175,0.939773,0.003423,3
2,81.563593,2.114521,2.430142,0.069388,0.1,500,5,3,0.04,10,"{'reg_lambda': 0.1, 'n_estimators': 500, 'min_...",0.952637,0.963739,0.964739,0.960372,0.005484,2


- Deep dive with more combinations and WITHOUT smote, using xgboost inbuild scale_pos_weight parameter

In [70]:
# Define the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='logloss',
)

# set scale_pos_weight values
scale_pos_weight_1 = (len(y_train) - y_train.fraud_bool.sum()) / y_train.fraud_bool.sum()
scale_pos_weight_2 = np.sqrt((len(y_train) - y_train.fraud_bool.sum()) / y_train.fraud_bool.sum())
scale_pos_weight_3 = np.mean([scale_pos_weight_1, scale_pos_weight_2])

# Define the hyperparameters to search
param_dist = {
    'learning_rate': np.linspace(0.01, 0.1, 10),
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'reg_lambda': [0, 0.1, 1, 10, 100], #l2
    'reg_alpha': [0, 0.1, 1, 10, 100],  # l1
    'n_estimators': [1000],
    'early_stopping_rounds': [15],
    'scale_pos_weight': [scale_pos_weight_1], #only using 1 for now
    'max_delta_step': [0, 1, 5, 10],
    'gamma': [0, 0.1, 0.5, 1, 2]
}

# RandomizedSearchCV setup for parameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=35, 
    scoring='f1',  
    cv=3,  
    verbose=1,  
    random_state=0
)

# Fit the model (train and tune hyperparameters)
random_search.fit(X_train_resampled, y_train_resampled, 
                  eval_set=[(X_val, y_val)])

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 35 candidates, totalling 105 fits
[0]	validation_0-logloss:2.06916
[1]	validation_0-logloss:2.07257
[2]	validation_0-logloss:2.07631
[3]	validation_0-logloss:2.07988
[4]	validation_0-logloss:2.08371
[5]	validation_0-logloss:2.08746
[6]	validation_0-logloss:2.09152
[7]	validation_0-logloss:2.09471
[8]	validation_0-logloss:2.09887
[9]	validation_0-logloss:2.10287
[10]	validation_0-logloss:2.10715
[11]	validation_0-logloss:2.11153
[12]	validation_0-logloss:2.11507
[13]	validation_0-logloss:2.11994
[14]	validation_0-logloss:2.12467
[0]	validation_0-logloss:2.07013
[1]	validation_0-logloss:2.07398
[2]	validation_0-logloss:2.07816
[3]	validation_0-logloss:2.08216
[4]	validation_0-logloss:2.08647
[5]	validation_0-logloss:2.09055
[6]	validation_0-logloss:2.09491
[7]	validation_0-logloss:2.09851
[8]	validation_0-logloss:2.10300
[9]	validation_0-logloss:2.10750
[10]	validation_0-logloss:2.11211
[11]	validation_0-logloss:2.11742
[12]	validation_0-logloss:2.12252
[13]	v

In [72]:
cm = confusion_matrix(y_test, y_pred,)
cm_df = pd.DataFrame(cm, index=['True Class 0', 'True Class 1'], columns=['Pred Class 0', 'Pred Class 1'])

precision = round(precision_score(y_test, y_pred),2)
recall = round(recall_score(y_test, y_pred), 2)
f1 = round(f1_score(y_test, y_pred), 2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

cm_df

Precision: 0.11
Recall: 0.37
F1-Score: 0.17


Unnamed: 0,Pred Class 0,Pred Class 1
True Class 0,95566,3348
True Class 1,684,402


In [77]:
results_df = pd.DataFrame(random_search.cv_results_)
results_df.sort_values('rank_test_score', ascending=True).reset_index(drop=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scale_pos_weight,param_reg_lambda,param_reg_alpha,param_n_estimators,param_min_child_weight,param_max_depth,param_max_delta_step,param_learning_rate,param_gamma,param_early_stopping_rounds,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,276.341931,27.049635,15.071368,2.032681,89.600227,0.0,10.0,1000,3,9,5,0.1,0.5,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.982372,0.980092,0.979876,0.98078,0.001129079,1
1,236.956176,32.255012,13.612032,3.196598,89.600227,0.1,10.0,1000,5,9,0,0.09,1.0,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.980618,0.980342,0.980488,0.980483,0.0001126457,2
2,260.932875,4.705412,15.320077,0.754945,89.600227,10.0,10.0,1000,1,9,5,0.08,0.1,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.982159,0.975459,0.975763,0.977794,0.003089219,3
3,261.270557,6.260833,15.968639,0.322704,89.600227,10.0,10.0,1000,5,9,10,0.05,1.0,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.977747,0.962594,0.963516,0.967952,0.006936383,4
4,294.679395,5.124326,16.00456,0.610073,89.600227,100.0,1.0,1000,5,9,0,0.05,0.1,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.974194,0.958281,0.959115,0.963863,0.007312616,5
5,297.781051,8.985475,16.538848,0.428082,89.600227,0.0,0.0,1000,3,9,0,0.03,0.1,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.974272,0.95293,0.954054,0.960418,0.009806558,6
6,326.359968,8.76093,15.570857,1.16841,89.600227,1.0,100.0,1000,3,9,0,0.05,0.0,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.970731,0.951975,0.95272,0.958476,0.008671406,7
7,143.762105,9.618017,7.391023,0.518163,89.600227,0.0,100.0,1000,1,9,0,0.1,2.0,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.970814,0.946741,0.948255,0.95527,0.01100841,8
8,319.660765,8.36945,16.544316,0.602943,89.600227,100.0,0.1,1000,5,9,1,0.03,1.0,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.960153,0.938586,0.938823,0.945854,0.01011153,9
9,269.668597,8.500677,15.637895,0.778343,89.600227,0.0,1.0,1000,1,9,1,0.03,0.1,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.960821,0.936489,0.939352,0.945554,0.01085839,10
