In [93]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import xgboost as xgb

from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
# import feature data types
import json
with open('python_scripts/data_types.json') as f:
    data_types = json.load(f) 

### Generate validation set
- New test set is purely for evaluation purposes and will remain untouched
- New validation set will be used in hyperparameter tuning

In [94]:
X_test = pd.read_csv('data/X_test_encoded_transformed_scaled.csv')
y_test = pd.read_csv('data/y_test.csv')

# extract validation set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

In [95]:
y_test.fraud_bool.sum()

1086

In [96]:
y_val.fraud_bool.sum()

1113

### Baseline Model

In [97]:
X_train = pd.read_csv('data/X_train_encoded_transformed_scaled.csv')
y_train = pd.read_csv('data/y_train.csv')

In [100]:
# baseline model on imbalanced training set
baseline_model = XGBClassifier(
    eval_metric='aucpr',
    random_state=0
)

baseline_model.fit(X_train, y_train)
y_pred = baseline_model.predict(X_test)

In [102]:
cm = confusion_matrix(y_test, y_pred,)
cm_df = pd.DataFrame(cm, index=['True Class 0', 'True Class 1'], columns=['Pred Class 0', 'Pred Class 1'])

precision = round(precision_score(y_test, y_pred),2)
recall = round(recall_score(y_test, y_pred), 2)
f1 = round(f1_score(y_test, y_pred), 2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

cm_df

Precision: 0.46
Recall: 0.02
F1-Score: 0.03


Unnamed: 0,Pred Class 0,Pred Class 1
True Class 0,98893,21
True Class 1,1068,18


### Balancing dataset

In [103]:
from imblearn.over_sampling import SMOTENC

In [104]:
# Handling class imbalance
# define nominal and continuous variables
numericals = data_types['numerical_continuous_bounded']+data_types['numerical_continuous_unbounded']+data_types['numerical_discrete']+data_types['ordinal']
to_drop = ['prev_address_months_count', 'bank_months_count'] # variables not used anymore
numericals = [i for i in numericals if i not in to_drop]
nominals = [i for i in X_train.columns if i not in numericals]

# specify categorical feature indices
categorical_columns_idx = [X_train.columns.get_loc(i) for i in nominals]

# apply smote nc (to handle nominal and categorical variables)
smote_nc = SMOTENC(categorical_features=categorical_columns_idx, sampling_strategy=1, random_state=0)
X_train_resampled, y_train_resampled = smote_nc.fit_resample(X_train, y_train)

y_train_resampled.value_counts()

fraud_bool
0             791170
1             791170
Name: count, dtype: int64

In [105]:
# # export balanced datasets

# import os
# data_folder = os.path.join(os.getcwd(), 'data')

# # X_train_resampled
# file_path = os.path.join(data_folder, 'X_train_resampled.csv')
# X_train_resampled.to_csv(file_path, index=False)

# # y_train_resampled
# file_path = os.path.join(data_folder, 'y_train_resampled.csv')
# y_train_resampled.to_csv(file_path, index=False)

In [106]:
X_train_resampled = pd.read_csv('data/X_train_resampled.csv')
y_train_resampled = pd.read_csv('data/y_train_resampled.csv')


In [107]:
# baseline model with SMOTE
baseline_model_smote = XGBClassifier(
    eval_metric='aucpr',
    random_state=0
)


baseline_model_smote.fit(X_train_resampled, y_train_resampled)
y_pred = baseline_model_smote.predict(X_test)

In [108]:
cm = confusion_matrix(y_test, y_pred,)
cm_df = pd.DataFrame(cm, index=['True Class 0', 'True Class 1'], columns=['Pred Class 0', 'Pred Class 1'])

precision = round(precision_score(y_test, y_pred),2)
recall = round(recall_score(y_test, y_pred), 2)
f1 = round(f1_score(y_test, y_pred), 2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

cm_df

Precision: 0.24
Recall: 0.11
F1-Score: 0.15


Unnamed: 0,Pred Class 0,Pred Class 1
True Class 0,98515,399
True Class 1,963,123


## Hyper parameter tuning

not used in tuning, will add in if there appears to be over fitting, these use samples of the whole training data in each base tree:
- subsample (0-1)
- colsample_bytree (0-1)



### Deep dive with more combinations and WITHOUT smote, using xgboost inbuild scale_pos_weight parameter

In [109]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='aucpr'
)

# set scale_pos_weight values
scale_pos_weight_1 = (len(y_train) - y_train.fraud_bool.sum()) / y_train.fraud_bool.sum()
scale_pos_weight_2 = np.sqrt((len(y_train) - y_train.fraud_bool.sum()) / y_train.fraud_bool.sum())
scale_pos_weight_3 = np.mean([scale_pos_weight_1, scale_pos_weight_2])

# Define the hyperparameters to search
param_dist = {
    'learning_rate': np.linspace(0.01, 0.1, 10),
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'reg_lambda': [0, 0.1, 1, 10, 100], #l2
    'reg_alpha': [0, 0.1, 1, 10, 100],  # l1
    'n_estimators': [1000],
    'early_stopping_rounds': [15],
    'scale_pos_weight': [scale_pos_weight_1, scale_pos_weight_2],
    'max_delta_step': [0, 1, 5, 10],
    'gamma': [0, 0.1, 0.5, 1, 2]
}

# RandomizedSearchCV setup for parameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20, 
    scoring='average_precision',  
    cv=3,  
    verbose=0,  
    random_state=0
)

# Fit the model (train and tune hyperparameters)
random_search.fit(X_train, y_train, 
                  eval_set=[(X_val, y_val)])

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

[0]	validation_0-aucpr:0.04177
[1]	validation_0-aucpr:0.04915
[2]	validation_0-aucpr:0.05703
[3]	validation_0-aucpr:0.06233
[4]	validation_0-aucpr:0.06968
[5]	validation_0-aucpr:0.07510
[6]	validation_0-aucpr:0.07815
[7]	validation_0-aucpr:0.08179
[8]	validation_0-aucpr:0.08734
[9]	validation_0-aucpr:0.09158
[10]	validation_0-aucpr:0.09570
[11]	validation_0-aucpr:0.09598
[12]	validation_0-aucpr:0.09724
[13]	validation_0-aucpr:0.09956
[14]	validation_0-aucpr:0.10592
[15]	validation_0-aucpr:0.10586
[16]	validation_0-aucpr:0.11251
[17]	validation_0-aucpr:0.11415
[18]	validation_0-aucpr:0.11430
[19]	validation_0-aucpr:0.11564
[20]	validation_0-aucpr:0.11939
[21]	validation_0-aucpr:0.12094
[22]	validation_0-aucpr:0.12166
[23]	validation_0-aucpr:0.12221
[24]	validation_0-aucpr:0.12324
[25]	validation_0-aucpr:0.12691
[26]	validation_0-aucpr:0.12667
[27]	validation_0-aucpr:0.13025
[28]	validation_0-aucpr:0.13021
[29]	validation_0-aucpr:0.13139
[30]	validation_0-aucpr:0.13327
[31]	validation_0-

In [110]:
cm = confusion_matrix(y_test, y_pred,)
cm_df = pd.DataFrame(cm, index=['True Class 0', 'True Class 1'], columns=['Pred Class 0', 'Pred Class 1'])

precision = round(precision_score(y_test, y_pred),2)
recall = round(recall_score(y_test, y_pred), 2)
f1 = round(f1_score(y_test, y_pred), 2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

cm_df

Precision: 0.23
Recall: 0.23
F1-Score: 0.23


Unnamed: 0,Pred Class 0,Pred Class 1
True Class 0,98082,832
True Class 1,832,254


In [111]:
results_df = pd.DataFrame(random_search.cv_results_)
results_df.sort_values('rank_test_score', ascending=True).reset_index(drop=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scale_pos_weight,param_reg_lambda,param_reg_alpha,param_n_estimators,param_min_child_weight,param_max_depth,param_max_delta_step,param_learning_rate,param_gamma,param_early_stopping_rounds,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,35.319085,1.136286,1.071332,0.37494,9.46574,0.1,0.1,1000,1,3,10,0.08,0.1,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.166116,0.156494,0.17189,0.164834,0.006351,1
1,38.667375,4.897349,1.295604,0.372342,9.46574,0.1,100.0,1000,3,5,10,0.09,0.0,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.165037,0.154493,0.173063,0.164197,0.007604,2
2,57.526711,2.200314,1.513616,0.134494,9.46574,0.0,1.0,1000,3,3,5,0.04,1.0,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.165766,0.155262,0.171104,0.164044,0.006581,3
3,36.250952,3.357679,1.23172,0.291929,9.46574,0.0,100.0,1000,5,5,0,0.1,0.1,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.165332,0.153536,0.170613,0.16316,0.007139,4
4,30.394335,2.738235,0.955776,0.102508,9.46574,100.0,10.0,1000,5,5,0,0.08,0.0,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.163906,0.153021,0.171097,0.162675,0.007431,5
5,131.753647,15.687769,5.54728,0.529109,9.46574,100.0,10.0,1000,5,7,5,0.01,2.0,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.162719,0.151444,0.163316,0.15916,0.005461,6
6,38.785959,5.570175,2.307246,0.219827,9.46574,0.1,100.0,1000,1,9,10,0.06,1.0,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.159657,0.150311,0.166142,0.158703,0.006498,7
7,25.071665,1.610441,0.773764,0.02379,89.600227,0.0,1.0,1000,3,5,10,0.07,2.0,15,"{'scale_pos_weight': 89.60022650056625, 'reg_l...",0.160336,0.149461,0.165637,0.158478,0.006733,8
8,22.349561,3.658636,0.892723,0.097549,9.46574,10.0,0.1,1000,3,7,0,0.09,0.1,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.159257,0.148649,0.164379,0.157428,0.00655,9
9,27.745151,4.180902,1.268923,0.423072,9.46574,0.1,100.0,1000,5,9,1,0.08,0.1,15,"{'scale_pos_weight': 9.465739617196654, 'reg_l...",0.159617,0.149006,0.163128,0.15725,0.006003,10


### Deep dive this time using SMOTE data sets

In [112]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='aucpr',
)

# Define the hyperparameters to search
param_dist = {
    'learning_rate': np.linspace(0.01, 0.1, 10),
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'reg_lambda': [0, 0.1, 1, 10, 100], #l2
    'reg_alpha': [0, 0.1, 1, 10, 100],  # l1
    'n_estimators': [1000],
    'early_stopping_rounds': [15],
    # 'scale_pos_weight': [scale_pos_weight_1], #only using 1 for now
    'max_delta_step': [0, 1, 5, 10],
    'gamma': [0, 0.1, 0.5, 1, 2]
}

# RandomizedSearchCV setup for parameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20, 
    scoring='average_precision',  
    cv=3,  
    verbose=1,  
    random_state=0
)

# Fit the model (train and tune hyperparameters)
random_search.fit(X_train_resampled, y_train_resampled, 
                  eval_set=[(X_val, y_val)])

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[0]	validation_0-aucpr:0.02836
[1]	validation_0-aucpr:0.02836
[2]	validation_0-aucpr:0.02836
[3]	validation_0-aucpr:0.02836
[4]	validation_0-aucpr:0.03258
[5]	validation_0-aucpr:0.03246
[6]	validation_0-aucpr:0.03946
[7]	validation_0-aucpr:0.03886
[8]	validation_0-aucpr:0.03974
[9]	validation_0-aucpr:0.03971
[10]	validation_0-aucpr:0.03906
[11]	validation_0-aucpr:0.03908
[12]	validation_0-aucpr:0.03956
[13]	validation_0-aucpr:0.03978
[14]	validation_0-aucpr:0.03983
[15]	validation_0-aucpr:0.04395
[16]	validation_0-aucpr:0.04417
[17]	validation_0-aucpr:0.04497
[18]	validation_0-aucpr:0.04560
[19]	validation_0-aucpr:0.04583
[20]	validation_0-aucpr:0.05069
[21]	validation_0-aucpr:0.05101
[22]	validation_0-aucpr:0.05169
[23]	validation_0-aucpr:0.05853
[24]	validation_0-aucpr:0.06599
[25]	validation_0-aucpr:0.06671
[26]	validation_0-aucpr:0.06683
[27]	validation_0-aucpr:0.06669
[28]	validation_0-aucpr:0.06782
[29]	validation_0-auc

In [113]:
results_df = pd.DataFrame(random_search.cv_results_)
results_df.sort_values('rank_test_score', ascending=True).reset_index(drop=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_lambda,param_reg_alpha,param_n_estimators,param_min_child_weight,param_max_depth,param_max_delta_step,param_learning_rate,param_gamma,param_early_stopping_rounds,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,34.384313,17.381509,1.941138,1.072042,0.1,10.0,1000,5,9,0,0.09,1.0,15,"{'reg_lambda': 0.1, 'reg_alpha': 10, 'n_estima...",0.99346,0.997428,0.999841,0.99691,0.002631,1
1,41.462564,12.946978,2.359107,0.729962,100.0,1.0,1000,5,9,0,0.05,0.1,15,"{'reg_lambda': 100, 'reg_alpha': 1, 'n_estimat...",0.992661,0.999079,0.99589,0.995877,0.00262,2
2,47.975195,0.466238,2.603479,0.129999,0.0,0.0,1000,3,9,0,0.03,0.1,15,"{'reg_lambda': 0, 'reg_alpha': 0, 'n_estimator...",0.993376,0.996694,0.997142,0.995737,0.00168,3
3,21.871221,0.588071,1.203492,0.047486,10.0,10.0,1000,1,9,5,0.08,0.1,15,"{'reg_lambda': 10, 'reg_alpha': 10, 'n_estimat...",0.992983,0.996061,0.996436,0.99516,0.001547,4
4,28.555899,4.436093,1.555751,0.1363,10.0,10.0,1000,5,9,10,0.05,1.0,15,"{'reg_lambda': 10, 'reg_alpha': 10, 'n_estimat...",0.992262,0.996886,0.991679,0.993609,0.002329,5
5,44.262491,1.940557,2.385758,0.141612,0.0,1.0,1000,1,9,1,0.03,0.1,15,"{'reg_lambda': 0, 'reg_alpha': 1, 'n_estimator...",0.991886,0.994618,0.993405,0.993303,0.001118,6
6,36.399009,4.395159,2.032476,0.088624,1.0,1.0,1000,5,7,10,0.03,1.0,15,"{'reg_lambda': 1, 'reg_alpha': 1, 'n_estimator...",0.990861,0.993255,0.99006,0.991392,0.001357,7
7,31.156033,4.98624,1.632459,0.147569,0.1,10.0,1000,5,7,5,0.04,2.0,15,"{'reg_lambda': 0.1, 'reg_alpha': 10, 'n_estima...",0.991796,0.994084,0.987985,0.991288,0.002516,8
8,20.796312,4.566016,0.959828,0.167948,100.0,0.1,1000,1,5,0,0.1,1.0,15,"{'reg_lambda': 100, 'reg_alpha': 0.1, 'n_estim...",0.98791,0.988343,0.996258,0.990837,0.003837,9
9,24.328856,1.656115,1.201423,0.104329,0.1,0.0,1000,5,5,10,0.06,2.0,15,"{'reg_lambda': 0.1, 'reg_alpha': 0, 'n_estimat...",0.986591,0.991435,0.989669,0.989232,0.002001,10


In [114]:
cm = confusion_matrix(y_test, y_pred,)
cm_df = pd.DataFrame(cm, index=['True Class 0', 'True Class 1'], columns=['Pred Class 0', 'Pred Class 1'])

precision = round(precision_score(y_test, y_pred),2)
recall = round(recall_score(y_test, y_pred), 2)
f1 = round(f1_score(y_test, y_pred), 2)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

cm_df

Precision: 0.11
Recall: 0.34
F1-Score: 0.16


Unnamed: 0,Pred Class 0,Pred Class 1
True Class 0,95814,3100
True Class 1,722,364


### modifications:
- Changing my `eval_metric` to 'aucpr' and my `scoring` to 'average_position'
- Using balanced dataset with SMOTE and experimenting with some small values of `scale_pos_weight`

In [None]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='aucpr'
)

# set scale_pos_weight values
scale_pos_weight_1 = (len(y_train) - y_train.fraud_bool.sum()) / y_train.fraud_bool.sum()
scale_pos_weight_2 = np.sqrt((len(y_train) - y_train.fraud_bool.sum()) / y_train.fraud_bool.sum())
scale_pos_weight_3 = np.mean([scale_pos_weight_1, scale_pos_weight_2])
scale_pos_weight_4 = scale_pos_weight_2 / 2

# Define the hyperparameters to search
param_dist = {
    'learning_rate': np.linspace(0.01, 0.1, 10),
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'reg_lambda': [0, 0.1, 1, 10, 100], #l2
    'reg_alpha': [0, 0.1, 1, 10, 100],  # l1
    'n_estimators': [1000],
    'early_stopping_rounds': [15],
    'scale_pos_weight': [scale_pos_weight_2, scale_pos_weight_4, 1], # ~[9, 4.5, 1]
    'max_delta_step': [0, 1, 5, 10],
    'gamma': [0, 0.1, 0.5, 1, 2]
}

# RandomizedSearchCV setup for parameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=25, 
    scoring='average_precision',  
    cv=3,  
    verbose=0,  
    random_state=0
)

# Fit the model (train and tune hyperparameters)
random_search.fit(X_train, y_train, 
                  eval_set=[(X_val, y_val)])

# Get the best model and print the results
best_model = random_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))