## xgboost Model (SMOTE + Tomek)

We will now see whether changing the sampling strategy (SMOTE over-sampling combined with Tomek-link under-sampling) improves on the plain xgboost model.

In [1]:
# data manipulation
import pandas as pd
import os
import numpy as np

# modeling
from xgboost import XGBClassifier

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as imbPipeline

# custom helper functions
from src.models import cross_validate as cv

In [2]:
# Directory holding the processed data; both files below live inside it.
DATA_PATH = '../data/processed/'
OBS_PATH, RESULTS_PATH = (
    os.path.join(DATA_PATH, file_name)
    for file_name in ('observations_features.csv', 'results.csv')
)

# Label under which this experiment's scores are logged.
model = 'xgb_SMOTE_Tomek'

### Load data

In [3]:
# Read the observation-level feature table from disk.
obs = pd.read_csv(filepath_or_buffer=OBS_PATH)

# Preview the first five rows as the cell output.
obs.head(n=5)

Unnamed: 0,session_id,seq,buy_event,visitor_id,view_count,session_length,item_views,add_to_cart_count,transaction_count,avg_avail
0,1000001_251341,2.0,0,1000001,1.0,0.0,1.0,0.0,0.0,0.0
1,1000007_251343,2.0,0,1000007,1.0,0.0,1.0,0.0,0.0,0.0
2,1000042_251344,2.0,0,1000042,1.0,0.0,1.0,0.0,0.0,1.0
3,1000057_251346,2.0,0,1000057,1.0,0.0,1.0,0.0,0.0,1.0
4,1000067_251351,2.0,0,1000067,1.0,0.0,1.0,0.0,0.0,0.0


### Perform Train/Test split

In [4]:
# The project helper returns the train/test features and labels in this order.
X_train, X_test, y_train, y_test = cv.create_Xy(obs)

# Mean of the training labels, reported as a percentage.
train_class_balance = y_train.mean()
print(f'Class balance: {train_class_balance:.2%}')

Class balance: 1.57%


### Modeling

In [5]:
# Fixed seed so the randomised SMOTE + Tomek resampling (and the booster)
# produce repeatable scores on a fresh "Restart & Run All".
RANDOM_STATE = 42

# The resampler is a pipeline step instead of being applied to X_train up front.
# NOTE(review): presumably cv.cv_model then resamples only the training part of
# each fold -- confirm against src.models.cross_validate.
pipe = imbPipeline([
    ('smote', SMOTETomek(random_state=RANDOM_STATE)),
    ('xgb', XGBClassifier(n_estimators=500, random_state=RANDOM_STATE))
])

cv_results = cv.cv_model(X_train, y_train, pipe)

# Last expression of the cell: show this model's score summary.
cv.log_scores(cv_results, model)

Unnamed: 0,avg_accuracy,std_accuracy,avg_precision,std_precision,avg_recall,std_recall,avg_f1,std_f1,avg_auc,std_auc
xgb_SMOTE_Tomek,0.933256,0.007034,0.216817,0.025067,0.059006,0.003155,0.092508,0.004382,0.621756,0.015873


### Save the results

In [6]:
# Load the shared results table; its first column holds the model labels.
previous_results = pd.read_csv(RESULTS_PATH, index_col=0)

# Drop any earlier row for this model so re-running the cell does not create
# duplicates, then add the fresh scores.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): assumes cv.log_scores returns a one-row DataFrame indexed by
# the model label, as its displayed output suggests.
results = pd.concat(
    [
        previous_results.drop(index=model, errors='ignore'),
        cv.log_scores(cv_results, model),
    ],
    sort=False,
)

results.to_csv(RESULTS_PATH)
results

Unnamed: 0,avg_accuracy,std_accuracy,avg_precision,std_precision,avg_recall,std_recall,avg_f1,std_f1,avg_auc,std_auc
log_regression,0.478189,0.003034,0.84024,0.015868,0.024764,0.000464,0.048111,0.0009,0.752486,0.009845
random_forest,0.930277,0.001674,0.148949,0.016731,0.039709,0.003086,0.062687,0.005337,0.531354,0.013361
xgb,0.936546,0.003746,0.211411,0.023783,0.061001,0.005206,0.094584,0.008094,0.619189,0.016401
log_reg_SMOTE_Tomek,0.482685,0.002645,0.833033,0.015731,0.024773,0.000539,0.048115,0.001043,0.750921,0.00865
xgb_SMOTE_Tomek,0.933256,0.007034,0.216817,0.025067,0.059006,0.003155,0.092508,0.004382,0.621756,0.015873


### Next Steps

Applying SMOTE up-sampling plus Tomek-link down-sampling had no real effect on the overall validation AUC for the xgboost classifier (0.622 with resampling vs. 0.619 without).

Next, we will tune the hyperparameters of both the Logistic Regression model and the xgboost model.