# Model Selection

In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path,
# presumably so that project-local packages (the notebook later imports
# `config.features`) can be resolved.
# NOTE: the location is derived from the working directory, not from this
# file's own location.
project_root = Path().resolve().parent

# Re-executing the cell must not keep stacking the same entry on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from config.features import CAT_FEATURES

# sklearn utilities
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold

from imblearn.pipeline import Pipeline as ImbalancePipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.ensemble import RUSBoostClassifier

from lightgbm import LGBMClassifier

# sklearn models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the full training table, then draw a 10% sample from each `isFraud`
# group separately (same seed for every group).
train = pd.read_parquet('../data/train.parquet')
sample_train = (
    train
    .groupby('isFraud', group_keys=False)[train.columns]
    .apply(lambda group: group.sample(frac=0.1, random_state=42))
)

# Report the shape of the full table, then of the sample.
print(train.shape, sample_train.shape, sep='\n')

In [None]:
# Target is the `isFraud` column; features are every other column.
y = sample_train['isFraud']
X = sample_train.drop(columns=['isFraud'])

# Hold out 33% of the sampled rows, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

print(X_train.shape, y_train.shape, sep='\n')

In [None]:
# Drop the names of the full and sampled frames; only the split pieces are
# used from here on in the visible notebook.
del train
del sample_train

### Pre-processing

In [None]:
# Swap +inf / -inf for the sentinel value -999 in both splits.
X_train = X_train.replace(to_replace=[np.inf, -np.inf], value=-999)
X_test = X_test.replace(to_replace=[np.inf, -np.inf], value=-999)

In [None]:
# Cast the categorical columns to strings in both splits.
# NOTE(review): on pandas versions where astype(str) turns NaN into the
# literal string 'nan', the constant 'NA' imputer used for these columns in
# the preprocessing step has nothing left to fill -- confirm this is intended.
X_train[CAT_FEATURES] = X_train[CAT_FEATURES].astype(dtype=str)
X_test[CAT_FEATURES] = X_test[CAT_FEATURES].astype(dtype=str)

In [None]:
# Every column of X_train that is not listed in CAT_FEATURES is treated as
# numeric; original column order is kept.
non_categorical = [name for name in X_train.columns if name not in CAT_FEATURES]

# Numeric branch: fill missing values with the column mean.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with 'NA', then ordinal-encode;
# categories not seen during fit are encoded as -1.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, non_categorical),
    ('cat', categorical_transformer, CAT_FEATURES),
])

# Fit on the training split only; the test split is transformed with the
# already-fitted preprocessor.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [None]:
# Only the transformed arrays are used from here on in the visible notebook;
# drop the names of the raw splits.
del X_train
del X_test

#### Model Selection

In [None]:
# Fit a seeded decision tree on the preprocessed training data and predict
# labels for the held-out split.
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train_transformed, y_train)
y_pred = decision_tree.predict(X_test_transformed)

In [None]:
# Estimator `score` on the training split and on the held-out split.
train_score = decision_tree.score(X_train_transformed, y_train)
test_score = decision_tree.score(X_test_transformed, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

In [None]:
# Confusion matrix of the decision tree's predictions on the held-out split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Print each metric for the held-out predictions, one per line.
metric_fns = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
)

for label, metric_fn in metric_fns:
    print(f'{label}: {metric_fn(y_test, y_pred)}')

In [None]:
# Plot the ROC curve of the fitted decision tree on the held-out split.
RocCurveDisplay.from_estimator(estimator=decision_tree, X=X_test_transformed, y=y_test)

#### Cross Validation

In [None]:
# Compare candidate classifiers with stratified k-fold cross-validation on
# the preprocessed training split, scoring each by ROC AUC.
np.random.seed(42)

results = {}

k_folds = 5
# Pin the splitter's seed: with shuffle=True and no random_state, every
# cross_val_score call reshuffles from the global RNG, so each model (and
# any later cell reusing `cv`) would be scored on different folds and the
# scores would not be directly comparable.
cv = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Seed every estimator explicitly so a model's score does not depend on how
# much of the global RNG stream was consumed before it ran.
models = {
    'LGBM': LGBMClassifier(random_state=42, verbose=-1),
    'RF': RandomForestClassifier(random_state=42),
    'BC': BaggingClassifier(random_state=42),
    'DT': DecisionTreeClassifier(random_state=42),
}

for name, model in models.items():
    # One ROC AUC value per fold.
    scores = cross_val_score(model, X_train_transformed, y_train, cv=cv, scoring='roc_auc')
    results[name] = scores

    print(f'{name}: {scores.mean()} ({scores.std()})')

In [None]:
# Display the per-fold ROC AUC arrays collected so far, keyed by model name.
results

In [None]:
# Box plot of the per-fold ROC AUC scores, one box per model in `results`.
fig, ax = plt.subplots()
fig.suptitle('ROC AUC')
ax.boxplot(results.values())
ax.set_xticklabels(results.keys())
plt.show()

#### Resampling Methods

In [None]:
# Random undersampling of the majority class followed by a random forest.
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)

# The sampler lives inside an imbalanced-learn pipeline so that it is applied
# only to the training part of each CV split. Resampling the whole training
# set before cross-validation would also rebalance the validation folds, and
# the resulting scores would not reflect the real class distribution.
under_rf = ImbalancePipeline(steps=[
    ('sampler', undersampler),
    ('model', RandomForestClassifier(random_state=42)),
])

scores = cross_val_score(under_rf, X_train_transformed, y_train, cv=cv, scoring='roc_auc')
print(f'UNDER_RF: {scores.mean()} ({scores.std()})')

results['UNDER_RF'] = scores

In [None]:
# Cross-validate RUSBoost on the preprocessed training split (ROC AUC per
# fold) and record the scores under the 'RUSB' key.
rus_boost = RUSBoostClassifier(random_state=42)

scores = cross_val_score(
    estimator=rus_boost,
    X=X_train_transformed,
    y=y_train,
    scoring='roc_auc',
    cv=cv,
)
results['RUSB'] = scores

print(f'RUSB: {scores.mean()} ({scores.std()})')

In [None]:
# SMOTE oversampling followed by LightGBM.
smote = SMOTE(random_state=42)

# SMOTE must run inside the CV loop: oversampling the whole training set
# first lets synthetic points interpolated from validation rows end up in the
# training folds (data leakage) and inflates the ROC AUC. Inside an
# imbalanced-learn pipeline the sampler is applied only when fitting on the
# training part of each split; validation folds keep their original rows.
smote_lgbm = ImbalancePipeline(steps=[
    ('sampler', smote),
    ('model', LGBMClassifier(random_state=42, verbose=-1)),
])

scores = cross_val_score(smote_lgbm, X_train_transformed, y_train, cv=cv, scoring='roc_auc')
print(f'SMOTE_LGBM: {scores.mean()} ({scores.std()})')

results['SMOTE_LGBM'] = scores

In [None]:
# Order the experiments by median fold score, best first.
ranked = sorted(results.items(), key=lambda item: np.median(item[1]), reverse=True)
sorted_results = dict(ranked)

# Box plot of the per-fold ROC AUC scores in that order, with tilted labels.
fig, ax = plt.subplots()
fig.suptitle('ROC AUC')
ax.boxplot(sorted_results.values())
ax.set_xticklabels(sorted_results.keys(), rotation=45)
plt.show()