# Final Model

In [9]:
import sys
from pathlib import Path

project_root = Path().resolve().parent
sys.path.append(str(project_root))

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn utilities
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

# sklearn models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# imblearn
from imblearn.pipeline import Pipeline as ImbalancePipeline
from imblearn.over_sampling import SMOTE

# lightgbm
from lightgbm import LGBMClassifier

# project
from src.preprocess import preprocess_data

In [11]:
train = pd.read_parquet('../data/train.parquet')
sample_train = train.groupby('isFraud', group_keys=False)[train.columns].apply(lambda x: x.sample(frac=0.1, random_state=42))

print(train.shape)
print(sample_train.shape)

(590540, 253)
(59054, 253)


## SMOTE + LightGBM

In [12]:
X = sample_train.drop(columns=['isFraud'])
y = sample_train['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

print(X_train.shape)
print(y_train.shape)

del train, sample_train

(41337, 252)
(41337,)


In [13]:
X_train = X_train.replace([np.inf, -np.inf], -999)
X_test = X_test.replace([np.inf, -np.inf], -999)

In [14]:
X_train_transformed, X_test_transformed = preprocess_data(X_train, X_test)

In [15]:
del X_train, X_test

In [16]:
clf = LGBMClassifier(random_state=42)
undersampler = SMOTE(sampling_strategy='minority', random_state=42)

X_train_res, y_train_res = undersampler.fit_resample(X_train_transformed, y_train)

clf.fit(X_train_res, y_train_res)

y_pred = clf.predict(X_test_transformed)

[LightGBM] [Info] Number of positive: 39891, number of negative: 39891
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072845 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64254
[LightGBM] [Info] Number of data points in the train set: 79782, number of used features: 252
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [17]:
confusion_matrix(y_test, y_pred)

array([[17053,    44],
       [  405,   215]])

In [18]:
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

Accuracy: 0.9746571089913643
Precision: 0.8301158301158301
Recall: 0.3467741935483871
F1 Score: 0.4891922639362912


In [19]:
print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')

ROC AUC: 0.6721003213165109


## GridSearch

In [22]:
k_folds = 10

param_grid = {
    'num_leaves': [30, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [-1, 10, 20],
    'min_data_in_leaf': [20, 50, 100],
    'n_estimators': [100, 200, 500],
}

clf = LGBMClassifier(verbose=-1, n_jobs=4, random_state=42)
cv = StratifiedKFold(n_splits=k_folds, shuffle=True)

In [23]:
grid_search = GridSearchCV(clf, param_grid, cv=cv, scoring='roc_auc', return_train_score=False)
grid_search.fit(X_train_res, y_train_res)

print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

KeyboardInterrupt: 