# Final Model

In [1]:
import sys
from pathlib import Path

project_root = Path().resolve().parent
sys.path.append(str(project_root))

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn utilities
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# sklearn models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# imblearn
from imblearn.pipeline import Pipeline as ImbalancePipeline
from imblearn.under_sampling import RandomUnderSampler

# project
from src.preprocess import preprocess_data

In [3]:
train = pd.read_parquet('../data/train.parquet')

## Random Forest

In [4]:
X = train.drop(columns=['isFraud'])
y = train['isFraud']

X_1, X_2, y_1, y_2 = train_test_split(X, y, test_size=0.5, stratify=y, random_state=42)

print(X_1.shape)
print(y_1.shape)

del train

(295270, 252)
(295270,)


In [5]:
X_1 = X_1.replace([np.inf, -np.inf], -999)
X_2 = X_2.replace([np.inf, -np.inf], -999)

In [6]:
X_1_transformed, X_2_transformed = preprocess_data(X_1, X_2)

In [7]:
del X_1, X_2

In [8]:
clf = RandomForestClassifier(random_state=42)
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)

X_1_res, y_1_res = undersampler.fit_resample(X_1_transformed, y_1)

clf.fit(X_1_res, y_1_res)

y_pred = clf.predict(X_2_transformed)

In [9]:
confusion_matrix(y_2, y_pred)

array([[246658,  38281],
       [  1866,   8465]])

In [10]:
print(f'Accuracy: {accuracy_score(y_2, y_pred)}')
print(f'Precision: {precision_score(y_2, y_pred)}')
print(f'Recall: {recall_score(y_2, y_pred)}')
print(f'F1 Score: {f1_score(y_2, y_pred)}')

Accuracy: 0.8640329190232668
Precision: 0.18108501262140075
Recall: 0.8193785693543704
F1 Score: 0.29661685092068607


In [12]:
print(f'ROC AUC: {roc_auc_score(y_2, y_pred)}')

ROC AUC: 0.8425152579556764
