In [25]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Remote CSV holding the card-transaction records used throughout this notebook.
DATA_URL = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/card_transdata.csv"

# Download the file into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [5]:
# Class balance of the target column: the absolute tally first, then the same
# tally expressed as proportions (normalize=False -> counts, True -> fractions).
counts, fractions = (
    df['fraud'].value_counts(normalize=as_share) for as_share in (False, True)
)

print("Counts:\n", counts)
print("\nFractions:\n", fractions)

Counts:
 fraud
0.0    912597
1.0     87403
Name: count, dtype: int64

Fractions:
 fraud
0.0    0.912597
1.0    0.087403
Name: proportion, dtype: float64


In [10]:
# Target is the 'fraud' column; the feature matrix is every other column of df.
# NOTE(review): the preview of df shows a leading "Unnamed: 0" header. If that is a
# real column (a saved row index) rather than the rendered DataFrame index, it is
# kept as a feature here -- confirm with df.columns.
y = df['fraud']
X = df.drop('fraud', axis='columns')

In [11]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and the
# shuffle is pinned with random_state so re-running the cell gives the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Columns that get standardised; every other feature is passed through unchanged.
continuous = [
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
]

# The scaling statistics are learned from the training rows only and then
# applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train[continuous])

# Scale into copies so X_train / X_test keep their original values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[continuous] = scaler.transform(X_train[continuous])
X_test_scaled[continuous] = scaler.transform(X_test[continuous])

In [13]:
# First model: logistic regression fit on the scaled (not resampled) training set.
# NOTE(review): class_weight='balanced' is set here, so this model is already
# class-weighted; it is not an unweighted baseline for the resampling experiments.
log_reg = LogisticRegression(class_weight='balanced', max_iter=100)
log_reg.fit(X_train_scaled, y_train)

In [16]:
# Score the held-out rows with the first model.
# y_prob keeps column 1 of predict_proba; y_pred holds the predicted class labels.
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]
y_pred = log_reg.predict(X_test_scaled)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    182519
         1.0       0.57      0.95      0.71     17481

    accuracy                           0.93    200000
   macro avg       0.78      0.94      0.84    200000
weighted avg       0.96      0.93      0.94    200000



In [26]:
# Random over-sampling of the scaled training set; the test set is left as is.
# The two prints show the class tallies of the training labels before and after.
ros = RandomOverSampler(random_state=0)
X_train_res, y_train_res = ros.fit_resample(X=X_train_scaled, y=y_train)

print("Before:", y_train.value_counts().to_dict())
print("After:", y_train_res.value_counts().to_dict())

Before: {0.0: 730078, 1.0: 69922}
After: {0.0: 730078, 1.0: 730078}


In [27]:
# Logistic regression without class weighting, trained on the resampled data.
# X_train_res / y_train_res are whatever the most recently executed resampling
# cell produced, so run the RandomOverSampler cell immediately before this one.
log_reg_ros = LogisticRegression(max_iter=100).fit(X_train_res, y_train_res)

# Evaluate on the untouched (scaled, not resampled) test set.
y_prob_ros = log_reg_ros.predict_proba(X_test_scaled)[:, 1]
y_pred_ros = log_reg_ros.predict(X_test_scaled)

print(classification_report(y_true=y_test, y_pred=y_pred_ros))

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    182519
         1.0       0.57      0.95      0.71     17481

    accuracy                           0.93    200000
   macro avg       0.78      0.94      0.84    200000
weighted avg       0.96      0.93      0.94    200000



In [28]:
# Random under-sampling of the scaled training set; the test set is left as is.
# The sampler is bound to `rus` (not `ros`) so the RandomOverSampler created
# earlier is not silently replaced by an under-sampler under the same name.
rus = RandomUnderSampler(random_state=0)

# X_train_res / y_train_res are deliberately reused: the next cell trains on them.
X_train_res, y_train_res = rus.fit_resample(X_train_scaled, y_train)

# Class tallies of the training labels before and after resampling.
print("Before:", y_train.value_counts().to_dict())
print("After:", y_train_res.value_counts().to_dict())

Before: {0.0: 730078, 1.0: 69922}
After: {0.0: 69922, 1.0: 69922}


In [29]:
# Logistic regression without class weighting, trained on the resampled data.
# X_train_res / y_train_res are whatever the most recently executed resampling
# cell produced, so run the RandomUnderSampler cell immediately before this one.
log_reg_rus = LogisticRegression(max_iter=100).fit(X_train_res, y_train_res)

# Evaluate on the untouched (scaled, not resampled) test set.
y_prob_rus = log_reg_rus.predict_proba(X_test_scaled)[:, 1]
y_pred_rus = log_reg_rus.predict(X_test_scaled)

print(classification_report(y_true=y_test, y_pred=y_pred_rus))

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    182519
         1.0       0.57      0.95      0.71     17481

    accuracy                           0.93    200000
   macro avg       0.78      0.94      0.84    200000
weighted avg       0.96      0.93      0.94    200000



In [19]:
# Synthetic over-sampling of the scaled training set; the test set is left as is.
#
# Plain SMOTE builds synthetic rows by interpolating every column between
# neighbouring minority samples, which would give the 0/1 indicator features
# fractional values. SMOTENC interpolates only the continuous columns and treats
# the flagged columns as categorical, so the indicators keep valid values.
binary_cols = ['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order']

# Boolean mask aligned with the column order of X_train_scaled (True = categorical).
binary_mask = X_train_scaled.columns.isin(binary_cols)

sm = SMOTENC(categorical_features=binary_mask, random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

# Class tallies of the training labels before and after resampling.
print("Before:", y_train.value_counts().to_dict())
print("After:",  y_train_res.value_counts().to_dict())

Before: {0.0: 730078, 1.0: 69922}
After: {0.0: 730078, 1.0: 730078}


In [21]:
# Logistic regression without class weighting, trained on the resampled data.
# X_train_res / y_train_res are whatever the most recently executed resampling
# cell produced, so run the SMOTE cell immediately before this one.
log_reg_os = LogisticRegression(max_iter=100).fit(X_train_res, y_train_res)

# Evaluate on the untouched (scaled, not resampled) test set.
y_prob_os = log_reg_os.predict_proba(X_test_scaled)[:, 1]
y_pred_os = log_reg_os.predict(X_test_scaled)

print(classification_report(y_true=y_test, y_pred=y_pred_os))

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    182519
         1.0       0.57      0.95      0.71     17481

    accuracy                           0.93    200000
   macro avg       0.78      0.94      0.84    200000
weighted avg       0.96      0.93      0.94    200000



In [30]:
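### None of the resampling methods (random over-sampling, random under-sampling,
### SMOTE-based over-sampling) improved the results: at two-decimal precision every
### classification report above matches that of the first logistic regression.
### Caveat: the first model was fit with class_weight='balanced', so it already
### compensates for the class imbalance and is not an unweighted baseline. Comparing
### against a plain LogisticRegression() would show what resampling actually changes.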