# SMOTE Tutorial

In [95]:
import numpy as np 
import pandas as pd
import tensorflow as tf 
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

## Data Prep

In [96]:
# Resolve the dataset location relative to the current working directory:
# <cwd>/data/creditcard.csv
wd = os.getcwd()
datafolder = os.path.join(wd, "data")
datapath = os.path.join(wd, "data", "creditcard.csv")

# Read the whole CSV into a single DataFrame used by the cells below.
df = pd.read_csv(datapath)

In [97]:
# Normalize the Amount column (zero mean, unit variance) into a new column.
# StandardScaler expects a 2-D input, hence the reshape to a single column.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test rows influence the scaling statistics — consider
# fitting on the training split only.
df["normAmount"] = StandardScaler().fit_transform(
    df["Amount"].to_numpy().reshape(-1, 1)
)

# Drop the raw Amount (now represented by normAmount) and the Time column,
# which this tutorial does not use as a feature.
df = df.drop(columns=["Time", "Amount"])

# Show how many rows belong to each class.
df["Class"].value_counts()


Class
0    284315
1       492
Name: count, dtype: int64

In [98]:
# Split into a 70:30 train/test ratio: "Class" is the target, every other
# column is a feature. random_state is fixed so the split is reproducible.
# NOTE(review): the classes are heavily imbalanced; consider stratify=y so
# both splits keep the same class ratio.
y = df["Class"]
X = df.drop(columns=["Class"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Describe the shape of each piece of the train and test sets.
for caption, part in (
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
):
    print(caption, part.shape)

Number transactions X_train dataset:  (199364, 29)
Number transactions y_train dataset:  (199364,)
Number transactions X_test dataset:  (85443, 29)
Number transactions y_test dataset:  (85443,)


## Logistic Regression First Run

In [99]:
# Baseline: logistic regression with default settings, trained on the
# original (un-resampled) training split.
lr = LogisticRegression()

# Train the model on the train set. The labels are passed as a 1-D NumPy
# array via to_numpy(); Series.ravel() is deprecated in recent pandas and
# emits a FutureWarning.
lr.fit(X_train, y_train.to_numpy())

# Predict on the held-out test set.
predictions = lr.predict(X_test)

# Print per-class precision/recall/F1, followed by the confusion matrix
# (rows are true classes, columns are predicted classes).
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

  lr.fit(X_train, y_train.ravel())


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.88      0.63      0.73       147

    accuracy                           1.00     85443
   macro avg       0.94      0.81      0.87     85443
weighted avg       1.00      1.00      1.00     85443

[[85284    12]
 [   55    92]]


## Using SMOTE Algorithm

In [100]:
# Count the training labels per class before any resampling; summing the
# boolean mask gives the number of matching rows.
n_pos_before = (y_train == 1).sum()
n_neg_before = (y_train == 0).sum()
print("Before OverSampling, counts of label '1': {}".format(n_pos_before))
print("Before OverSampling, counts of label '0': {} \n".format(n_neg_before))

Before OverSampling, counts of label '1': 345
Before OverSampling, counts of label '0': 199019 



In [None]:
# Oversample the training split with SMOTE (default settings apart from a
# fixed random_state for reproducibility). Only the training data is
# resampled; the test split is left untouched.
sm = SMOTE(random_state = 2)

# Labels are passed as a 1-D NumPy array via to_numpy(); Series.ravel() is
# deprecated in recent pandas and emits a FutureWarning. Passing an array
# also keeps y_train_res an ndarray, which the cells below rely on.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Vectorised counts: summing the boolean mask avoids iterating over every
# label in Python.
print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

  X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())


After OverSampling, the shape of train_X: (398038, 29)
After OverSampling, the shape of train_y: (398038,) 

After OverSampling, counts of label '1': 199019
After OverSampling, counts of label '0': 199019


### Logistic Regression

In [102]:
# Fit a second default logistic regression, this time on the resampled
# training data; fit() returns the estimator, so it can be chained.
lr1 = LogisticRegression().fit(X_train_res, np.ravel(y_train_res))

# Evaluate on the original, un-resampled test set.
predictions = lr1.predict(X_test)

# Print the per-class classification report.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85296
           1       0.06      0.92      0.11       147

    accuracy                           0.98     85443
   macro avg       0.53      0.95      0.55     85443
weighted avg       1.00      0.98      0.99     85443



In [103]:
# Confusion matrix for the most recent `predictions`
# (rows are true classes, columns are predicted classes).
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[83188,  2108],
       [   12,   135]])

### Random Forest Classifier

In [104]:
# Train a random forest on the resampled training data; random_state is
# fixed so the fitted forest is reproducible.
rf = RandomForestClassifier(random_state=42).fit(X_train_res, np.ravel(y_train_res))

# Evaluate on the original, un-resampled test set.
preds = rf.predict(X_test)

# Print the per-class classification report.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.92      0.82      0.86       147

    accuracy                           1.00     85443
   macro avg       0.96      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [105]:
# Confusion matrix for the random forest predictions
# (rows are true classes, columns are predicted classes).
confusion_matrix(y_true=y_test, y_pred=preds)

array([[85285,    11],
       [   27,   120]])