In [21]:
import pandas as pd
import numpy as np

# Shared seed, passed as random_state to the shuffling steps later in the notebook.
seed = 42

In [2]:
# Read the raw transactions from the CSV file.
df = pd.read_csv("../data/creditcard.csv")

# Report the frame's dimensions, one per line.
print(f"Rows: {df.shape[0]}", f"Cols: {df.shape[1]}", sep="\n")

# Show a handful of random rows as a quick sanity check.
df.sample(n=5)

Rows: 284807
Cols: 31


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
228910,145715.0,-0.761321,1.942781,-0.049876,4.33192,0.984831,0.27556,1.061439,-1.044702,-2.339929,...,0.997077,0.708025,0.010032,0.09706,-0.27512,0.507611,0.117189,0.257298,36.37,0
66293,51976.0,-6.174063,4.035894,-0.665882,-0.839733,-1.083737,1.71543,-4.177927,-11.98248,1.075372,...,11.837265,-4.184667,2.399834,0.229601,0.245098,0.424564,0.801378,-0.186094,4.0,0
223197,143283.0,-0.919911,1.521083,-0.774145,-0.44993,0.815685,-0.780079,0.755754,0.235872,-0.038001,...,0.001561,0.371808,-0.367725,-0.68764,0.016328,0.624137,0.397003,0.220389,19.99,0
76515,56605.0,0.407557,-1.781199,0.539208,0.330971,-1.289673,0.703315,-0.421889,0.337033,1.120508,...,0.040663,-0.557176,-0.264683,-0.238067,-0.108295,0.938347,-0.102546,0.061343,391.56,0
54488,46472.0,1.25481,-0.750851,1.07349,-0.740894,-1.342331,-0.045361,-1.158098,0.179733,-0.741235,...,0.485317,1.352401,-0.074393,0.27962,0.307583,-0.052012,0.051451,0.016969,21.99,0


## Reducing the dataset

Timo mentioned that we do not have to work on the full dataset. Let's draw a sample from the normal cases while keeping all fraudulent cases.

In [None]:
# Split the transactions by label: Class == 1 is treated as fraud, Class == 0 as normal.
df_fraud = df.loc[df['Class'].eq(1)]
df_normal = df.loc[df['Class'].eq(0)]

### Under-sampling

In [6]:
# Under-sampling: draw as many normal transactions as there are fraud cases.
# The draw is seeded; without random_state a different subset is picked on every run.
df_normal_sub = df_normal.sample(n=df_fraud.shape[0], random_state=seed)

# Merge fraud cases and the normal cases subsample, then shuffle the rows
# (sample(frac=1) returns every row in random order).
df_under = pd.concat([df_fraud, df_normal_sub]).sample(frac=1, random_state=seed)

Number of transactions: 984


In [13]:
# Relative frequency of each class label in the under-sampled frame.
class_share = df_under['Class'].value_counts(normalize=True)

print("Distribution after Under-sampling:\n")
print(f"Number of transactions: {df_under.shape[0]}")
# Shares are shown as percentages rounded to two decimals.
print(f"Normal: {round(class_share[0] * 100, 2)} %")
print(f"Fraudulent: {round(class_share[1] * 100, 2)} %")

Distribution after Under-sampling:

Number of transactions: 984
Normal: 50.0 %
Fraudulent: 50.0 %


### Over-sampling

In [14]:
from imblearn.over_sampling import SMOTE # ref: https://imbalanced-learn.readthedocs.io/

Using TensorFlow backend.


In [17]:
# Number of normal transactions kept as input for the SMOTE step.
num_normal_transactions = 5000
# Seeded draw so the same rows are selected on every run.
df_normal_smote = df_normal.sample(n=num_normal_transactions, random_state=seed)

In [23]:
# Stack all fraud cases on top of the sampled normal cases ...
df_smote_temp = pd.concat([df_fraud, df_normal_smote])
# ... and shuffle the rows with the shared seed.
df_smote_temp = df_smote_temp.sample(frac=1, random_state=seed)

# Target is the label column; features are every other column.
y_smote = df_smote_temp['Class']
X_smote = df_smote_temp.drop(columns='Class')

In [28]:
# Over-sample the minority class with SMOTE.
# random_state pins the resampling so re-running the notebook yields the same data.
X_smote, y_smote = SMOTE(sampling_strategy='minority', random_state=seed).fit_resample(X_smote, y_smote)

In [27]:
# Count the occurrences of each class label once and reuse them below.
class_counts = np.bincount(y_smote)
n_labels = y_smote.shape[0]

print("Distribution after SMOTE (Over-sampling):\n")
print(f"Number of transactions: {X_smote.shape[0]}")
# Shares are shown as percentages rounded to two decimals.
print(f"Normal: {round(class_counts[0]/n_labels * 100, 2)} %")
print(f"Fraud: {round(class_counts[1]/n_labels * 100, 2)} %")

Distribution after SMOTE (Over-sampling):

Number of transactions: 10000
Normal: 50.0 %
Fraud: 50.0 %
