In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, \
                            recall_score, confusion_matrix, f1_score


# Default size (width, height in inches) for every figure created below
plt.rcParams.update({'figure.figsize': (12, 6)})

# Class imbalance

- What is it?
- Why might we care about it?
- How can we deal with it?


## What is it?

In [None]:
# Load the credit card dataset; the first CSV column is used as the index
df = pd.read_csv(filepath_or_buffer='creditcardfraud.zip', index_col=0)
df.head()

In [None]:
# check for null values

In [None]:
# inspect the class sizes

In [None]:
# calculate the percentage of observations belonging to class 1

## Why is it a problem?

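- The classifier has an "incentive" to predict class 0: it shows up so much more often that always predicting it already gives a high accuracy, even though every observation of the rare class 1 is missed.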

## How can we deal with it?

In [None]:
# Create a function that prints scores and a confusion matrix for a specified model
def print_evaluations(ytrue, ypred, model):
    '''
    Prints evaluation metrics and plots the confusion matrix for a
    specified model.

    Parameters
    ----------
    ytrue : array-like
        True binary labels (0 = non-fraud, 1 = fraud).
    ypred : array-like
        Predicted binary labels, same length as ``ytrue``.
    model : str
        Name of the model; only used in the printed header.

    Returns
    -------
    None
        The scores and the raw confusion matrix are printed, and the matrix
        is drawn as an annotated heatmap on a new figure.
    '''
    print(f'How does model {model} score:')

    # zero_division=0 keeps the reported value (0) but avoids the
    # UndefinedMetricWarning when a model never predicts the positive class,
    # e.g. a majority-class baseline.
    scores = {
        'accuracy': accuracy_score(ytrue, ypred),
        'precision': precision_score(ytrue, ypred, zero_division=0),
        'recall': recall_score(ytrue, ypred, zero_division=0),
        'f1-score': f1_score(ytrue, ypred, zero_division=0),
    }
    for name, value in scores.items():
        print(f'The {name} of the model is: {round(value, 3)}')

    # Fix the label order so the matrix is always 2x2 and lines up with the
    # two tick labels below, even if one class is absent from the inputs.
    cm = confusion_matrix(ytrue, ypred, labels=[0, 1])
    print(cm)

    # Plot the confusion matrix on its own square figure
    _, ax = plt.subplots(figsize=(12, 12))
    sns.heatmap(cm, annot=True, fmt='.0f', ax=ax)
    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['non-fraud', 'fraud'])
    ax.yaxis.set_ticklabels(['non-fraud', 'fraud'])

#### Split the data

In [None]:
# Target: the `Class` column
y = df['Class']
# Features: every column except the last one (presumably `Class` is the
# last column -- verify against the dataset)
X = df.iloc[:, :-1]

In [None]:
# Stratify on the label so the rare positive class keeps the same share in
# the training and the test split; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=10
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Build a simple baseline model

In [None]:
ypred_bl = [0] * X.shape[0]

In [None]:
print_evaluations(y, ypred_bl, 'Baseline')

#### Compare it to a random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow random forest with a fixed seed
rf = RandomForestClassifier(
    n_estimators=20,
    max_depth=3,
    random_state=10,
)

In [None]:
# Fit the random forest on the original (not resampled) training split
rf.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split
ypred_rf = rf.predict(X_test)

In [None]:
# Print the scores and plot the confusion matrix for the test-set predictions
print_evaluations(y_test, ypred_rf, 'RandomForest')

### Use Undersampling

In [None]:
#!pip install imbalanced-learn

In [None]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss

In [None]:
# Instantiate both resamplers
# RandomUnderSampler samples the rows to keep at random, so it is seeded
# for reproducible results; NearMiss is used with its default settings.
rus = RandomUnderSampler(random_state=10)
nm = NearMiss()

In [None]:
# Resample the training split with each undersampler.
# Only the training data is resampled; the test split is left untouched.
X_rus, y_rus = rus.fit_resample(X_train, y_train)
X_nm, y_nm = nm.fit_resample(X_train, y_train)

In [None]:
# Shapes of the randomly undersampled features and labels
X_rus.shape, y_rus.shape

In [None]:
# Random undersampling: refit the same `rf` object on the undersampled
# training data, predict the test split and report the scores
ypred_rus = rf.fit(X_rus, y_rus).predict(X_test)
print_evaluations(ytrue=y_test, ypred=ypred_rus, model='RandomUndersampling')

In [None]:
# NearMiss: refit the same `rf` object on the NearMiss-resampled
# training data, predict the test split and report the scores
ypred_nm = rf.fit(X_nm, y_nm).predict(X_test)
print_evaluations(ytrue=y_test, ypred=ypred_nm, model='NearMiss')

### Use Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [None]:
# RandomOverSampler Model
# Seeded so that the randomly duplicated rows are reproducible
ros = RandomOverSampler(random_state=10)

In [None]:
# Resample the training split with the random oversampler
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [None]:
# Count the observations per class after oversampling
np.unique(y_ros, return_counts=True)

In [None]:
# Random oversampling: refit the same `rf` object on the oversampled
# training data, predict the test split and report the scores
ypred_ros = rf.fit(X_ros, y_ros).predict(X_test)
print_evaluations(ytrue=y_test, ypred=ypred_ros, model='RandomOversampling')

In [None]:
# Fit and run SMOTE
# sampling_strategy={1: 2000} requests 2000 samples of class 1 after
# resampling; the seed makes the synthetic samples reproducible.
sm = SMOTE(sampling_strategy={1: 2000}, random_state=10)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

# Refit the same `rf` object on the SMOTE-resampled training data,
# predict the test split and report the scores
rf.fit(X_sm, y_sm)
ypred_sm = rf.predict(X_test)
print_evaluations(y_test, ypred_sm, 'SMOTE')