In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.utils import class_weight

# Imbalanced Data Preprocessing

Strategies for balancing highly imbalanced datasets:
* Oversample
   - Oversample the minority class to balance the dataset
   - Can create synthetic data based on the minority class
* Undersample
   - Remove majority class data (not preferred)
* Weight Classes
   - Use class weights to make minority class data more prominent

Let's start with the red wine quality dataset, which is highly imbalanced: very few wines have very high or very low quality ratings.

In [None]:
# Red-wine quality data hosted by the UCI repository; the file is
# semicolon-delimited, hence the explicit separator.
wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
df = pd.read_csv(wine_url, sep=';')

In [None]:
# Number of rows per quality rating (displayed as the cell output).
df['quality'].value_counts()

Set the features to use for our prediction

In [None]:
# Predictor columns used by every wine model in this notebook.
# To train on every column except the label instead, use
# df.drop(columns='quality').
feature_columns = ['volatile acidity', 'citric acid', 'sulphates', 'alcohol']
features = df[feature_columns]

Set the target value for our prediction

In [None]:
# The label to predict: the quality rating of each wine.
target = df.loc[:, 'quality']

Split the dataset into a training and test dataset.

In [None]:
# Hold out a test set using train_test_split's default proportions.
# stratify=target keeps every quality rating -- including the rare ones --
# represented in both partitions, and the fixed seed makes the split
# repeatable between runs.
xtrain, xtest, ytrain, ytrue = train_test_split(
    features,
    target,
    stratify=target,
    random_state=0,
)

Visualize the imbalanced nature of the training data set outcomes.

In [None]:
# Bar chart of how many *training* rows fall into each quality rating.
# (The training labels are plotted, not the held-out test labels, because
# the training split is what the resampling steps below rebalance.)
count = ytrain.value_counts()
count.plot.bar()
plt.ylabel('Number of records')
plt.xlabel('Target Class')
plt.show()

## Base Model - Imbalanced Data

Using a simple Decision Tree Classifier to demonstrate the changes in prediction quality based on using different techniques to deal with imbalanced data.

In [None]:
# Baseline: a default decision tree trained on the untouched training split.
model = DecisionTreeClassifier().fit(xtrain, ytrain)
y_pred = model.predict(xtest)

# Score against the held-out labels.  Precision, recall and F1 are
# macro-averaged, so each quality rating contributes equally to the score.
accuracy = metrics.accuracy_score(ytrue, y_pred)
precision = metrics.precision_score(ytrue, y_pred, average="macro")
recall = metrics.recall_score(ytrue, y_pred, average="macro")
f1 = metrics.f1_score(ytrue, y_pred, average="macro")

print(f'Accuracy Score: {accuracy}')
print(f'Precision Score: {precision}')
print(f'Recall Score: {recall}')
print(f'F1 Score: {f1}')

## Oversampling

Using the Imbalanced-Learn module, which is built on top of scikit-learn, there are a number of options for oversampling (and undersampling) your training data. The most basic is the `RandomOverSampler` class, whose `sampling_strategy` parameter accepts several string options:
* `'auto'` (the default; equivalent to `'not majority'`)
* `'minority'`
* `'not majority'`
* `'not minority'`
* `'all'`

There are also a host of other possibilities to create synthetic data (e.g., SMOTE)

https://imbalanced-learn.org/stable/over_sampling.html#

In [None]:
# Oversample the training split only; the test split is left untouched.
# 'auto' is the sampler's default strategy, spelled out here for clarity.
ros = RandomOverSampler(sampling_strategy='auto')
X_resampled, y_resampled = ros.fit_resample(xtrain, ytrain)

# Rows per quality rating after resampling (displayed as the cell output).
y_resampled.value_counts()

Let's look at the resampled data to confirm that we now have a balanced dataset.

In [None]:
# Bar chart of the resampled training labels, one bar per quality rating.
count = y_resampled.value_counts()
ax = count.plot.bar()
ax.set_ylabel('Number of records')
ax.set_xlabel('Target Class')
plt.show()

Now let's try our prediction with the oversampled data

In [None]:
# Same default decision tree, now trained on the oversampled training data.
model = DecisionTreeClassifier().fit(X_resampled, y_resampled)

# Evaluation still uses the original (non-resampled) test split.
y_pred = model.predict(xtest)

accuracy = metrics.accuracy_score(ytrue, y_pred)
precision = metrics.precision_score(ytrue, y_pred, average="macro")
recall = metrics.recall_score(ytrue, y_pred, average="macro")
f1 = metrics.f1_score(ytrue, y_pred, average="macro")

print(f'Accuracy Score: {accuracy}')
print(f'Precision Score: {precision}')
print(f'Recall Score: {recall}')
print(f'F1 Score: {f1}')

So from this, we were able to improve the accuracy, precision, and recall of our model!

## Weighting

Determining weights is a balance of different factors and is partially affected by the size of the imbalance. Scikit-learn has a function to help compute weights that balance the classes, called `compute_class_weight`, from the `class_weight` portion of the `sklearn.utils` module.

To get the balanced weights use:

`class_weight='balanced'`

and the model automatically assigns the class weights inversely proportional to their respective frequencies.

If the classes are too imbalanced, you might find better success by assigning weights to each class using a dictionary.

In [None]:
# Distinct quality ratings present in the training labels, in sorted order.
classes = np.unique(ytrain)

# One 'balanced' weight per rating, returned in the same order as `classes`.
cw = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=ytrain,
)

# Map each rating to its weight -- the dictionary form that the
# classifier's `class_weight` argument accepts.
weights = {label: weight for label, weight in zip(classes, cw)}
print(weights)

Now let's use our Decision Tree Model with the class weights calculated above.

In [None]:
# Decision tree trained on the original training split, using the per-class
# weights instead of resampled data.
model = DecisionTreeClassifier(class_weight=weights).fit(xtrain, ytrain)
y_pred = model.predict(xtest)

accuracy = metrics.accuracy_score(ytrue, y_pred)
precision = metrics.precision_score(ytrue, y_pred, average="macro")
recall = metrics.recall_score(ytrue, y_pred, average="macro")
f1 = metrics.f1_score(ytrue, y_pred, average="macro")

print(f'Accuracy Score: {accuracy}')
print(f'Precision Score: {precision}')
print(f'Recall Score: {recall}')
print(f'F1 Score: {f1}')

So the weighted model improved over our initial model, but not as much as the oversampled model in this case.

## Credit Card Fraud - Logistic Regression

In [None]:
# load the data set
data = pd.read_csv('creditcard.csv')

# normalise the amount column
data['normAmount'] = StandardScaler().fit_transform(np.array(data['Amount']).reshape(-1, 1))
 
# drop Time and Amount columns as they are not relevant for prediction purpose
data = data.drop(['Time', 'Amount'], axis = 1)

In [None]:
# Tally the labels once and reuse the result for the printout and the chart.
# As the printout shows, there are 492 fraud transactions.
class_counts = data['Class'].value_counts()
print(class_counts)

# value_counts() lists the most frequent class first, so the first bar is
# the majority class.
plt.figure(figsize=(8, 8))
plt.bar([0, 1], class_counts, tick_label=['Not Fraud', 'Fraud'])

# Write each class's count near the top of its bar; the y positions are
# fixed values chosen for this data set's scale.
plt.text(0, 286000, class_counts[0], ha='center', fontsize=16)
plt.text(1, 10000, class_counts[1], ha='center', fontsize=16)
plt.show()

In [None]:
# Separate the fraud label from the predictor columns.
y = data['Class']
X = data.drop(columns='Class')

# Split into a 70:30 train/test ratio with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Report the shape of each partition.
for description, partition in (
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
):
    print(description, partition.shape)

## Base Model - Imbalanced Data

In [None]:
# Baseline logistic regression fitted on the original, imbalanced
# training split.
lr = LogisticRegression()

# Pass the label Series directly: scikit-learn accepts any 1-D array-like,
# and Series.ravel() is deprecated in recent pandas releases.
lr.fit(X_train, y_train)

predictions = lr.predict(X_test)

# Per-class precision / recall / F1 on the held-out transactions.
print(metrics.classification_report(y_test, predictions))

So our prediction leaves a lot to be desired as we have a very low recall of the fraud cases.

Let's try our hand at creating some synthetic data for resampling the minority class using SMOTE (Synthetic Minority Oversampling Technique)

In [None]:
# Resample the training split with SMOTE, targeting only the minority
# class; the seed keeps the generated rows repeatable.  The test split is
# not resampled.
sm = SMOTE(
    sampling_strategy='minority',
    random_state=2,
)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
# Logistic regression trained on the SMOTE-resampled training data and
# evaluated on the original test split.
lr1 = LogisticRegression().fit(X_train_res, y_train_res)
predictions = lr1.predict(X_test)

# Per-class precision / recall / F1 for comparison with the baseline.
print(metrics.classification_report(y_test, predictions))

Our model's recall of fraud cases has improved greatly from our original model and our non-fraud recall has not suffered much at all.

We can also use a different threshold for predicting the fraud case. Instead of the standard >0.5 threshold, we could set 0.6 or 0.7 to improve the precision without harming the recall too much.

In [None]:
# Flag a transaction as fraud only when its predicted probability reaches
# the chosen threshold.
fraud_threshold = 0.7

# Column 1 of predict_proba is taken as the fraud probability
# (assumes class 1 is the second entry of lr1.classes_ -- TODO confirm).
fraud_probability = lr1.predict_proba(X_test)[:, 1]
predictions = (fraud_probability >= fraud_threshold).astype(int)

# Per-class precision / recall / F1 at the stricter threshold.
print(metrics.classification_report(y_test, predictions))