In [None]:
pip install imblearn

In [None]:
import pandas as pd
import numpy as np

# Over sampling techniques
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Undersampling techniques
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks

# Under- and Oversampling can be combined
from imblearn.combine import SMOTETomek, SMOTEENN


import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size (width, height) used by every plot in this notebook
plt.rcParams['figure.figsize'] = (12,6)

In [None]:
# Sizes of the two classes: label 1 is the majority class, label 0 the minority class
n_1 = 100 # number of samples with label 1
n_0 = 20 # number of samples with label 0

In [None]:
# Build the label list: n_1 ones followed by n_0 zeros
y = n_1 * [1] + n_0 * [0]

In [None]:
# np.random.uniform(size=k) returns an array of k random floats between 0 and 1;
# this cell only previews that output, the values are not stored
np.random.uniform(size=n_1)

In [None]:
# Create the feature values x1 and x2 for every row, in the same order as y:
# first the n_1 points of class 1, then the n_0 points of class 0.
# Class 1 is spread over 0..10 in both dimensions, class 0 is confined to 7..10,
# so the minority class sits in the upper-right corner of the plane.
np.random.seed(42)  # fixed seed so Restart & Run All reproduces the same dataset
x1 = list(np.random.uniform(size=n_1)*10) + list(np.random.uniform(low=.7, size=n_0)*10)
x2 = list(np.random.uniform(size=n_1)*10) + list(np.random.uniform(low=.7, size=n_0)*10)

In [None]:
# Assemble the two features and the label into one DataFrame and preview it
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))
df.head()

In [None]:
# Create a function that takes over plotting for us
def plot_data(df, a=1):
    '''
    Draw a scatter plot of x1 against x2, colored by the class label y.

    Parameters
    ----------
    df : DataFrame containing the columns 'x1', 'x2' and 'y'.
    a : opacity (alpha) of the markers, defaults to 1.

    Both axes are fixed to the range -1..11 so that successive plots are
    directly comparable. The figure is shown immediately; nothing is returned.
    '''
    ax = sns.scatterplot(data=df, x='x1', y='x2', hue='y', alpha=a)
    ax.set_title('Plot datapoints in 2 dimensions', fontsize=15)
    ax.set_xlabel('x1-values', fontsize=12)
    ax.set_ylabel('x2-values', fontsize=12)
    ax.axis(xmin=-1, xmax=11, ymin=-1, ymax=11)
    plt.show()

In [None]:
# Scatter plot of the original, imbalanced dataset
plot_data(df)

## Random oversampling

In [None]:
#!pip install imblearn

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Feature matrix: the two coordinate columns without the label
X = df.loc[:, ['x1', 'x2']]
X.head()

In [None]:
# Create an oversampler that raises class 0 to 50 samples.
# sampling_strategy = {class: number_of_datapoints_after_resampling}
# random_state is fixed so the same rows are duplicated on every run.
ros = RandomOverSampler(sampling_strategy={0: 50}, random_state=42)

In [None]:
# Fit the oversampler on X and y and get back the resampled features and labels
X_resample, y_resample = ros.fit_resample(X, y)

In [None]:
# Compare the shapes before and after resampling
# (class 0 is requested to grow from n_0 = 20 to 50 samples)
X.shape, X_resample.shape

In [None]:
# Create a resampled Dataset
def create_resampled_dataset(X_resample, y_resample):
    '''Combine resampled features and labels into a single DataFrame.

    Parameters
    ----------
    X_resample : DataFrame or 2-column array-like
        Resampled features. A DataFrame must contain the columns 'x1' and
        'x2'; any other input gets those two column names assigned.
    y_resample : list, array or Series
        Resampled labels, in the same row order as X_resample.

    Returns
    -------
    DataFrame with the columns 'x1', 'x2' and 'y' and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of feature rows.
    '''
    df_resample = pd.DataFrame(X_resample, columns=['x1', 'x2']).reset_index(drop=True)
    # Attach the labels by position. Joining on the index (pd.concat, axis=1)
    # would pad with NaN rows whenever X_resample carries a non-default index,
    # and building a frame from a Series named other than 'y' would yield NaNs.
    df_resample['y'] = np.ravel(y_resample)
    return df_resample

In [None]:
# Combine the randomly oversampled features and labels into one DataFrame and preview it
ros_resample = create_resampled_dataset(X_resample, y_resample)
ros_resample.head()

In [None]:
# Number of rows and columns of the oversampled DataFrame
ros_resample.shape

In [None]:
# Look at duplicates: count the rows that are exact copies of an earlier row
ros_resample.duplicated().sum()

In [None]:
# Scatter plot of the randomly oversampled dataset
plot_data(ros_resample)

In [None]:
# Plot the original dataset again for comparison with the oversampled plot
plot_data(df)

At first glance there seems to be no difference between the two plots. The reason is that RandomOverSampler only copies existing data points of the minority class, so the copies lie exactly on top of the originals and no new data points appear on visual inspection.

## SMOTE - Synthetic Minority Oversampling Technique

In essence, SMOTE creates new, synthetic data points that lie between two (or more) original data points of the minority class.

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample class 0 to 50 points with SMOTE.
# random_state is fixed so the synthetic points are the same on every run.
smote = SMOTE(sampling_strategy={0: 50}, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Combine the SMOTE output into one DataFrame.
# NOTE(review): this rebinds `smote` from the SMOTE sampler to a DataFrame, so the
# sampler is no longer reachable afterwards; a separate name (e.g. `smote_df`,
# like `rus_df` / `nm_df`) would avoid that — check other users of `smote` first.
smote = create_resampled_dataset(X_resampled, y_resampled)

In [None]:
# Scatter plot of the SMOTE-resampled dataset (`smote` is the DataFrame here)
plot_data(smote)

## Random Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Randomly drop class-1 points until only 50 remain.
# random_state is fixed so the same points are kept on every run.
# Note: this overwrites the X_resample / y_resample produced by the random
# oversampling step earlier in the notebook.
rus = RandomUnderSampler(sampling_strategy={1: 50}, random_state=42)
X_resample, y_resample = rus.fit_resample(X, y)

In [None]:
# Combine the undersampled features and labels into one DataFrame
rus_df = create_resampled_dataset(X_resample, y_resample)

In [None]:
# Scatter plot of the randomly undersampled dataset
plot_data(rus_df)

## Near Miss

In [None]:
from imblearn.under_sampling import NearMiss

In [None]:
# Undersample class 1 down to 50 points with NearMiss
nm = NearMiss(sampling_strategy={1: 50})
X_resampled_nm, y_resampled_nm = nm.fit_resample(X, y)
# Combine the result into one DataFrame and plot it
nm_df = create_resampled_dataset(X_resampled_nm, y_resampled_nm)
plot_data(nm_df)