In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Ask the inline backend to emit figures in the 'retina' format
# (higher-resolution output intended for high-DPI displays).
%config InlineBackend.figure_format = 'retina'

In [None]:
# Notebook-wide matplotlib styling: hide the top/right spines and set the
# font sizes used for axis labels, tick labels, and titles.
plt.rcParams.update(
    {
        "axes.spines.right": False,
        "axes.spines.top": False,
        "axes.labelsize": 16,
        "xtick.labelsize": 12,
        "ytick.labelsize": 12,
        "figure.titlesize": 18,
        "axes.titlesize": 18,
    }
)

In [None]:
# Single seed shared by every stochastic step in the notebook.
RANDOM_SEED = 666

# Seed NumPy's legacy global generator, which the np.random.* calls below use.
np.random.seed(seed=RANDOM_SEED)

## Train/Test Split for Imbalanced Data

Imagine we have a binary classification problem in which 99% of the data belongs to the negative class and only 1% belongs to the positive class.

In [None]:
# Toy label vector: N samples, all negative (0) except 10 randomly chosen
# positions that are flipped to positive (1), i.e. a 1% positive rate.
N = 1_000
positive_indices = np.random.choice(N, size=10, replace=False)
y = np.zeros(N)
y[positive_indices] = 1

In [None]:
# The mean of a 0/1 label vector is the fraction of positive samples.
print(np.mean(y))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Plain (unstratified) 80/20 split repeated with ten different seeds, printing
# the positive rate that ends up in each part.
print("Class Balance")
print("-------------")
for simulation in range(1, 11):
    # The simulation number doubles as the split's random_state.
    train, test = train_test_split(y, test_size=0.20, random_state=simulation)
    train_class_balance, test_class_balance = (
        part.mean() for part in (train, test)
    )
    print(
        f"(Simulation {simulation:2d}) "
        f"Train: {train_class_balance:.2%}"
        f", Test: {test_class_balance:.2%}"
    )

In [None]:
# Same ten 80/20 splits, but with stratify=y so train_test_split is asked to
# keep the class proportions of y in both the train and test parts.
print("Stratified Class Balance")
print("------------------------")
for simulation in range(1, 11):
    train, test = train_test_split(
        y,
        test_size=0.20,
        stratify=y,
        random_state=simulation,
    )
    train_class_balance = np.mean(train)
    test_class_balance = np.mean(test)
    print(
        f"(Simulation {simulation:2d}) "
        f"Train: {train_class_balance:.2%}"
        f", Test: {test_class_balance:.2%}"
    )

### Stratification Reduces Model Selection Error

In [None]:
from sklearn.datasets import make_classification

In [None]:
# Make a fake imbalanced classification dataset (class weights 99% / 1%).
#
# random_state is passed explicitly so this cell yields the same data every
# time it is run, instead of depending on how much of the global NumPy RNG
# the earlier cells have already consumed.
#
# NOTE: this rebinds `y`, replacing the toy label vector used by the
# class-balance cells above.
# NOTE(review): make_classification's default flip_y randomly reassigns a
# small fraction of labels, so the realized positive rate is presumably a bit
# above 1% -- check with y.mean() if the exact rate matters.
X, y = make_classification(
    n_samples=10_000,
    n_features=50,
    n_classes=2,
    weights=[0.99, 0.01],
    random_state=RANDOM_SEED,
)

In [None]:
# The data ends up in three parts: train, validation, and test.
# Step one: carve off a stratified 20% test set. The remaining 80%
# ("pretrain") is split into train/validation further down.
X_pretrain, X_test, y_pretrain, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_SEED,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

Let's run a simulation. For each simulation, we randomly split into training and validation sets and train a Logistic Regression model on the training data. 

Ideally, the performance on our validation set should match the performance on our test set. We'll keep track of the validation and test $F_{1}$ scores for each simulation and then calculate the difference between them. We'll do this for both stratified and non-stratified sampling.

In [None]:
N_sim = 200


def _f1_test_minus_val(stratify, n_sim=N_sim):
    """Return the per-simulation gap between test and validation F1.

    For each simulation ``sim`` in ``range(n_sim)`` the pretrain data is split
    80/20 into train/validation using ``random_state=sim``, a class-weighted
    logistic regression is fit on the training part, and F1 is measured on
    both the validation part and the fixed held-out test set.

    Parameters
    ----------
    stratify : bool
        If True, the train/validation split is stratified on ``y_pretrain``;
        otherwise a plain random split is used.
    n_sim : int
        Number of simulations (splits) to run.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_sim`` holding ``F1_test - F1_val`` per simulation.
    """
    val_f1s = []
    test_f1s = []
    for sim in range(n_sim):
        X_train, X_val, y_train, y_val = train_test_split(
            X_pretrain,
            y_pretrain,
            test_size=0.20,
            random_state=sim,
            # stratify=None is train_test_split's default (no stratification).
            stratify=y_pretrain if stratify else None,
        )
        model = LogisticRegression(random_state=RANDOM_SEED, class_weight="balanced")
        model.fit(X_train, y_train)
        val_f1s.append(f1_score(y_val, model.predict(X_val)))
        test_f1s.append(f1_score(y_test, model.predict(X_test)))
    return np.array(test_f1s) - np.array(val_f1s)


# Both runs use the same seeds, so the only difference is stratification.
no_stratify_diffs = _f1_test_minus_val(stratify=False)
stratify_diffs = _f1_test_minus_val(stratify=True)

In [None]:
# Histogram of (test F1 - validation F1) across simulations, with and without
# a stratified train/validation split.
fig, ax = plt.subplots(figsize=(10, 6))

# Shared bin edges spanning both samples so the two histograms are comparable.
min_val = np.min((no_stratify_diffs, stratify_diffs))
max_val = np.max((no_stratify_diffs, stratify_diffs))
bins = np.linspace(min_val, max_val, 25)

# Labels are attached to the datasets themselves so the legend cannot drift
# out of sync with the plotting order.
ax.hist(
    [no_stratify_diffs, stratify_diffs],
    bins=bins,
    label=["Without Stratification", "With Stratification"],
)

ax.legend()
ax.set_xlabel(r"$F1_{test} - F1_{val}$")
ax.set_ylabel("Number of Simulations")
ax.set_title(r"$F1_{test} - F1_{val}$ Histogram")
plt.show()

## Class Weight Shifts the Prediction Distribution

In [None]:
# One fixed, stratified 80/20 train/validation split of the pretrain data,
# reused by the class-weight experiments below.
X_train, X_val, y_train, y_val = train_test_split(
    X_pretrain, y_pretrain,
    stratify=y_pretrain,
    test_size=0.20,
    random_state=RANDOM_SEED,
)

In [None]:
# Class-weight settings to compare: unweighted, positives up-weighted 5x and
# 10x, and scikit-learn's "balanced" option.
class_weights = (None, {0: 1, 1: 5}, {0: 1, 1: 10}, "balanced")

# One panel per setting: histogram of predicted positive-class probabilities
# on the validation set.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
axs = axs.flatten()
for ax, class_weight in zip(axs, class_weights):
    model = LogisticRegression(random_state=RANDOM_SEED, class_weight=class_weight)
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    ax.hist(model.predict_proba(X_val)[:, 1], bins=51)
    # Log-scaled counts keep sparsely populated bins visible.
    ax.set_yscale("log")
    ax.set_title(f"Class Weight = {class_weight}")
    ax.set_xlabel("Prediction Probability")
    ax.set_ylabel("Count")

fig.tight_layout()

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
# One panel per class-weight setting: precision, recall, and F1 on the
# validation set as a function of the decision threshold.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
axs = axs.flatten()
for ax, class_weight in zip(axs, class_weights):
    model = LogisticRegression(random_state=RANDOM_SEED, class_weight=class_weight)
    model.fit(X_train, y_train)
    precision, recall, thresholds = precision_recall_curve(
        y_val, model.predict_proba(X_val)[:, 1]
    )
    # precision/recall carry one more entry than thresholds; drop the final
    # point so all three arrays line up.
    precision_at = precision[:-1]
    recall_at = recall[:-1]

    # F1 = 2PR / (P + R). When P + R == 0 (no true positives at a threshold)
    # the ratio is 0/0; define F1 as 0 there instead of emitting NaN and a
    # RuntimeWarning.
    pr_sum = precision_at + recall_at
    f1 = np.divide(
        2 * precision_at * recall_at,
        pr_sum,
        out=np.zeros_like(pr_sum),
        where=pr_sum > 0,
    )

    ax.plot(thresholds, precision_at, label="Precision")
    ax.plot(thresholds, recall_at, label="Recall")
    ax.plot(thresholds, f1, label=r"$F_{1}$")

    ax.legend()
    ax.set_xlabel("Threshold")
    ax.set_ylabel("Score")
    ax.set_title(f"Class Weight = {class_weight}")

fig.tight_layout()