In [1]:
# Handling Imbalanced Datasets Example
# Import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Single seed shared by data generation, the split, the resamplers and every model,
# so a fresh "Restart & Run All" reproduces the same numbers.
RANDOM_STATE = 42


def fit_and_report(model, X_fit, y_fit, heading, X_eval, y_eval):
    """Fit ``model``, predict on the evaluation set and print a classification report.

    The four strategies compared in this cell differ only in the estimator and the
    training data they receive; the fit / predict / report steps are identical and
    live here instead of being repeated per strategy.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Unfitted estimator; it is fitted in place.
    X_fit, y_fit : training features and labels (original or resampled).
    heading : str
        Line printed immediately before the report.
    X_eval, y_eval : features and true labels the predictions are scored against.

    Returns
    -------
    The value returned by ``model.predict(X_eval)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    print(heading)
    print(classification_report(y_eval, predictions))
    return predictions


# Create an imbalanced dataset (weights=[0.9, 0.1] -> 900 vs 100 samples).
# NOTE(review): every variant below scores 1.00 in the recorded output, presumably
# because class_sep=2 makes the classes trivially separable -- a smaller class_sep
# would likely be needed to see the techniques differ. TODO confirm.
X, y = make_classification(n_classes=2, class_sep=2,
                           weights=[0.9, 0.1], n_informative=3,
                           n_redundant=1, flip_y=0, n_features=5,
                           n_clusters_per_class=1, n_samples=1000,
                           random_state=RANDOM_STATE)

# Check class distribution
print("Class Distribution before resampling:", pd.Series(y).value_counts())

# Split dataset (70% train / 30% test).
# NOTE(review): the split is not stratified, so the minority share drifts between the
# two parts (71 positives in train, 29 in test in the recorded run). Passing
# stratify=y would keep the 90/10 ratio in both, but it changes the split and
# therefore every number printed below -- re-run the cell if you adopt it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Baseline model without handling imbalance
baseline_model = RandomForestClassifier(random_state=RANDOM_STATE)
baseline_predictions = fit_and_report(
    baseline_model, X_train, y_train,
    "\nBaseline Model Performance:", X_test, y_test,
)

# 1. Oversampling using SMOTE
# Only the training part is resampled; the test set keeps its original distribution.
smote = SMOTE(random_state=RANDOM_STATE)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print("\nClass Distribution after SMOTE:", pd.Series(y_resampled).value_counts())

smote_model = RandomForestClassifier(random_state=RANDOM_STATE)
smote_predictions = fit_and_report(
    smote_model, X_resampled, y_resampled,
    "\nSMOTE Model Performance:", X_test, y_test,
)

# 2. Undersampling
# Again applied to the training part only.
undersampler = RandomUnderSampler(random_state=RANDOM_STATE)
X_resampled_under, y_resampled_under = undersampler.fit_resample(X_train, y_train)
print("\nClass Distribution after Undersampling:", pd.Series(y_resampled_under).value_counts())

under_model = RandomForestClassifier(random_state=RANDOM_STATE)
under_predictions = fit_and_report(
    under_model, X_resampled_under, y_resampled_under,
    "\nUndersampling Model Performance:", X_test, y_test,
)

# 3. Class Weight Adjustment
# No resampling: the estimator is asked to reweight classes via class_weight='balanced'.
weighted_model = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)
weighted_predictions = fit_and_report(
    weighted_model, X_train, y_train,
    "\nWeighted Model Performance:", X_test, y_test,
)


Class Distribution before resampling: 0    900
1    100
Name: count, dtype: int64

Baseline Model Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       271
           1       1.00      1.00      1.00        29

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


Class Distribution after SMOTE: 0    629
1    629
Name: count, dtype: int64

SMOTE Model Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       271
           1       1.00      1.00      1.00        29

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


Class Distribution after Undersampling: 0    71
1    71
Name: count, dtype: int64

Undersampling Model Performance:
              p