In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
# Synthetic two-feature, two-class dataset with a heavy class imbalance;
# it is the input for the resampling cells that follow.
dataset_params = dict(
    n_samples=5000,
    n_classes=2,
    weights=[0.01, 0.99],      # requested proportions for class 0 / class 1
    n_features=2,
    n_informative=2,           # both features carry signal ...
    n_redundant=0,             # ... none are linear combinations ...
    n_repeated=0,              # ... and none are duplicates
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=123,          # fixed seed so the generated data is reproducible
)
X, y = make_classification(**dataset_params)

In [7]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# --- Random over-sampling -------------------------------------------------
# 'minority' resamples only the smallest class. The alternative 'not majority'
# resamples every class except the largest one.
# random_state is fixed so the randomly duplicated rows are the same on every run.
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

# Class counts before and after over-sampling
print(pd.Series(y).value_counts())
print(pd.Series(y_ros).value_counts())

# --- Random under-sampling ------------------------------------------------
# 'majority' resamples only the largest class. The alternative 'not minority'
# resamples every class except the smallest one.
# random_state is fixed so the randomly kept rows are the same on every run.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# Class counts before and after under-sampling
print(pd.Series(y).value_counts())
print(pd.Series(y_rus).value_counts())

# Backward compatibility: this cell used to leave the under-sampled data in
# X_res / y_res (the over-sampled result was overwritten). Keep that binding
# in case other cells read these names.
X_res, y_res = X_rus, y_rus

1    4919
0      81
Name: count, dtype: int64
1    4919
0    4919
Name: count, dtype: int64
1    4919
0      81
Name: count, dtype: int64
0    81
1    81
Name: count, dtype: int64


In [8]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# --- SMOTE: over-sample the minority class (seeded for reproducibility) ---
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Class counts before and after SMOTE
for labels in (y, y_smote):
    print(pd.Series(labels).value_counts())

# --- Tomek links: under-sample by dropping samples from the majority class ---
tomek = TomekLinks(sampling_strategy='majority')
X_tomek, y_tomek = tomek.fit_resample(X, y)

# Class counts before and after Tomek-link removal
for labels in (y, y_tomek):
    print(pd.Series(labels).value_counts())

1    4919
0      81
Name: count, dtype: int64
1    4919
0    4919
Name: count, dtype: int64
1    4919
0      81
Name: count, dtype: int64
1    4895
0      81
Name: count, dtype: int64
