In [1]:
import imblearn

In [2]:
# Display the installed imbalanced-learn version so the environment used for this run is recorded.
imblearn.__version__

'0.9.0'

In [4]:
# Source tutorial: https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

## Random Oversampling Imbalanced Data Sets

In [3]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

# Build a synthetic binary dataset with a 99:1 class imbalance.
# flip_y=0 disables random label flipping, so the class counts follow `weights`.
# random_state pins the generated samples so a fresh "Restart & Run All"
# reproduces exactly the same data.
x, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0, random_state=1)

# Summarise the class distribution before any resampling is applied.
print(Counter(y))

Counter({0: 9900, 1: 100})


In [6]:
# Quick illustration of Counter: it tallies how many times each value occurs.
Counter(
    [1, 2, 3, 1, 4, 5, 5, 6, 6, 6, 6, 6, 6, 66, 6, 6, 6]
)

Counter({1: 2, 2: 1, 3: 1, 4: 1, 5: 2, 6: 9, 66: 1})

In [9]:
# Define the oversampling strategy: "minority" resamples only the minority
# class until it has as many samples as the majority class.
# random_state fixes which minority rows get duplicated, so the resampled
# dataset is identical on every run.
oversample = RandomOverSampler(sampling_strategy="minority", random_state=1)

# Fit the sampler and get a resampled copy of the data; x and y are not modified.
x_over, y_over = oversample.fit_resample(x, y)

# Class distribution after oversampling.
print(Counter(y_over))

Counter({0: 9900, 1: 9900})


## The minority class has been oversampled to match the majority class (9,900 samples each)

In [12]:
# Example of evaluating a decision tree with random oversampling
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

# Define the dataset.
# NOTE(review): this cell uses weights=[0.9] (a 90:10 split) while the other
# cells in this notebook use weights=[0.99] -- confirm this is intentional.
# random_state pins the generated samples so the score below is reproducible.
x, y = make_classification(n_samples=10000, weights=[0.9], flip_y=0, random_state=1)

# Define the pipeline: random oversampling followed by a decision tree.
# The imblearn Pipeline applies the sampler only while fitting, so each
# held-out fold is scored on its original (imbalanced) distribution.
# Both stochastic steps are seeded so repeated runs give the same result.
steps = [
    ('over', RandomOverSampler(random_state=1)),
    ('model', DecisionTreeClassifier(random_state=1)),
]
pipeline = Pipeline(steps=steps)

# Evaluate the pipeline with 10-fold stratified CV repeated 3 times (30 fits).
# NOTE(review): for a single-label problem micro-averaged F1 equals accuracy,
# so this score is dominated by the majority class; consider scoring="f1" if
# minority-class performance is what matters.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, x, y, scoring="f1_micro", cv=cv, n_jobs=-1)

# Report the mean score across all folds and repeats.
score = mean(scores)
print(score)

0.9625


## Random Undersampling Imbalanced Datasets

In [13]:
from imblearn.under_sampling import RandomUnderSampler

# Define the dataset: 10,000 samples with a 99:1 class imbalance and no label noise.
# random_state pins the generated samples so the score below is reproducible.
x, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0, random_state=1)

# Define the pipeline: random undersampling followed by a decision tree.
# The imblearn Pipeline applies the sampler only while fitting, so each
# held-out fold is scored on its original (imbalanced) distribution.
# Both stochastic steps are seeded so repeated runs give the same result.
steps = [
    ("under", RandomUnderSampler(random_state=1)),
    ('model', DecisionTreeClassifier(random_state=1)),
]
pipeline = Pipeline(steps=steps)

# Evaluate the model with 10-fold stratified CV repeated 3 times (30 fits).
# NOTE(review): for a single-label problem micro-averaged F1 equals accuracy,
# so this score is dominated by the majority class; consider scoring="f1" if
# minority-class performance is what matters.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, x, y, scoring='f1_micro', cv=cv, n_jobs=-1)

# Report the mean score across all folds and repeats.
score = mean(scores)
print("F1 Score : %.3f" % score)

F1 Score : 0.958


In [14]:
# Inspect the class balance of `y`. Resampling happens inside the pipeline
# during cross-validation, so `y` here is still the dataset as generated.
Counter(y)

Counter({0: 9900, 1: 100})

## Combining Over Sampling & Under Sampling

In [15]:
# Example of evaluating  a model with random oversampling and undersampling
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

## Define the dataset: 10,000 samples with a 99:1 class imbalance and no label noise.
## random_state pins the generated samples so the score below is reproducible.
x, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0, random_state=1)

## Define the pipeline.
## A float sampling_strategy is the desired minority/majority ratio after resampling:
##   - oversample the minority class up to 10% of the majority class, then
##   - undersample the majority class until the minority is 50% of its size.
## Both samplers and the tree are seeded so repeated runs give the same result.
over = RandomOverSampler(sampling_strategy=0.1, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)

steps = [
    ('o', over),
    ('u', under),
    ('m', DecisionTreeClassifier(random_state=1)),
]
pipeline = Pipeline(steps=steps)

# Evaluate the pipeline with 10-fold stratified CV repeated 3 times (30 fits).
# NOTE(review): for a single-label problem micro-averaged F1 equals accuracy,
# so this score is dominated by the majority class; consider scoring="f1" if
# minority-class performance is what matters.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, x, y, scoring='f1_micro', cv=cv, n_jobs=-1)

# Report the mean score across all folds and repeats.
score = mean(scores)

print("F1 Score : %.3f" % score)

F1 Score : 0.977
