### Oversampling & Undersampling using sklearn package

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('HR_comma_sep.csv')

In [3]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
data.left.count()

14999

In [5]:
data.left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [6]:
data.left.value_counts()/data.left.count()

0    0.761917
1    0.238083
Name: left, dtype: float64

In [7]:
from sklearn.utils import resample

In [8]:
# Split the rows on the attrition flag: left == 0 is the larger (majority)
# group, left == 1 the smaller (minority) group.
df_majority = data.loc[data['left'] == 0]
df_minority = data.loc[data['left'] == 1]

In [11]:
df_majority.left.value_counts()

0    11428
Name: left, dtype: int64

In [12]:
df_minority.left.value_counts()

1    3571
Name: left, dtype: int64

In [13]:
# Upsample the minority class by drawing rows with replacement until it is as
# large as the majority class. The target size is read from df_majority rather
# than hard-coded, so the cell stays correct if the input data changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match the majority class size
                                 random_state=123)            # reproducible results

In [14]:
df_minority.left.count()

3571

In [15]:
df_minority_upsampled.left.count()

11428

In [19]:
df_minority_upsampled.duplicated().value_counts()

True     9479
False    1949
dtype: int64

In [20]:
# Stack the untouched majority rows on top of the upsampled minority rows
df_upsampled = pd.concat(objs=[df_majority, df_minority_upsampled], axis=0)

# Show the class counts after balancing
df_upsampled['left'].value_counts()

1    11428
0    11428
Name: left, dtype: int64

In [21]:
df_minority.left.count()

3571

In [22]:
df_majority.left.count()

11428

In [23]:
# Undersample
# Downsample the majority class by drawing rows without replacement until it
# is as small as the minority class. The target size is read from df_minority
# rather than hard-coded.
# NOTE: the variable is still called df_majority_upsampled because the
# following cells refer to it by that name; it holds a *downsampled* frame.
df_majority_upsampled = resample(df_majority,
                                 replace=False,               # sample without replacement
                                 n_samples=len(df_minority),  # match the minority class size
                                 random_state=123)            # reproducible results

In [24]:
df_majority_upsampled.left.value_counts()

0    3571
Name: left, dtype: int64

In [26]:
df_majority_upsampled.duplicated().value_counts()

False    3441
True      130
dtype: int64

In [27]:
# Combine the minority class with the downsampled majority class.
# NOTE: this rebinds df_upsampled, so the name now refers to the undersampled
# (smaller, balanced) frame built from df_minority and df_majority_upsampled.
df_upsampled = pd.concat([df_minority, df_majority_upsampled])
 
# Display new class counts
df_upsampled.left.value_counts()

1    3571
0    3571
Name: left, dtype: int64

### Oversampling using imblearn package

In [28]:
import imblearn.over_sampling as over_sampling

In [29]:
# Target: the attrition flag, kept as a one-column DataFrame.
y = data.loc[:, ['left']]

# Predictors: the listed columns only (department and salary are not included).
X = data.loc[:, [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
]]

In [30]:
y.sum()

left    3571
dtype: int64

In [31]:
from imblearn.over_sampling import SMOTE

In [32]:
sm = SMOTE(random_state=42) # Initialize the SMOTE model with a fixed seed

In [33]:
# Fit SMOTE on the features and target and generate the balanced sample.
# - fit_resample is the current API; fit_sample is deprecated and is no longer
#   provided by newer imbalanced-learn releases.
# - y is a one-column DataFrame, which triggers a column_or_1d conversion
#   warning; flatten it to a 1-D array so the target has the expected shape
#   and y_res comes back as a plain 1-D array.
X_res, y_res = sm.fit_resample(X, y.values.ravel())

  y = column_or_1d(y, warn=True)


In [34]:
y_new = pd.Series(y_res)

In [37]:
y_new.value_counts()

1    11428
0    11428
dtype: int64

### Text Mining - News articles classification

In [None]:
"""
=================================================
Example of topic classification in text documents
=================================================

This example shows how to balance the text data before to train a classifier.

Note that for this example, the data are slightly imbalanced but it can happen
that for some data sets, the imbalanced ratio is more significant.

"""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

###############################################################################
# Setting the data set
###############################################################################

###############################################################################
# We use a part of the 20 newsgroups data set by loading 4 topics. Using the
# scikit-learn loader, the data are split into a training and a testing set.
#
# Note the class \#3 is the minority class and has almost half as many samples
# as the majority class.

# Restrict the corpus to four newsgroup topics.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Fetch the loader's train and test subsets for those topics.
newsgroups_train = fetch_20newsgroups(categories=categories, subset='train')
newsgroups_test = fetch_20newsgroups(categories=categories, subset='test')

# `.data` holds the documents and `.target` the class labels of each subset.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Report how many samples each class has in both subsets.
print('Training class distributions summary: {}'.format(Counter(y_train)))
print('Test class distributions summary: {}'.format(Counter(y_test)))

###############################################################################
# The usual scikit-learn pipeline
###############################################################################

###############################################################################
# You might usually use scikit-learn pipeline by combining the TF-IDF
# vectorizer to feed a multinomial naive bayes classifier. A classification
# report summarized the results on the testing set.
#
# As expected, the recall of the class \#3 is low mainly due to the class
# imbalance.

# Baseline: TF-IDF features fed straight into multinomial naive Bayes,
# trained on the training set as-is (no resampling).
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Per-class report computed on the test set.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

###############################################################################
# Balancing the class before classification
###############################################################################

###############################################################################
# To improve the prediction of the class \#3, it could be interesting to apply
# a balancing step before training the naive bayes classifier. Therefore, we will
# use a ``RandomUnderSampler`` to equalize the number of samples in all the
# classes before the training.
#
# It is also important to note that we are using the ``make_pipeline`` function
# implemented in imbalanced-learn to properly handle the samplers.

# Same TF-IDF + naive Bayes model, with a random under-sampling step inserted
# between vectorization and classification. The sampler is seeded so that the
# rows it discards, and therefore the report printed below, are reproducible
# from run to run.
pipe = make_pipeline_imb(TfidfVectorizer(),
                         RandomUnderSampler(random_state=42),
                         MultinomialNB())

# Fit on the training documents, then predict the test documents.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

###############################################################################
# Although the results are almost identical, it can be seen that the resampling
# corrected the poor recall of the class \#3 at the cost of reducing
# the other metrics for the other classes. However, the overall results are
# slightly better.

print(classification_report_imbalanced(y_test, y_pred))
