In [1]:
# evaluate imbalanced classification model with different metrics
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
X,y = make_classification(n_samples=10000,n_features=2,n_redundant=0,n_clusters_per_class=1,weights=[.99],flip_y=0)

# split into train and test sets with same ratio

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,stratify=y)

In [4]:
print(X_train)
print(y_train)
print(X_test)
print(y_test)


[[ 0.41732934  0.55863665]
 [-1.4614058   1.35976584]
 [-0.58693624  0.9968201 ]
 ...
 [-1.59841118  1.17467489]
 [-1.41822929  1.05623893]
 [ 1.76410883 -2.46736992]]
[0 0 0 ... 0 0 1]
[[-0.71103517  0.78141823]
 [-0.88507578  1.09642749]
 [-0.8415209   0.80868124]
 ...
 [ 0.14057403  0.25558277]
 [-1.67388325  1.614675  ]
 [-1.17385146  0.98460554]]
[0 0 0 ... 0 0 0]


In [5]:
reg=LogisticRegression(solver='liblinear')


In [6]:
reg.fit(X_train,y_train)

LogisticRegression(solver='liblinear')

In [7]:
y_pred=reg.predict(X_test)
print(y_pred)

[0 0 0 ... 0 0 0]


# Evaluating predictions

In [8]:
print("Accuracy : ",accuracy_score(y_test,y_pred))

Accuracy :  0.9954


In [9]:
print("Precision : ",precision_score(y_test,y_pred))

Precision :  0.9655172413793104


In [10]:
print("Recall : ",recall_score(y_test,y_pred))

Recall :  0.56


In [11]:
print("F Measure : ",f1_score(y_test,y_pred))

F Measure :  0.7088607594936709


In [12]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import fbeta_score

In [13]:
print("ROC AUC score : ",roc_auc_score(y_test,y_pred))

ROC AUC score :  0.7798989898989899


In [14]:
print("FBeta score : ",fbeta_score(y_test,y_pred,beta=0.5))

FBeta score :  0.8433734939759036


# Performing under sampling 

A simple approach to using standard ML algorithms on an imbalance datasetis to change the training dataset to have more balanced class distribution

This can be achieved by deleting / selecting examples form majority class referred to as "Under sampling"

In [15]:
!pip install imbalanced-learn



In [16]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [17]:
X,y = make_classification(n_samples=10000,n_features=2,n_redundant=0,n_clusters_per_class=1,weights=[.99,0.01],flip_y=0)

In [18]:
print("Counter ",Counter(y))

Counter  Counter({0: 9900, 1: 100})


### define undersampled strategy

In [19]:
undersample = RandomUnderSampler(sampling_strategy=0.5)
print(undersample)

RandomUnderSampler(sampling_strategy=0.5)


# Fit and apply the transform

In [20]:
X_under,y_under = undersample.fit_resample(X,y)

In [21]:
print("Counter : ",Counter(y_under))

Counter :  Counter({0: 200, 1: 100})


# Over sampling the minority class using SMOTE (Synthetic Minority Oversampling Technique) algorithm

 the smote algorithm is popular approach for over samling the minority class, this technique can be used to reduce imbalanced or to make the class distribution even
    
 The eg below demonstrate using the smote class provided by the imbalanced learn library on a synthetic dataset. 

The initial class distribution is 1:100 and minority class is over sampled to 1:2 distribution

In [22]:
from imblearn.over_sampling import SMOTE

In [23]:
X,y = make_classification(n_samples=10000,n_features=2,n_redundant=0,n_clusters_per_class=1,weights=[.99,0.01],flip_y=0)

In [24]:
print("Counter ",Counter(y))

Counter  Counter({0: 9900, 1: 100})


In [25]:
oversample = SMOTE(sampling_strategy=0.5)
oversample

SMOTE(sampling_strategy=0.5)

Fit and apply the transform

In [26]:
X_over,y_over = oversample.fit_resample(X,y)


In [27]:
print("Counter ",Counter(y_over))

Counter  Counter({0: 9900, 1: 4950})


# Random Over sampling

Randomely duplicate examples in the minority class


Random over sampling involves randomly selecting examples from the minority class with replacement and adding them to training dataset.

# Random Under Sampling

Randomly delete examples in the minority class


Random under sampling involves randomly selecting examples from the minority class and deleting them from training dataset

In this the majority class instances are discarded at random until a more balanced distribution is reached

# Combine under sampling and over sampling

Data undersampling will delete examples from the majority class, whereas data oversampling will add examples to the majority class. These two approaches can be combined and used on a single training dataset.

Given that there are so many different data sampling techniques to choose from, it can be confusing as to which methods to combine. Thankfully, there are common combinations that have been shown to work well in practice; some examples include:


    Random Undersampling with SMOTE oversampling.
    Tomek Links Undersampling with SMOTE oversampling.
    Edited Nearest Neighbors Undersampling with SMOTE oversampling.


These combinations can be applied manually to a given training dataset by first applying one sampling algorithm, then another. Thankfully, the imbalanced-learn library provides implementations of common combined data sampling techniques.

The example below demonstrates how to use the SMOTEENN that combines both SMOTE oversampling of the minority class and Edited Nearest Neighbors undersampling of the majority class

In [28]:
from imblearn.combine import SMOTEENN

In [29]:
print("Counter ",Counter(y))

Counter  Counter({0: 9900, 1: 100})


In [30]:
sample = SMOTEENN(sampling_strategy=0.5)

In [31]:
X_over,y_over = sample.fit_resample(X,y)

In [32]:
print("Counter ",Counter(y_over))

Counter  Counter({0: 9492, 1: 4528})


# Cost sensitive algorithm for imbalanced dataset

Most machine learning algorithms assume that all misclassification errors made by a model are equal. This is often not the case for imbalanced classification problems, where missing a positive or minority class case is worse than incorrectly classifying an example from the negative or majority class.

Cost-sensitive learning is a subfield of machine learning that takes the costs of prediction errors (and potentially other costs) into account when training a machine learning model. Many machine learning algorithms can be updated to be cost-sensitive, where the model is penalized for misclassification errors from one class more than the other, such as the minority class.

The scikit-learn library provides this capability for a range of algorithms via the class_weight attribute specified when defining the model. A weighting can be specified that is inversely proportional to the class distribution.

If the class distribution was 0.99 to 0.01 for the majority and minority classes, then the class_weight argument could be defined as a dictionary that defines a penalty of 0.01 for errors made for the majority class and a penalty of 0.99 for errors made with the minority class, e.g. {0:0.01, 1:0.99}.

This is a useful heuristic and can be configured automatically by setting the class_weight argument to the string ‘balanced‘.

The example below demonstrates how to define and fit a cost-sensitive logistic regression model on an imbalanced classification dataset

In [33]:
X,y = make_classification(n_samples=10000,n_features=2,n_redundant=0,n_clusters_per_class=1,weights=[.99],flip_y=0)

In [34]:
reg2=LogisticRegression(solver='liblinear',class_weight='balanced')

In [35]:
reg2

LogisticRegression(class_weight='balanced', solver='liblinear')

In [36]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,stratify=y)

In [37]:
reg2.fit(X_train,y_train)

LogisticRegression(class_weight='balanced', solver='liblinear')

In [38]:
y_pred = reg2.predict(X_test)

In [39]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [40]:
print("F Measure : ",f1_score(y_test,y_pred))

F Measure :  0.6369426751592356
