In [None]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this part of the notebook — confirm the magic is still needed.
%matplotlib inline


# Metrics specific to imbalanced learning


Specific metrics have been developed to evaluate classifiers that
have been trained on imbalanced data. `imblearn` mainly provides
two additional metrics which are not implemented in `sklearn`: (i)
geometric mean and (ii) index balanced accuracy.

Se han desarrollado métricas específicas para evaluar clasificadores que
han sido entrenados con datos desequilibrados. `imblearn` proporciona principalmente
dos métricas adicionales que no están implementadas en `sklearn`: (i) la
media geométrica y (ii) la exactitud balanceada por índice (*index balanced accuracy*).



In [1]:
# Import libraries
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import (geometric_mean_score,
                              make_index_balanced_accuracy)

In [2]:
# Echo the module docstring (inside IPython this is the auto-generated blurb).
print(__doc__)

# Single seed reused for data generation, resampling, the classifier and the split.
RANDOM_STATE = 42

# Settings for the synthetic, imbalanced classification problem.
# NOTE(review): n_classes=3 is paired with only two weights. Per the
# make_classification documentation the missing last weight is inferred
# (here 1 - 0.1 - 0.9 = 0), and the recorded outputs only show labels 0 and 1.
# Confirm whether n_classes=2 was intended before changing it, since that
# would alter the generated data.
dataset_params = dict(
    n_samples=5000,
    n_features=20,
    n_informative=10,
    n_redundant=1,
    n_classes=3,
    n_clusters_per_class=4,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=RANDOM_STATE,
)
X, y = datasets.make_classification(**dataset_params)

# Show the generated feature matrix and label vector.
print("X: ", X)
print("y:", y)

Automatically created module for IPython interactive environment
X:  [[ 1.93014748 -1.66089547 -0.38084743 ..., -2.73212275 -0.59345283
  -1.82735931]
 [ 2.91928918 -4.90325242 -1.8219177  ..., -2.19117273  0.19930255
  -0.2654712 ]
 [ 0.51705259 -2.29323585 -1.30477851 ...,  4.08086622 -0.75239397
  -1.35175836]
 ..., 
 [ 2.68056563 -3.06298718  1.63033663 ...,  1.03479442 -0.40336959
  -0.29926872]
 [-0.6628685  -0.9162271  -1.2739332  ...,  1.63878977  1.78474841
   0.27198568]
 [-0.8644855   2.13395125 -0.52420666 ..., -1.72337071 -0.82731053
   0.36980908]]
y: [1 0 1 ..., 1 1 0]


In [None]:
# Percentage of samples whose label is positive (> 0).
#
# This cell used to read an `ATTRITION` column from `BBVA_data`, a DataFrame
# that is not defined anywhere in the visible notebook, so it could not run on
# a fresh kernel. The same ratio (count of positive labels / total count) is
# computed here from the label vector `y` returned by make_classification.
# The variable name is kept in case later cells refer to it.
porcentaje_artitrion = (y > 0).sum() * 1.0 / len(y) * 100.0
print("El porcentaje de la clase positiva (y > 0) del conjunto de datos es {0:.2f}%".format(porcentaje_artitrion))

In [13]:
# Mean label value; when the labels are only 0 and 1 this is the share of class 1.
y.mean()

0.90000000000000002

In [14]:
# Note: `os` here is `imblearn.over_sampling` (aliased in the import cell),
# not the standard-library `os` module.
smote_sampler = os.SMOTE(random_state=RANDOM_STATE)
svm_classifier = LinearSVC(random_state=RANDOM_STATE)

# Chain the over-sampler and the linear SVM into a single estimator.
pipeline = pl.make_pipeline(smote_sampler, svm_classifier)
pipeline

Pipeline(memory=None,
     steps=[('smote', SMOTE(k=None, k_neighbors=5, kind='regular', m=None, m_neighbors=10, n_jobs=1,
   out_step=0.5, random_state=42, ratio='auto', svm_estimator=None)), ('linearsvc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0))])

In [15]:
# Hold out a test set (train_test_split defaults, seeded for reproducibility).
data_splits = train_test_split(X, y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = data_splits

# Fit the balancing pipeline on the training portion only.
pipeline.fit(X_train, y_train)

# Predict labels for the held-out samples and display them.
y_pred_bal = pipeline.predict(X_test)
y_pred_bal

array([1, 1, 0, ..., 1, 1, 1])

In [16]:
# Mean predicted label; when predictions are only 0 and 1 this is the share predicted as class 1.
y_pred_bal.mean()

0.84240000000000004

The geometric mean corresponds to the square root of the product of the
sensitivity and specificity. Combining the two metrics should account for
the balancing of the dataset.
***
La media geométrica corresponde a la raíz cuadrada del producto de la
sensibilidad y la especificidad. La combinación de las dos métricas debe tener en cuenta
el equilibrio del conjunto de datos.



In [17]:
# Score the balanced pipeline's predictions against the held-out labels.
g_mean_value = geometric_mean_score(y_test, y_pred_bal)
print('The geometric mean is {}'.format(g_mean_value))

The geometric mean is 0.9262633940760341


The index balanced accuracy can transform any metric to be used in
imbalanced learning problems.



In [18]:
# Report the index balanced accuracy (IBA) of the geometric mean for two
# weighting factors. A loop replaces two copy-pasted stanzas that differed
# only in `alpha`; the printed lines are unchanged, and after the cell runs
# `alpha` and `geo_mean` still hold the values for the last factor (0.5).
#
# NOTE(review): the recorded output shows the same score for both alphas —
# confirm against the installed imblearn version whether that is expected.
for alpha in (0.1, 0.5):
    # make_index_balanced_accuracy(...) returns a callable that is applied to
    # geometric_mean_score to obtain the scorer used below.
    geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
        geometric_mean_score)

    print('The IBA using alpha = {} and the geometric mean: {}'.format(
        alpha, geo_mean(y_test, y_pred_bal)))

The IBA using alpha = 0.1 and the geometric mean: 0.8579638752052544
The IBA using alpha = 0.5 and the geometric mean: 0.8579638752052544
