In [25]:
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from mlxtend.plotting import plot_decision_regions
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE ,SMOTENC
from collections import Counter
from imblearn.keras import BalancedBatchGenerator
import tensorflow
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization

Vejamos como que um modelo de classificação fica prejudicado quando temos um conjunto de dados cujas classes possuem um problema de imbalance. Abaixo criamos o banco de dados. A seguir definimos o modelo de deep learning a ser aplicado e, por fim, rodamos o mesmo para os dados com imbalance.

In [18]:
X,y =make_classification(n_samples=5000, n_features=2, n_informative=2, n_classes=3, n_clusters_per_class=1,
                        n_redundant=0,n_repeated=0,weights=[0.01,0.05,0.94],class_sep=0.8,random_state=0)
print("Dados em cada uma das classes",sorted(Counter(y).items()))

Dados em cada uma das classes [(0, 64), (1, 262), (2, 4674)]


In [19]:
def make_model(n_features):
    model = Sequential()
    model.add(Dense(200, input_shape=(n_features,),
              kernel_initializer='glorot_normal'))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(100, kernel_initializer='glorot_normal'))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.25))
    model.add(Dense(50, kernel_initializer='glorot_normal'))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.15))
    model.add(Dense(25, kernel_initializer='glorot_normal'))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [20]:
import time
from functools import wraps


def timeit(f):
    @wraps(f)
    def wrapper(*args, **kwds):
        start_time = time.time()
        result = f(*args, **kwds)
        elapsed_time = time.time() - start_time
        print('Elapsed computation time: {:.3f} secs'
              .format(elapsed_time))
        return (elapsed_time, result)
    return wrapper

In [39]:
# Estimando o modelo de ANN sem correcao dos dados por imbalance
@timeit
def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
    model = make_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=50, verbose=0, batch_size=100)
    y_pred = model.predict_proba(X_test, batch_size=1000)
    print("Score:", balanced_accuracy_score(y_test,y_pred))
    return y_pred

In [42]:
# Corrigindo o imbalance data e depois estimando o modelo ANN
@timeit
def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    model = make_model(X_train.shape[1])
    training_generator = BalancedBatchGenerator(X_train, y_train,
                                                batch_size=100,
                                                random_state=42)
    model.fit_generator(generator=training_generator, epochs=50, verbose=0)
    y_pred = model.predict_proba(X_test)
    print("Score:", balanced_accuracy_score(y_test,y_pred))
    return y_pred

In [37]:
x_train, x_test, y_train, y_test = train_test_split(X,y,random_state=0)
print("Dados para teste: ",sorted(Counter(y_test).items()))

Dados para teste:  [(0, 12), (1, 59), (2, 1179)]


In [40]:
m1=fit_predict_imbalanced_model(x_train, y_train, x_test, y_test)

Score: 0.5833333333333334
Elapsed computation time: 10.456 secs
