In [1]:
!pip install imblearn



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('http://www.ppgia.pucpr.br/~jean.barddal/datascience/imbalanced.csv')

In [4]:
df.head(3)

Unnamed: 0,attrib1,attrib2,attrib3,class
0,1.595781,5.540723,9.109505,groupA
1,2.657896,1.166703,1.368305,groupA
2,0.901466,4.374227,3.207844,groupA


In [6]:
# analisar o desbalanceamento
df['class'].value_counts() / len(df)

groupA    0.902
groupB    0.098
Name: class, dtype: float64

In [8]:
# dividir a base em treinamento e teste
from sklearn.model_selection import train_test_split
X, y = df.drop('class', axis=1), df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [15]:
# Criar um modelo para verificar o comportamento antes de qualquer pre-processamento
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [10]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# aplicando o modelo nos dados de teste
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.9966666666666667

In [14]:
print(classification_report(y_test, y_pred))
# support: quantos exemplos/instancias temos de cada classe nos dados de teste
# precision: nos dá uma ideia de confiabilidade do modelo para aquela classe
#            de todos os casos que o modelo previu como sendo de uma classe, quantos de fato eram
# recall: nos dá uma ideia de "abrangencia" do modelo
#          de todos as instancias de uma classe, quantas o modelo conseguiu prever como sendo daquela classe

              precision    recall  f1-score   support

      groupA       1.00      1.00      1.00       268
      groupB       1.00      0.97      0.98        32

    accuracy                           1.00       300
   macro avg       1.00      0.98      0.99       300
weighted avg       1.00      1.00      1.00       300



In [16]:
confusion_matrix(y_test, y_pred)

array([[268,   0],
       [  1,  31]])

In [17]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss

In [18]:
samplers = {'RO': RandomOverSampler(),
            'SMOTE': SMOTE(),
            'RU': RandomUnderSampler(),
            'NM1': NearMiss(version=1),
            'NM2': NearMiss(version=2),
            'NM3': NearMiss(version=3)}

In [19]:
for nome, sampler in samplers.items():
  # aplicar a tecnica APENAS NOS DADOS DE TREINO!
  X_sampled, y_sampled = sampler.fit_resample(X_train, y_train)
  # verificar o percentual das classes
  print(nome)
  print(pd.Series(y_sampled).value_counts() / len(y_sampled))

  # treinar o modelo com os dados reamostrados
  lr.fit(X_sampled, y_sampled)
  y_pred = lr.predict(X_test) # DADOS DE TESTE INTACTOS!
  print(accuracy_score(y_test, y_pred))
  print(classification_report(y_test, y_pred))


RO
groupA    0.5
groupB    0.5
dtype: float64
0.97
              precision    recall  f1-score   support

      groupA       1.00      0.97      0.98       268
      groupB       0.78      1.00      0.88        32

    accuracy                           0.97       300
   macro avg       0.89      0.98      0.93       300
weighted avg       0.98      0.97      0.97       300

SMOTE
groupA    0.5
groupB    0.5
dtype: float64
1.0
              precision    recall  f1-score   support

      groupA       1.00      1.00      1.00       268
      groupB       1.00      1.00      1.00        32

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

RU
groupA    0.5
groupB    0.5
dtype: float64
0.9333333333333333
              precision    recall  f1-score   support

      groupA       1.00      0.93      0.96       268
      groupB       0.62      1.00      0.76        32

    accuracy 

