# Avaliação de desempenho de técnicas de data-sampling

Foi feita a avaliação das técnicas de oversampling e undersampling estudadas até o momento, assim como da combinação delas. Para a análise, foi utilizado um dataset sintético desbalanceado com duas classes e apenas 2 atributos.

Para avaliar o desempenho no dataset antes e após a aplicação das técnicas, foi utilizado um classificador de Árvore de Decisão, testado com validação cruzada estratificada k-fold repetida (Repeated Stratified k-Fold Cross Validation) e tendo a AUROC como métrica.

https://docs.google.com/spreadsheets/d/1fPq-6OO-bxIfi3RcUDgraS-V401GdcVEBhR5fK90Sjo/edit?usp=sharing

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
from matplotlib import pyplot as plt
from numpy import where
from numpy import mean
# imports for validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
# import of oversampling techniques
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN
# import of undersampling techniques
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

In [None]:
# Build the imbalanced dataset: 10,000 samples, 2 features (none redundant),
# one cluster per class, a 99% / 1% class split and no label noise (flip_y=0).
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99],
                           flip_y=0, random_state=1)
counter = Counter(y)
print(counter)

# Define the model. The tree is seeded because DecisionTreeClassifier permutes
# the features at every split and may pick a different one of several equally
# good splits on each fit; without a fixed random_state the AUROC of the very
# same evaluation drifts from run to run.
model = DecisionTreeClassifier(random_state=1)

# Evaluate the untouched dataset: 10 stratified folds repeated 3 times, scored
# by the area under the ROC curve and averaged over the 30 resulting fits.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
original_scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=1)
print(f'AUROC original DS: {mean(original_scores):.3f}')

Counter({0: 9900, 1: 100})
AUROC original DS: 0.771


In [None]:
# Seed shared by every stochastic sampler so the scores printed by this cell
# can be reproduced. NearMiss, TomekLinks, EditedNearestNeighbours and
# NeighbourhoodCleaningRule take no random_state, so they are left as they are.
SEED = 1

# Pipelines are grouped by family; the groups are printed with a blank line
# between them. `model` is the classifier defined in the dataset cell;
# cross_val_score clones every pipeline, so sharing the instance is safe.
pipeline_groups = [
    # Baseline and oversampling only.
    {
        'Original Data': Pipeline(steps=[('m', model)]),
        'SMOTE': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('m', model)]),
        'Borderline SMOTE': Pipeline(steps=[('o', BorderlineSMOTE(random_state=SEED)), ('m', model)]),
        'ADASYN': Pipeline(steps=[('o', ADASYN(random_state=SEED)), ('m', model)]),
    },
    # Undersampling only.
    {
        'NearMiss 1': Pipeline(steps=[('u', NearMiss(version=1)), ('m', model)]),
        'NearMiss 2': Pipeline(steps=[('u', NearMiss(version=2)), ('m', model)]),
        'NearMiss 3': Pipeline(steps=[('u', NearMiss(version=3)), ('m', model)]),
        'CNN': Pipeline(steps=[('u', CondensedNearestNeighbour(n_neighbors=1, random_state=SEED)), ('m', model)]),
        'TomekLinks': Pipeline(steps=[('u', TomekLinks()), ('m', model)]),
        'ENN': Pipeline(steps=[('u', EditedNearestNeighbours()), ('m', model)]),
        'OSS': Pipeline(steps=[('u', OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=SEED)), ('m', model)]),
        'NCR': Pipeline(steps=[('u', NeighbourhoodCleaningRule()), ('m', model)]),
    },
    # SMOTE oversampling followed by each undersampling technique.
    {
        'SMOTE + NearMiss 1': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('u', NearMiss(version=1)), ('m', model)]),
        'SMOTE + NearMiss 2': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('u', NearMiss(version=2)), ('m', model)]),
        'SMOTE + NearMiss 3': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('u', NearMiss(version=3)), ('m', model)]),
        'SMOTE + CNN': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('u', CondensedNearestNeighbour(n_neighbors=1, random_state=SEED)), ('m', model)]),
        'SMOTE + TomekLinks': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('u', TomekLinks()), ('m', model)]),
        'SMOTE + ENN': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('u', EditedNearestNeighbours()), ('m', model)]),
        'SMOTE + OSS': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('u', OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=SEED)), ('m', model)]),
        'SMOTE + NCR': Pipeline(steps=[('o', SMOTE(random_state=SEED)), ('u', NeighbourhoodCleaningRule()), ('m', model)]),
    },
]

# Flat name -> pipeline view; labels are clean, so lookups by technique name
# work without embedded newlines.
pipelines = {
    technique: pipeline
    for group in pipeline_groups
    for technique, pipeline in group.items()
}

# Define the evaluation procedure: 10 stratified folds repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate every pipeline and report its mean AUROC. The samplers sit inside
# the pipeline, so they are fitted on the training folds only.
for group_index, group in enumerate(pipeline_groups):
    if group_index:
        print()  # blank line separating one family of techniques from the next
    for technique, pipeline in group.items():
        scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
        print(f'{technique}: {mean(scores):.3f}')

Counter({0: 9900, 1: 100})
AUROC original DS: 0.768
Original Data: 0.769
SMOTE: 0.829
Boderline SMOTE: 0.817
ADASYN: 0.812

NearMiss 1: 0.553
NearMiss 2: 0.467
NearMiss 3: 0.719
CNN: 0.758
TomekLinks: 0.792
ENN: 0.820
OSS: 0.793
NCR: 0.825

SMOTE + NearMiss 1: 0.821
SMOTE + NearMiss 2: 0.829
SMOTE + NearMiss 3: 0.693
SMOTE + CNN: 0.803
SMOTE + TomekLinks: 0.812
SMOTE + ENN: 0.837
SMOTE + OSS: 0.811
SMOTE + NCR: 0.831


In [None]:
# Seed shared by every stochastic sampler so the scores printed by this cell
# can be reproduced. NearMiss, TomekLinks, EditedNearestNeighbours and
# NeighbourhoodCleaningRule take no random_state, so they are left as they are.
SEED = 1

# Factories rather than instances, so each pipeline owns a fresh undersampler.
undersampler_factories = {
    'NearMiss 1': lambda: NearMiss(version=1),
    'NearMiss 2': lambda: NearMiss(version=2),
    'NearMiss 3': lambda: NearMiss(version=3),
    'CNN': lambda: CondensedNearestNeighbour(n_neighbors=1, random_state=SEED),
    'TomekLinks': lambda: TomekLinks(),
    'ENN': lambda: EditedNearestNeighbours(),
    'OSS': lambda: OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=SEED),
    'NCR': lambda: NeighbourhoodCleaningRule(),
}

# Borderline SMOTE oversampling followed by each undersampling technique.
# `model` is the classifier defined in the dataset cell.
pipelines = {
    f'Borderline SMOTE + {label}': Pipeline(steps=[
        ('o', BorderlineSMOTE(random_state=SEED)),
        ('u', make_undersampler()),
        ('m', model),
    ])
    for label, make_undersampler in undersampler_factories.items()
}

# Define the evaluation procedure: 10 stratified folds repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Leading blank line kept as an explicit print instead of a newline hidden in
# the first dictionary key.
print()

# Evaluate every pipeline and report its mean AUROC.
for technique, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    print(f'{technique}: {mean(scores):.3f}')


Borderline SMOTE + NearMiss 1: 0.814
Borderline SMOTE + NearMiss 2: 0.815
Borderline SMOTE + NearMiss 3: 0.731
Borderline SMOTE + CNN: 0.792
Borderline SMOTE + TomekLinks: 0.811
Borderline SMOTE + ENN: 0.816
Borderline SMOTE + OSS: 0.804
Borderline SMOTE + NCR: 0.814


In [None]:
# Seed shared by every stochastic sampler so the scores printed by this cell
# can be reproduced. NearMiss, TomekLinks, EditedNearestNeighbours and
# NeighbourhoodCleaningRule take no random_state, so they are left as they are.
SEED = 1

# Factories rather than instances, so each pipeline owns a fresh undersampler.
undersampler_factories = {
    'NearMiss 1': lambda: NearMiss(version=1),
    'NearMiss 2': lambda: NearMiss(version=2),
    'NearMiss 3': lambda: NearMiss(version=3),
    'CNN': lambda: CondensedNearestNeighbour(n_neighbors=1, random_state=SEED),
    'TomekLinks': lambda: TomekLinks(),
    'ENN': lambda: EditedNearestNeighbours(),
    'OSS': lambda: OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=SEED),
    'NCR': lambda: NeighbourhoodCleaningRule(),
}

# ADASYN oversampling followed by each undersampling technique.
# `model` is the classifier defined in the dataset cell.
pipelines = {
    f'ADASYN + {label}': Pipeline(steps=[
        ('o', ADASYN(random_state=SEED)),
        ('u', make_undersampler()),
        ('m', model),
    ])
    for label, make_undersampler in undersampler_factories.items()
}

# Define the evaluation procedure: 10 stratified folds repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate every pipeline and report its mean AUROC.
for technique, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    print(f'{technique}: {mean(scores):.3f}')

ADASYN + NearMiss 1: 0.814
ADASYN + NearMiss 2: 0.809
ADASYN + NearMiss 3: 0.718
ADASYN + CNN: 0.793
ADASYN + TomekLinks: 0.817
ADASYN + ENN: 0.834
ADASYN + OSS: 0.805
ADASYN + NCR: 0.826
