@author Joubert Alexandrino de Souza
@version 2021-04-03

---------------------------------------------------
## Comparação de Ensembles
---------------------------------------------------

Em um Jupyter Notebook:

    Use uma base de dados para classificação com pelo menos 1000 amostras;
    Escolha pelo menos três algoritmos de classificação;
    Combine os classificadores de duas formas diferentes:
        Voting
        Stacking
    Use gridsearch (ou randomsearch) para ajustar tantos os classificadores fracos quanto o ensemble;
    Coloque o Jupyter Notebook no seu GitHub.


In [52]:
#Importa as bibliotecas necessárias
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
import warnings

!pip install xgboost



In [53]:
#Realiza a leitura dos dados
dados = pd.read_csv("https://raw.githubusercontent.com/joubert-alexandrino/reconhecimento-padroes/main/mushrooms.csv")
dados.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [54]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [55]:
dados.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [56]:
#Realiza a conversão dos dados rotulados para numericos
labelenconder = LabelEncoder()
for caracteristica in dados.columns:
    dados[caracteristica] = labelenconder.fit_transform(dados[caracteristica])

dados.head()


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [57]:
#Realiza a divisão de um conjunto de treinamento e teste 
X, y = dados.drop(['class'], axis=1, inplace=False), dados['class']
Xtreino, Xteste, ytreino, yteste = train_test_split(X, y, random_state=42, shuffle=True)
Xtreino.shape, Xteste.shape, ytreino.shape, yteste.shape

((6093, 22), (2031, 22), (6093,), (2031,))

## Voting

In [58]:
#Configura os estimadores e realiza seu treinamento com validação cruzada
estimadores =  VotingClassifier([
    ('KNN', KNeighborsClassifier()),    
    ('TREE', DecisionTreeClassifier()),
    ('LOGISTIC', LogisticRegression())
], n_jobs=-1)

pipe = Pipeline([
    ('ensemble', estimadores)
])

parametros = {
    
    'ensemble__KNN__n_neighbors':[3,5,7]
}

modelo = GridSearchCV(pipe, param_grid=parametros, return_train_score = True, n_jobs=-1)

scores = cross_validate(modelo, Xtreino, ytreino, return_estimator=True)
#Exibe os resultados do treinamento
scores, np.mean(scores['test_score'])

({'fit_time': array([2.02931404, 2.02196169, 1.99002099, 2.04108429, 2.03601933]),
  'score_time': array([0.0929544 , 0.08791137, 0.09773231, 0.08906364, 0.09117818]),
  'estimator': (GridSearchCV(estimator=Pipeline(steps=[('ensemble',
                                           VotingClassifier(estimators=[('KNN',
                                                                         KNeighborsClassifier()),
                                                                        ('TREE',
                                                                         DecisionTreeClassifier()),
                                                                        ('LOGISTIC',
                                                                         LogisticRegression())],
                                                            n_jobs=-1))]),
                n_jobs=-1, param_grid={'ensemble__KNN__n_neighbors': [3, 5, 7]},
                return_train_score=True),
   GridSearchCV(estimator

In [59]:

#Realiza a predição
melhor_classificador = scores['estimator'][np.argmax(scores['test_score'])].best_estimator_
ypred = melhor_classificador.predict(Xteste)
hits = yteste == ypred
hits, sum(hits)/len(hits)

(1971    True
 6654    True
 5606    True
 3332    True
 6988    True
         ... 
 1966    True
 5103    True
 51      True
 7724    True
 6211    True
 Name: class, Length: 2031, dtype: bool,
 0.999015263417036)

## Stacking

In [60]:
#Desabilita os warnings
warnings.filterwarnings('ignore')
#Configura os estimadores e realiza seu treinamento com validação cruzada
estimadores =  XGBClassifier(n_jobs=-1)

pipe = Pipeline([
    ('ensemble', estimadores)
])

parametros = {
    
    #'ensemble__KNN__n_neighbors':[3,5,7]
    'ensemble__use_label_encoder':[True, False]
}

modelo = GridSearchCV(pipe, param_grid=parametros, return_train_score = True, n_jobs=-1)

scores = cross_validate(modelo, Xtreino, ytreino, return_estimator=True)
#Exibe os resultados do treinamento
scores, np.mean(scores['test_score'])



({'fit_time': array([5.4812932 , 5.11215806, 5.01897573, 5.31384206, 5.17748666]),
  'score_time': array([0.00436187, 0.00463367, 0.00398946, 0.004529  , 0.00650048]),
  'estimator': (GridSearchCV(estimator=Pipeline(steps=[('ensemble',
                                           XGBClassifier(base_score=None,
                                                         booster=None,
                                                         colsample_bylevel=None,
                                                         colsample_bynode=None,
                                                         colsample_bytree=None,
                                                         gamma=None, gpu_id=None,
                                                         importance_type='gain',
                                                         interaction_constraints=None,
                                                         learning_rate=None,
                                                   

In [61]:
melhor_classificador = scores['estimator'][np.argmax(scores['test_score'])].best_estimator_
ypred = melhor_classificador.predict(Xteste)
hits = yteste == ypred
hits, sum(hits)/len(hits)

(1971    True
 6654    True
 5606    True
 3332    True
 6988    True
         ... 
 1966    True
 5103    True
 51      True
 7724    True
 6211    True
 Name: class, Length: 2031, dtype: bool,
 1.0)