In [1]:
import warnings

import numpy as np
import pandas as pd
from mlxtend.classifier import StackingClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [2]:
# Load the raw dataset from the CSV file (path is relative to the working
# directory the notebook is run from).
df = pd.read_csv(filepath_or_buffer="ensemble_data.csv")

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Columns to one-hot encode. 'type' is deliberately not listed here, so
# get_dummies leaves that column as-is in the result.
cols_to_transform = [
    'cap_shape',
    'cap_surface',
    'cap_color',
    'bruises',
    'odor',
    'gill_attachment',
    'gill_spacing',
    'gill_size',
    'gill_color',
    'stalk_shape',
    'stalk_root',
    'stalk_surface_above_ring',
    'stalk_surface_below_ring',
    'stalk_color_above_ring',
    'stalk_color_below_ring',
    'veil_type',
    'veil_color',
    'ring_number',
    'ring_type',
    'spore_print_color',
    'population',
    'habitat',
]

# Replace every listed column with one indicator column per distinct value
# (named "<column>_<value>").
df_with_dummies = pd.get_dummies(data=df, columns=cols_to_transform)
df_with_dummies.head()

Unnamed: 0,type,cap_shape_b,cap_shape_c,cap_shape_f,cap_shape_k,cap_shape_s,cap_shape_x,cap_surface_f,cap_surface_g,cap_surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,p,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,e,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,p,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [4]:
# Feature matrix: the encoded frame without the 'type' column.
df_new = df_with_dummies.drop(labels=['type'], axis='columns')
df_new

Unnamed: 0,cap_shape_b,cap_shape_c,cap_shape_f,cap_shape_k,cap_shape_s,cap_shape_x,cap_surface_f,cap_surface_g,cap_surface_s,cap_surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
6,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
7,1,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
9,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0


In [5]:
# Build the binary target from the 'type' column.
#
# The indicator columns are kept in their own variable (`type_dummies`) rather
# than being written back into `df_with_dummies`: overwriting it would remove
# the 'type' column that this cell and the feature-matrix cell both read,
# so re-running either one would raise KeyError. `cols_to_transform` is left
# untouched for the same reason (and `columns=` has no effect when
# get_dummies is given a Series).
type_dummies = pd.get_dummies(df_with_dummies['type'])

# Dropping the 'e' indicator leaves a one-column frame whose 'p' column is
# 1 where type == 'p' and 0 otherwise.
# NOTE(review): presumably 'p' = poisonous and 'e' = edible -- confirm against
# the dataset description.
df_label = type_dummies.drop(['e'], axis=1)
df_label

Unnamed: 0,p
0,1
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,1
9,0


In [None]:
# Base learners that feed the stacking ensemble.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                     n_estimators=100, max_depth=3)

# Meta-learner: a logistic regression trained on the base learners'
# predicted class probabilities (use_probas=True). With average_probas=False
# the per-model probabilities are passed through as separate inputs instead
# of being averaged.
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

# scikit-learn expects a 1-D target. np.ravel accepts both a one-column
# DataFrame and a Series, so this works however df_label was built.
y = np.ravel(df_label)

# All metrics are computed from the same five fits per model, instead of
# re-running the whole cross-validation once per metric.
scoring = ['accuracy', 'f1', 'precision', 'recall']

print('5-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, clf4, sclf],
                      ['KNN',
                       'Random Forest',
                       'Naive Bayes',
                       'XGBClassifier',
                       'StackingClassifier']):

    # Silence library warnings only while fitting, so the filter does not
    # leak into the rest of the kernel session and hide unrelated problems.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        cv_results = model_selection.cross_validate(clf, df_new, y,
                                                    cv=5, scoring=scoring)

    # Per-fold test scores, one array of length 5 per metric.
    scores = cv_results['test_accuracy']
    scores_f1 = cv_results['test_f1']
    scores_p = cv_results['test_precision']
    scores_r = cv_results['test_recall']

    # Report mean and standard deviation across folds.
    print(label)
    print("Accuracy: %0.2f (+/- %0.2f)"
          % (scores.mean(), scores.std()))

    print("F1 Score: %0.2f (+/- %0.2f) "
          % (scores_f1.mean(), scores_f1.std()))
    print("Precision: %0.2f (+/- %0.2f) "
          % (scores_p.mean(), scores_p.std()))
    print("Recall: %0.2f (+/- %0.2f) "
          % (scores_r.mean(), scores_r.std()))

5-fold cross validation:

KNN
Accuracy: 0.93 (+/- 0.09)
F1 Score: 0.93 (+/- 0.09) 
Precision: 0.94 (+/- 0.12) 
Recall: 0.94 (+/- 0.13) 
Random Forest
Accuracy: 0.93 (+/- 0.09)
F1 Score: 0.92 (+/- 0.09) 
Precision: 0.94 (+/- 0.12) 
Recall: 0.93 (+/- 0.13) 
Naive Bayes
Accuracy: 0.85 (+/- 0.12)
F1 Score: 0.87 (+/- 0.10) 
Precision: 0.84 (+/- 0.16) 
Recall: 0.93 (+/- 0.13) 
XGBClassifier
Accuracy: 0.97 (+/- 0.06)
F1 Score: 0.97 (+/- 0.05) 
Precision: 0.95 (+/- 0.09) 
Recall: 1.00 (+/- 0.00) 
