# Mushrooms Classification using Adaboost & Gradient Boosting

## Step 1. Select Dataset

In [1]:
#dataset : Mushrooms Classification (https://www.kaggle.com/uciml/mushroom-classification)
import pandas as pd
mushrooms = pd.read_csv('./datasets/mushrooms/mushrooms.csv')
mushrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [2]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for col in mushrooms.columns:
    mushrooms[col] = labelencoder.fit_transform(mushrooms[col])
mushrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [3]:
X=mushrooms.loc[:,("bruises","gill-size","gill-color")]
X.head()

Unnamed: 0,bruises,gill-size,gill-color
0,1,1,4
1,1,0,4
2,1,0,5
3,1,1,5
4,0,0,4


In [4]:
X=X.values
y=mushrooms['class'].values

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
cat_pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('onehot_encoder', OneHotEncoder(sparse=False,categories='auto')),
    ])
X_prepared=cat_pipeline.fit_transform(X)



In [6]:
#training set & test set 나누기
from sklearn.model_selection import train_test_split
import numpy as np
X_train , X_test , y_train , y_test = train_test_split(X_prepared,y,test_size=0.2,random_state=42)

## Step 2. Select a weak learner to use as estimator

In [7]:
import time
import numpy as np
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
sgd_time = time.time() 
sgd_clf.fit(X_train,y_train)
sgd_time = time.time() -sgd_time

from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=2)
knn_time = time.time() 
knn_clf.fit(X_train,y_train)
knn_time = time.time() -knn_time

from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2,random_state=42)
tree_time = time.time() 
tree_clf.fit(X_train,y_train)
tree_time = time.time() -tree_time

from sklearn.svm import SVC
svm_clf = SVC(gamma='auto',C=2,random_state=42,probability=True)
svc_time = time.time() 
svm_clf.fit(X_train,y_train)
svc_time = time.time() -svc_time 

In [8]:
from sklearn.metrics import accuracy_score
def getScore(model):
    y_score = model.predict(X_test)
    accuracy = accuracy_score(y_test,y_score)
    return accuracy
print('SGD classifier Accuracy : {}'.format(getScore(sgd_clf)))
print('KNN classifier Accuracy : {}'.format(getScore(knn_clf)))
print('Decision Tree classifier Accuracy : {}'.format(getScore(tree_clf)))
print('SVM classifier Accuracy : {}'.format(getScore(svm_clf)))

SGD classifier Accuracy : 0.8473846153846154
KNN classifier Accuracy : 0.7655384615384615
Decision Tree classifier Accuracy : 0.7556923076923077
SVM classifier Accuracy : 0.8633846153846154


## Step 3. Train Adaboost classifier and Gradient boosting classifier

In [9]:
#Adaboost
from sklearn.ensemble import AdaBoostClassifier
ada_clf_sgd = AdaBoostClassifier(SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log'),
                            n_estimators=10,learning_rate=0.5, random_state=42)
ada_sgd_time = time.time() 
ada_clf_sgd.fit(X_train,y_train)
ada_sgd_time = time.time()-ada_sgd_time

In [10]:
ada_clf_knn = AdaBoostClassifier(KNeighborsClassifier(n_neighbors=2),
                            n_estimators=10,learning_rate=0.5, random_state=42)
ada_knn_time = time.time() 
ada_clf_knn.fit(X_train,y_train)
ada_knn_time = time.time() - ada_knn_time

ValueError: KNeighborsClassifier doesn't support sample_weight.

In [11]:
ada_clf_tree = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,random_state=42),
                            n_estimators=10,learning_rate=0.5, random_state=42)
ada_tree_time = time.time() 
ada_clf_tree.fit(X_train,y_train)
ada_tree_time = time.time() -ada_tree_time

- Compatible base learner's fit method needs to support sample_weight : AdaBoostClassifier, BernoulliNB, DecisionTreeClassifier, ExtraTreeClassifier, ExtraTreesClassifier, MultinomialNB, NuSVC, Perceptron, RandomForestClassifier, RidgeClassifierCV, SGDClassifier, SVC.

In [12]:
ada_clf_svc = AdaBoostClassifier(SVC(gamma='auto',C=2,random_state=42,probability=True),
                            n_estimators=10,learning_rate=0.5, random_state=42)
ada_svc_time = time.time() 
ada_clf_svc.fit(X_train,y_train)
ada_svc_time = time.time() -ada_svc_time

In [13]:
#GradientBoosting --> depth=2
grd_sgd_time = time.time() 
grd_clf_sgd1 = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
grd_clf_sgd1.fit(X_train,y_train)
y2 = y_train - grd_clf_sgd1.predict(X_train)
grd_clf_sgd2 = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
grd_clf_sgd2.fit(X_train,y2)
y3 = y2 - grd_clf_sgd2.predict(X_train)
grd_clf_sgd = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
grd_clf_sgd.fit(X_train,y3)
grd_sgd_time = time.time() -grd_sgd_time

In [14]:
grd_knn_time=time.time()
grd_clf_knn1 = KNeighborsClassifier(n_neighbors=2)
grd_clf_knn1.fit(X_train,y_train)
y2 = y_train - grd_clf_knn1.predict(X_train)
grd_clf_knn2 = KNeighborsClassifier(n_neighbors=2)
grd_clf_knn2.fit(X_train,y2)
y3 = y2 - grd_clf_knn2.predict(X_train)
grd_clf_knn = KNeighborsClassifier(n_neighbors=2)
grd_clf_knn.fit(X_train,y3)
grd_knn_time=time.time()-grd_knn_time

In [15]:
#GradientBoosting
from sklearn.ensemble import GradientBoostingClassifier
grd_clf_tree = GradientBoostingClassifier(max_depth=2,
                                     n_estimators=10, learning_rate=0.5)
grd_tree_time=time.time()
grd_clf_tree.fit(X_train,y_train)
grd_tree_time=time.time()-grd_tree_time

In [16]:
grd_svc_time=time.time()
grd_clf_svc1 = SVC(gamma='auto',C=2,random_state=42,probability=True)
grd_clf_svc1.fit(X_train,y_train)
y2 = y_train - grd_clf_svc1.predict(X_train)
grd_clf_svc2 = SVC(gamma='auto',C=2,random_state=42,probability=True)
grd_clf_svc2.fit(X_train,y2)
y3 = y2 - grd_clf_svc2.predict(X_train)
grd_clf_svc = SVC(gamma='auto',C=2,random_state=42,probability=True)
grd_clf_svc.fit(X_train,y3)
grd_svc_time=time.time()-grd_svc_time

## Step 4. Compare Step3 model accuracy

In [17]:
print('Adaboost classifier - SGD Accuracy : {}'.format(getScore(ada_clf_sgd)))
#print('Adaboost classifier - KNN Accuracy : {}'.format(getScore(ada_clf_knn)))
print('Adaboost classifier - Decision Tree Accuracy : {}'.format(getScore(ada_clf_tree)))
print('Adaboost classifier - SVC Accuracy : {}'.format(getScore(ada_clf_svc)))

Adaboost classifier - SGD Accuracy : 0.8116923076923077
Adaboost classifier - Decision Tree Accuracy : 0.8633846153846154
Adaboost classifier - SVC Accuracy : 0.7409230769230769


In [18]:
print('Gradient boosting classifier - SGD Accuracy Accuracy : {}'.format(getScore(grd_clf_sgd)))
print('Gradient boosting classifier - KNN Accuracy Accuracy : {}'.format(getScore(grd_clf_knn)))
print('Gradient boosting classifier - Decision Tree Accuracy : {}'.format(getScore(grd_clf_tree)))
print('Gradient boosting classifier - SVC Accuracy Accuracy : {}'.format(getScore(grd_clf_svc)))

Gradient boosting classifier - SGD Accuracy Accuracy : 0.5181538461538462
Gradient boosting classifier - KNN Accuracy Accuracy : 0.5187692307692308
Gradient boosting classifier - Decision Tree Accuracy : 0.8307692307692308
Gradient boosting classifier - SVC Accuracy Accuracy : 0.5187692307692308


## Step 5. Find precision and Recall value

In [19]:
from sklearn.metrics import precision_score,recall_score
def getPR(classifier):
    y_score = classifier.predict(X_test)
    precision = precision_score(y_test, y_score,average='micro')
    recall =  recall_score(y_test, y_score,average='micro')
    print('precision score: {}'.format(precision))
    print('recall score: {}'.format(recall))
#weak model
print('<knn_clf>')
getPR(knn_clf)
print('<tree_clf>')
getPR(tree_clf)
#Bagging and Pasting
print('<ada_clf_sgd>')
getPR(ada_clf_sgd)
# print('<ada_clf_knn>')
# getPR(ada_clf_knn)
print('<ada_clf_tree>')
getPR(ada_clf_tree)
print('<ada_clf_svc>')
getPR(ada_clf_svc)
print('<grd_clf_sgd>')
getPR(grd_clf_sgd)
print('<grd_clf_knn>')
getPR(grd_clf_knn)
print('<grd_clf_tree>')
getPR(grd_clf_tree)
print('<grd_clf_svc>')
getPR(grd_clf_svc)

<knn_clf>
precision score: 0.7655384615384615
recall score: 0.7655384615384615
<tree_clf>
precision score: 0.7556923076923077
recall score: 0.7556923076923077
<ada_clf_sgd>
precision score: 0.8116923076923077
recall score: 0.8116923076923077
<ada_clf_tree>
precision score: 0.8633846153846154
recall score: 0.8633846153846154
<ada_clf_svc>
precision score: 0.7409230769230769
recall score: 0.7409230769230769
<grd_clf_sgd>
precision score: 0.5181538461538462
recall score: 0.5181538461538462
<grd_clf_knn>
precision score: 0.5187692307692308
recall score: 0.5187692307692308
<grd_clf_tree>
precision score: 0.8307692307692308
recall score: 0.8307692307692308
<grd_clf_svc>
precision score: 0.5187692307692308
recall score: 0.5187692307692308


## Step 6. Compare the results with Bagging classifier's accuracy

In [20]:

from sklearn.ensemble import BaggingClassifier
#Bagging classifier - SGD
bag_clf_sgd = BaggingClassifier(SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log'),
                                n_estimators=10,max_samples=100,
                                bootstrap=True ,n_jobs=-1 , random_state=42)
bag_sgd_time=time.time()
bag_clf_sgd.fit(X_train,y_train)
bag_sgd_time=time.time()-bag_sgd_time
#Bagging classifier - KNeighborsClassifier
bag_clf_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=2),
                                n_estimators=10,max_samples=100,
                                bootstrap=True ,n_jobs=-1 , random_state=42)
bag_knn_time=time.time()
bag_clf_knn.fit(X_train,y_train)
bag_knn_time=time.time()-bag_knn_time
#Bagging classifier - DecisionTreeClassifier
bag_clf_tree = BaggingClassifier(DecisionTreeClassifier(max_depth=2,random_state=42),
                                n_estimators=10,max_samples=100,
                                bootstrap=True ,n_jobs=-1 , random_state=42)
bag_tree_time=time.time()
bag_clf_tree.fit(X_train,y_train)
bag_tree_time=time.time()-bag_tree_time
#Bagging classifier - SVC
bag_clf_svc = BaggingClassifier(SVC(gamma='auto',C=2,random_state=42,probability=True),
                                n_estimators=10,max_samples=100,
                                bootstrap=True ,n_jobs=-1 , random_state=42)
bag_svc_time=time.time()
bag_clf_svc.fit(X_train,y_train)
bag_svc_time=time.time()-bag_svc_time

In [21]:
#Pasting classifier - SGD
pas_clf_sgd = BaggingClassifier(SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log'),
                                n_estimators=10,max_samples=100,
                                bootstrap=False ,n_jobs=-1 , random_state=42)
pas_sgd_time=time.time()
pas_clf_sgd.fit(X_train,y_train)
pas_sgd_time=time.time()-pas_sgd_time
#Pasting classifier - KNeighborsClassifier
pas_clf_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=2),
                                n_estimators=10,max_samples=100,
                                bootstrap=False ,n_jobs=-1 , random_state=42)
pas_knn_time=time.time()
pas_clf_knn.fit(X_train,y_train)
pas_knn_time=time.time()-pas_knn_time
#Pasting classifier - DecisionTreeClassifier
pas_clf_tree = BaggingClassifier(DecisionTreeClassifier(max_depth=2,random_state=42),
                                n_estimators=10,max_samples=100,
                                bootstrap=False ,n_jobs=-1 , random_state=42)
pas_tree_time=time.time()
pas_clf_tree.fit(X_train,y_train)
pas_tree_time=time.time()-pas_tree_time
#Pasting classifier - SVC
pas_clf_svc = BaggingClassifier(SVC(gamma='auto',C=2,random_state=42,probability=True),
                                n_estimators=10,max_samples=100,
                                bootstrap=False ,n_jobs=-1 , random_state=42)
pas_svc_time=time.time()
pas_clf_svc.fit(X_train,y_train)
pas_svc_time=time.time()-pas_svc_time

In [22]:
print('Bagging classifier - SGD classifier Accuracy : {}'.format(getScore(bag_clf_sgd)))
print('Bagging classifier - knn classifier Accuracy : {}'.format(getScore(bag_clf_knn)))
print('Bagging classifier - Decision Tree classifier Accuracy : {}'.format(getScore(bag_clf_tree)))
print('Bagging classifier - SVC classifier Accuracy : {}'.format(getScore(bag_clf_svc)))

Bagging classifier - SGD classifier Accuracy : 0.7692307692307693
Bagging classifier - knn classifier Accuracy : 0.788923076923077
Bagging classifier - Decision Tree classifier Accuracy : 0.7581538461538462
Bagging classifier - SVC classifier Accuracy : 0.824


In [23]:
print('Pasting classifier - SGD classifier Accuracy : {}'.format(getScore(pas_clf_sgd)))
print('Pasting classifier - knn classifier Accuracy : {}'.format(getScore(pas_clf_knn)))
print('Pasting classifier - Decision Tree classifier Accuracy : {}'.format(getScore(pas_clf_tree)))
print('Pasting classifier - SVC classifier Accuracy : {}'.format(getScore(pas_clf_svc)))

Pasting classifier - SGD classifier Accuracy : 0.824
Pasting classifier - knn classifier Accuracy : 0.84
Pasting classifier - Decision Tree classifier Accuracy : 0.7913846153846154
Pasting classifier - SVC classifier Accuracy : 0.8381538461538461


In [24]:
#soft voting
from sklearn.ensemble import VotingClassifier
soft_clf = VotingClassifier(estimators=[('sgd',SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')),
                                        ('knn',KNeighborsClassifier(n_neighbors=2)),
                                        ('tree',DecisionTreeClassifier(max_depth=2,random_state=42)),
                                        ('svc',SVC(gamma='auto',C=2,random_state=42,probability=True))
                                        ],
                           voting='soft')
#hard voting
hard_clf = VotingClassifier(estimators=[('sgd',SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')),
                                        ('knn',KNeighborsClassifier(n_neighbors=2)),
                                        ('tree',DecisionTreeClassifier(max_depth=2,random_state=42)),
                                        ('svc',SVC(gamma='auto',C=2,random_state=42,probability=True))
                                        ],
                           voting='hard')
soft_time=time.time()
soft_clf.fit(X_train,y_train)
soft_time=time.time()-soft_time
hard_time=time.time()
hard_clf.fit(X_train,y_train)
hard_time=time.time()-hard_time

In [25]:
print('Soft Voting classifier Accuracy : {}'.format(getScore(soft_clf)))
print('Hard Voting classifier Accuracy : {}'.format(getScore(hard_clf)))

Soft Voting classifier Accuracy : 0.8615384615384616
Hard Voting classifier Accuracy : 0.8633846153846154


- trainig time compare

In [26]:
print('<Training Time Compare>')
print('1. Weak model')
print('SGD Classifier : {}'.format(sgd_time))
print('knn Classifier : {}'.format(knn_time))
print('tree Classifier : {}'.format(tree_time))
print('SVC Classifier : {}'.format(svc_time))
print('2. AdaBoost')
print('SGD Classifier : {}'.format(ada_sgd_time))
#print('AdaBoost knn Classifier : {}'.format(ada_sgd_time))
print('tree Classifier : {}'.format(ada_tree_time))
print('svc Classifier : {}'.format(ada_svc_time))
print('3. Gradient Boost')
print('SGD Classifier : {}'.format(grd_sgd_time))
print('knn Classifier : {}'.format(grd_knn_time))
print('tree Classifier : {}'.format(grd_tree_time))
print('SVC Classifier : {}'.format(grd_svc_time))
print('4. Bagging')
print('SGD Classifier : {}'.format(bag_sgd_time))
print('knn Classifier : {}'.format(bag_knn_time))
print('tree Classifier : {}'.format(bag_tree_time))
print('SVC Classifier : {}'.format(bag_svc_time))
print('5. Pasting')
print('SGD Classifier : {}'.format(pas_sgd_time))
print('knn Classifier : {}'.format(pas_knn_time))
print('tree Classifier : {}'.format(pas_tree_time))
print('SVC Classifier : {}'.format(pas_svc_time))
print('6. Voting')
print('Soft Voting Classifier : {}'.format(soft_time))
print('Hard Voting Classifier : {}'.format(hard_time))

<Training Time Compare>
1. Weak model
SGD Classifier : 0.00599980354309082
knn Classifier : 0.023000001907348633
tree Classifier : 0.002000093460083008
SVC Classifier : 3.2529244422912598
2. AdaBoost
SGD Classifier : 0.1009976863861084
tree Classifier : 0.03999900817871094
svc Classifier : 111.76341009140015
3. Gradient Boost
SGD Classifier : 0.14800095558166504
knn Classifier : 1.182974100112915
tree Classifier : 0.031000137329101562
SVC Classifier : 8.222809076309204
4. Bagging
SGD Classifier : 1.2769708633422852
knn Classifier : 0.039998531341552734
tree Classifier : 0.026998281478881836
SVC Classifier : 0.030999422073364258
5. Pasting
SGD Classifier : 0.03999948501586914
knn Classifier : 0.019999265670776367
tree Classifier : 0.02499985694885254
SVC Classifier : 0.030998706817626953
6. Voting
Soft Voting Classifier : 3.126927375793457
Hard Voting Classifier : 3.1179280281066895
