# Step 1. select real dataset

In [1]:
#dataset : Mushrooms Classification (https://www.kaggle.com/mirichoi0218/insurance/data)
import pandas as pd
mushrooms = pd.read_csv('./datasets/mushrooms/mushrooms.csv')
mushrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [2]:
mushrooms.shape

(8124, 23)

# Step 2.1 Do Preprocessing

In [3]:
#check missing data
mushrooms.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [4]:
#Do preprocessing ==> 1. make numerical attributes
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for col in mushrooms.columns:
    mushrooms[col] = le.fit_transform(mushrooms[col])
mushrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [5]:
#get data label
X=mushrooms.drop(['class'],axis=1).values
y=mushrooms['class'].values

In [6]:
#Do scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(X)



# Step 2.2 Make training set and test set

In [7]:
#training set & test set 나누기
from sklearn.model_selection import train_test_split
import numpy as np
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Step 3. Choose several weak models and soft and hard voting classifier using the weak models.

In [8]:
#weak model
import numpy as np
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=2)
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2,random_state=42)
from sklearn.svm import SVC
svm_clf = SVC(gamma='auto',C=2,random_state=42,probability=True)

In [9]:
#soft voting
from sklearn.ensemble import VotingClassifier
soft_clf = VotingClassifier(estimators=[('sgd',sgd_clf),('knn',knn_clf),('tree',tree_clf),('svc',svm_clf)],
                           voting='soft')
#hard voting
hard_clf = VotingClassifier(estimators=[('sgd',sgd_clf),('knn',knn_clf),('tree',tree_clf),('svc',svm_clf)],
                           voting='hard')

# Step 4. Find the highest accuracy about in step3 model

## Step 4.1 Get accuracy using grid search

In [10]:
#grid search (soft voting)
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'sgd__max_iter':[5,10],
     'knn__n_neighbors':[2,3],
     'tree__max_depth':[2,3,4],
     'svc__C':[2,3,4]}
  ]
grid = GridSearchCV(soft_clf,param_grid,cv=10,scoring='accuracy', return_train_score=True)
grid.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=VotingClassifier(estimators=[('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=N...=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'sgd__max_iter': [5, 10], 'knn__n_neighbors': [2, 3], 'tree__max_depth': [2, 3, 4], 'svc__C': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [11]:
cvres = grid.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print('##########################################################')
print('Best hyperparameter: {}'.format(grid.best_params_))

0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 2, 'tree__max_depth': 2}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 2, 'tree__max_depth': 3}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 2, 'tree__max_depth': 4}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 3, 'tree__max_depth': 2}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 3, 'tree__max_depth': 3}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 3, 'tree__max_depth': 4}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 4, 'tree__max_depth': 2}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 4, 'tree__max_depth': 3}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 4, 'tree__max_depth': 4}
0.9996922603477458 {'knn__n_neighbors': 2, 'sgd__max_iter': 10, 'svc__C': 2, 'tree__max_depth': 2}
0.9996922603477458 

In [13]:
#grid search (hard voting)
param_grid_2 = [
    {'sgd__max_iter':[5,10],
     'knn__n_neighbors':[2,3],
     'tree__max_depth':[2,3,4],
     'svc__C':[2,3,4]}
  ]
grid_2 = GridSearchCV(hard_clf,param_grid_2,cv=10,scoring='accuracy', return_train_score=True)
grid_2.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=VotingClassifier(estimators=[('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=N...=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'sgd__max_iter': [5, 10], 'knn__n_neighbors': [2, 3], 'tree__max_depth': [2, 3, 4], 'svc__C': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [14]:
cvres_2 = grid_2.cv_results_
for mean_score, params in zip(cvres_2["mean_test_score"], cvres_2["params"]):
    print(mean_score, params)
print('##########################################################')
print('Best hyperparameter: {}'.format(grid_2.best_params_))

0.9883058932143407 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 2, 'tree__max_depth': 2}
0.9946145560855516 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 2, 'tree__max_depth': 3}
0.9949222957378058 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 2, 'tree__max_depth': 4}
0.9883058932143407 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 3, 'tree__max_depth': 2}
0.9946145560855516 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 3, 'tree__max_depth': 3}
0.9949222957378058 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 3, 'tree__max_depth': 4}
0.9883058932143407 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 4, 'tree__max_depth': 2}
0.9946145560855516 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 4, 'tree__max_depth': 3}
0.9949222957378058 {'knn__n_neighbors': 2, 'sgd__max_iter': 5, 'svc__C': 4, 'tree__max_depth': 4}
0.9867671949530697 {'knn__n_neighbors': 2, 'sgd__max_iter': 10, 'svc__C': 2, 'tree__max_depth': 2}
0.9941529466071704 

## Step 4.2 Use Randomize Search

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
        'sgd__max_iter': randint(low=1, high=100),
        'knn__n_neighbors': randint(low=1, high=10),
        'tree__max_depth': randint(low=1,high=15),
        'svc__C':randint(low=1, high=15)
    }
rnd_search = RandomizedSearchCV(soft_clf, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)
rnd_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=VotingClassifier(estimators=[('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=N...=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'sgd__max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C4DAF9518>, 'knn__n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C4DAF9C18>, 'tree__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C4DAF9A20>, 'svc__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C4DAEE978>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True

In [16]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

1.0 {'knn__n_neighbors': 7, 'sgd__max_iter': 52, 'svc__C': 13, 'tree__max_depth': 11}
1.0 {'knn__n_neighbors': 8, 'sgd__max_iter': 61, 'svc__C': 5, 'tree__max_depth': 7}
1.0 {'knn__n_neighbors': 3, 'sgd__max_iter': 87, 'svc__C': 11, 'tree__max_depth': 11}
0.9998461301738729 {'knn__n_neighbors': 8, 'sgd__max_iter': 24, 'svc__C': 3, 'tree__max_depth': 6}
1.0 {'knn__n_neighbors': 5, 'sgd__max_iter': 2, 'svc__C': 8, 'tree__max_depth': 12}
0.9996922603477458 {'knn__n_neighbors': 6, 'sgd__max_iter': 2, 'svc__C': 12, 'tree__max_depth': 5}
1.0 {'knn__n_neighbors': 1, 'sgd__max_iter': 76, 'svc__C': 10, 'tree__max_depth': 6}
1.0 {'knn__n_neighbors': 9, 'sgd__max_iter': 49, 'svc__C': 11, 'tree__max_depth': 11}
1.0 {'knn__n_neighbors': 3, 'sgd__max_iter': 55, 'svc__C': 4, 'tree__max_depth': 9}
1.0 {'knn__n_neighbors': 3, 'sgd__max_iter': 51, 'svc__C': 7, 'tree__max_depth': 5}


In [17]:
param_distribs_2 = {
        'sgd__max_iter': randint(low=1, high=100),
        'knn__n_neighbors': randint(low=1, high=10),
        'tree__max_depth': randint(low=1,high=15),
        'svc__C':randint(low=1, high=15)
    }
rnd_search_2 = RandomizedSearchCV(hard_clf, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)
rnd_search_2.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=VotingClassifier(estimators=[('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=N...=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'sgd__max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C4DAF9518>, 'knn__n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C4DAF9C18>, 'tree__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C4DAF9A20>, 'svc__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C4DAEE978>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True

In [18]:
cvres_2 = rnd_search_2.cv_results_
for mean_score, params in zip(cvres_2["mean_test_score"], cvres_2["params"]):
    print(mean_score, params)

0.9998461301738729 {'knn__n_neighbors': 7, 'sgd__max_iter': 52, 'svc__C': 13, 'tree__max_depth': 11}
0.9990767810432374 {'knn__n_neighbors': 8, 'sgd__max_iter': 61, 'svc__C': 5, 'tree__max_depth': 7}
1.0 {'knn__n_neighbors': 3, 'sgd__max_iter': 87, 'svc__C': 11, 'tree__max_depth': 11}
0.9976919526080935 {'knn__n_neighbors': 8, 'sgd__max_iter': 24, 'svc__C': 3, 'tree__max_depth': 6}
1.0 {'knn__n_neighbors': 5, 'sgd__max_iter': 2, 'svc__C': 8, 'tree__max_depth': 12}
0.996768733651331 {'knn__n_neighbors': 6, 'sgd__max_iter': 2, 'svc__C': 12, 'tree__max_depth': 5}
0.9987690413909832 {'knn__n_neighbors': 1, 'sgd__max_iter': 76, 'svc__C': 10, 'tree__max_depth': 6}
0.9992306508693645 {'knn__n_neighbors': 9, 'sgd__max_iter': 49, 'svc__C': 11, 'tree__max_depth': 11}
1.0 {'knn__n_neighbors': 3, 'sgd__max_iter': 55, 'svc__C': 4, 'tree__max_depth': 9}
0.9946145560855516 {'knn__n_neighbors': 3, 'sgd__max_iter': 51, 'svc__C': 7, 'tree__max_depth': 5}


# Step 5. Find precision and Recall value

## Step 5.1 Calculate Precision and Recall about whole classification model

In [19]:
#weak model
sgd_clf.fit(X_train,y_train)
knn_clf.fit(X_train,y_train)
tree_clf.fit(X_train,y_train)
svm_clf.fit(X_train,y_train)
#voting
soft_clf.fit(X_train,y_train)
hard_clf.fit(X_train,y_train)

VotingClassifier(estimators=[('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       p...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [20]:
#example test
X_test[13]

array([-0.8403434 ,  0.14012794, -0.59107461, -0.84322964,  0.40656203,
        0.16289645,  2.27861212, -0.66903831, -0.22899776,  0.87351064,
        1.78146019, -2.53465204,  0.58638466,  0.62244139,  0.63199138,
        0.        ,  0.14203663, -0.25613174, -1.27221574, -0.67019486,
       -2.91054623, -0.29572966])

In [21]:
y_test[13]

0

In [22]:
#soft_voting prediction
y_pred = soft_clf.predict([X_test[13]])
print('Soft voting clf prediction : {}'.format(y_pred))
#hard_voting prediction
y_pred2 = hard_clf.predict([X_test[13]])
print('Hard voting clf prediction : {}'.format(y_pred2))

Soft voting clf prediction : [0]
Hard voting clf prediction : [0]


In [23]:
from sklearn.metrics import precision_score,recall_score
def getPR(classifier,X_test,y_test):
    y_score = classifier.predict(X_test)
    precision = precision_score(y_test, y_score,average='micro')
    recall =  recall_score(y_test, y_score,average='micro')
    print('precision score: {}'.format(precision))
    print('recall score: {}'.format(recall))
#weak model
getPR(sgd_clf,X_test,y_test)
getPR(knn_clf,X_test,y_test)
getPR(tree_clf,X_test,y_test)
getPR(svm_clf,X_test,y_test)

precision score: 0.9101538461538462
recall score: 0.9101538461538462
precision score: 1.0
recall score: 1.0
precision score: 0.9076923076923077
recall score: 0.9076923076923077
precision score: 1.0
recall score: 1.0
