In [2]:
import pandas as pd
import numpy as np
from lib import *

#read data
df = pd.read_csv('agaricus-lepiota.data', header=None, sep=',', engine='python')

df.columns = ['label','cap-shape', 'cap-surface', 'cap-color', 'bruises', 
             'odor', 'gill-attachment', 'gill-spacing', 
             'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 
              'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 
              'stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type',
              'spore-print-color','population','habitat']

display(df.head(5))

Unnamed: 0,label,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
###data preproccessing

#Handling Categorical Data
from sklearn.preprocessing import LabelEncoder
# encode label first
label_le = LabelEncoder()
df['label'] = label_le.fit_transform(df['label'].values)

# encode categorical features
catego_features = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 
             'odor', 'gill-attachment', 'gill-spacing', 
             'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 
              'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 
              'stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type',
              'spore-print-color','population','habitat']
catego_le = LabelEncoder()

num_values = []
for i in catego_features:
    df[i] = catego_le.fit_transform(df[i].values)
    classes_list = catego_le.classes_.tolist()
    # store the total number of values
    num_values.append(len(classes_list))
    
    # replace '?' with 'NaN'
    if '?' in classes_list:
        idx = classes_list.index('?')
        df[i] = df[i].replace(idx, np.nan)
        
display(df.head(5))      
#count the number of missing values per column

display(df.isnull().sum())

Unnamed: 0,label,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


label                          0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [4]:
#model using pipeline
#Imputer and OneHotEncoder for SVM and KNN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# find the index of the categorical feature
catego_features_idx = []
for str in catego_features:
    catego_features_idx.append(df.columns.tolist().index(str)-1)  #label is in first column

In [5]:
#sample 2000 rows
from sklearn.model_selection import train_test_split
df_small = df.sample(n=2000, random_state=0)

X = df_small.drop('label', 1).values
y = df_small['label'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [6]:
pipe_svm = Pipeline([('imr', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
                     ('ohe', OneHotEncoder(categorical_features = catego_features_idx, 
                                           n_values = num_values, sparse=False)),
                     ('scl', StandardScaler()),
                     ('clf', SVC(random_state=0))])

param_gamma = [0.0001, 0.001, 0.01, 0.1, 1.0]
param_C = [0.1, 1.0, 10.0, 100.0]

# here you can set parameter for different steps 
# by adding two underlines (__) between step name and parameter name
param_grid = [{'clf__C': param_C, 
               'clf__kernel': ['linear']},
              {'clf__C': param_C, 
               'clf__gamma': param_gamma, 
               'clf__kernel': ['rbf']}]

# set pipe_svm as the estimator
gs = GridSearchCV(estimator=pipe_svm, 
                  param_grid=param_grid, 
                  scoring='accuracy')

gs = gs.fit(X_train, y_train)
print('[SVC: grid search]')
print('Validation accuracy: %.3f' % gs.best_score_)
print(gs.best_params_)

clf = gs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

[SVC: grid search]
Validation accuracy: 1.000
{'clf__C': 10.0, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
Test accuracy: 1.000


In [7]:
pipe_knn = Pipeline([('imr', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
                     ('ohe', OneHotEncoder(categorical_features=catego_features_idx, 
                                           n_values=num_values, sparse=False)),
                     ('scl', StandardScaler()),
                     ('clf', KNeighborsClassifier(n_neighbors=10, p=2, metric='minkowski'))])

pipe_knn.fit(X_train, y_train)
y_pred = pipe_knn.predict(X_test)
print('[KNN]')
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

[KNN]
Misclassified samples: 1
Accuracy: 0.9975


一開始將資料讀出來做preproccessing, 由於全部的features都是英文的分類, 先做了encoding, 其中有一個column有很多項是沒資料的, 將他們轉成NaN, 接著丟進SVM和KNN兩種pipeline, 當中我們做imputation和onehotencoding, 最後就產生出結果. 最好的結果為SVM使用rbf為kernel, c=10, gamma=0.001, 得到的accuracy為100％, 另外KNN的accuracy也有99.75％

In [8]:
plot_decision_regions(X_test, y_test,
                      classifier=pipe_knn, 
                      test_idx=range(y_train.size, 
                                     y_train.size + y_test.size))

ValueError: X has 2 features per sample, expected 22