In [26]:
# inline plotting instead of popping out
%matplotlib inline

# python 3.6.8
import os, itertools, csv

from IPython.display import Image
from IPython.display import display

# numpy  1.19.5
import numpy as np

# pandas  0.25.3
import pandas as pd

# scikit-learn  0.22
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_moons
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# matplotlib  3.1.2
import matplotlib
# Import the pyplot submodule explicitly. `import matplotlib` on its own does
# not load `matplotlib.pyplot`, so reading it as an attribute only works when
# something else (e.g. the `%matplotlib inline` magic) has imported it first.
import matplotlib.pyplot as plt
# Use a larger default font for every figure drawn in this notebook.
matplotlib.rcParams.update({'font.size': 22})

# load utility classes/functions that has been taught in previous labs
# e.g., plot_decision_regions()
from lib import *

# Make output directory.
# `exist_ok=True` makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs("output/", exist_ok=True)

In [27]:
# Column labels for the mushroom data file; the first column holds the class
# label and the remaining 22 columns are the features.
column_name = [
    'classes',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# The file is read without a header row (`header=None`), so the labels are
# supplied directly through `names`.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'mushroom/agaricus-lepiota.data',
    header=None,
    names=column_name,
    engine='python',
)
df.head()

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [39]:
# Separate the target column from the feature columns.
y = df['classes']
X = df.drop(columns='classes')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2)


In [40]:
# Show the (features, labels) shapes of the training split, then the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [41]:
# Per-column preprocessing for the categorical features:
#   1. treat '?' as the missing-value marker and replace it with the most
#      frequent value of the column;
#   2. one-hot encode the result, ignoring categories not seen during fit.
transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values='?')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply the preprocessing to every column except the 'classes' label.
preprocessor = ColumnTransformer(transformers=[
    ('transformer', transformer, df.columns.drop('classes')),
])


In [43]:
# KNN
# Chain the shared preprocessing with a k-nearest-neighbours classifier
# (default hyper-parameters), fit on the training split, and report the
# accuracy on the held-out test split.
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', KNeighborsClassifier()),
])
model.fit(x_train, y_train)

accuracy = model.score(x_test, y_test)
print(accuracy)

1.0


In [44]:
# SVM
# Chain the shared preprocessing with a support vector classifier, fit on the
# training split, and score on the held-out test split.
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', SVC(gamma='scale'))
]).fit(x_train, y_train)
acc = model.score(x_test, y_test)
# Print this model's own score. `accuracy` still holds the value computed for
# the previous classifier, so printing it would not reflect the SVM at all.
print(acc)

1.0


兩個 Model 的 Accuracy 都是 1.0