In [27]:
import pandas as pd
import numpy as np

from sklearn import preprocessing


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

In [28]:
df = pd.read_csv('../mushrooms.csv')


# One Hot Encoding - with 'odor' feature

In [None]:
x_o = pd.get_dummies(df.drop(['habitat','population','spore-print-color','stalk-root','stalk-surface-above-ring',
                 'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type',
                'class','gill-attachment','gill-spacing','gill-size','odor'],axis=1)
a = pd.get_dummies(df['class'])
y_o = a.drop(['p'],axis=1).to_numpy().ravel()
print(y)

In [48]:
data = df.drop(['habitat','population','spore-print-color','stalk-root','stalk-surface-above-ring',
                 'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type',
                'class','gill-attachment','gill-spacing','gill-size','odor','ring-type','ring-number','stalk-shape','gill-color'],axis=1)

In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 5 columns):
cap-shape      8124 non-null object
cap-surface    8124 non-null object
cap-color      8124 non-null object
bruises        8124 non-null object
veil-color     8124 non-null object
dtypes: object(5)
memory usage: 317.5+ KB


# Split Train - Test - with 'odor' feature

In [23]:
x_train_o, x_test_o, y_train_o, y_test_o = train_test_split(x_o, y_o, test_size=0.33, random_state=0)

# Looking for the best parameters - with 'odor' feature

In [24]:
params = {"n_neighbors": range(2, 101),
          "p": range(1,2),
          "weights": ["uniform", "distance"]}

knn = KNeighborsClassifier()

knn_classifier_search = GridSearchCV(knn, params, cv=5,n_jobs=4)
knn_classifier_search.fit(x_train_o, y_train_o)
knn_classifier_search.best_params_

KeyboardInterrupt: 

In [None]:
best_knn_classifier = \
    KNeighborsClassifier(n_neighbors=knn_classifier_search.best_params_["n_neighbors"],
                           p=knn_classifier_search.best_params_["p"],
                           weights=knn_classifier_search.best_params_["weights"])
best_knn_classifier.fit(x_train_o, y_train_o)
print(best_knn_classifier.score(x_test_o, y_test_o))
y_pred_o = best_knn_classifier.predict(x_test_o)
print(y_pred_o)
print(accuracy_score(y_test_o,y_pred_o))

In [19]:
best_score_r = 0.0
best_params_r = {}

for i in range(1, 51):
    r = i / 20
    for w in ["uniform", "distance"]:
        for p in [1, 2]:
            classifier = RadiusNeighborsClassifier(radius=r, p=p, weights=w)
            try:
                scores = cross_val_score(classifier, x_train_o, y_train_o, cv=5)
                score = scores.mean()
            except ValueError:
                score = 0.0
            if score > best_score_r:
                best_score_r = score
                best_params_r["radius"] = r
                best_params_r["weights"] = w
                best_params_r["p"] = p
print(best_score_r)
print(best_params_r)

0.9990815575541511
{'radius': 1.45, 'weights': 'uniform', 'p': 2}


In [20]:
classifier_r = RadiusNeighborsClassifier(radius=best_params_r["radius"],
                                       p=best_params_r["p"],
                                       weights=best_params_r["weights"])
classifier_r.fit(x_train_o, y_train_o)
y_predr_o = classifier_r.predict(x_test_o)
print(y_predr_o)
print(accuracy_score(y_test_o, y_predr_o))

[0 1 1 ... 0 1 0]
0.9992540096978739


# One Hot Encoding - without 'odor' feature

In [44]:
x = pd.get_dummies(df.drop(['habitat','population','spore-print-color','stalk-root','stalk-surface-above-ring',
                 'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type',
                'class','gill-attachment','gill-spacing','gill-size','odor','ring-type','ring-number','stalk-shape','gill-color'],axis=1))
a = pd.get_dummies(df['class'])
y = a.drop(['p'],axis=1).to_numpy().ravel()
x

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,cap-color_r,cap-color_u,cap-color_w,cap-color_y,bruises_f,bruises_t,veil-color_n,veil-color_o,veil-color_w,veil-color_y
0,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,1,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0
3,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
8120,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
8121,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
8122,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0


# Split Train - Test - without 'odor' feature

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0)

# Looking for the best parameters - without 'odor' feature

In [54]:
%%time

params = {"n_neighbors": range(2, 101),
          "p": range(1,2),
          "weights": ["uniform", "distance"]}

knn = KNeighborsClassifier()

knn_classifier_search = GridSearchCV(knn, params, cv=5,n_jobs=4)
knn_classifier_search.fit(x_train, y_train)
knn_classifier_search.best_params_

CPU times: user 3.3 s, sys: 180 ms, total: 3.48 s
Wall time: 2min 14s


{'n_neighbors': 55, 'p': 1, 'weights': 'distance'}

In [55]:
%%time

best_knn_classifier = \
    KNeighborsClassifier(n_neighbors=knn_classifier_search.best_params_["n_neighbors"],
                           p=knn_classifier_search.best_params_["p"],
                           weights=knn_classifier_search.best_params_["weights"])
best_knn_classifier.fit(x_train, y_train)
print(best_knn_classifier.score(x_test, y_test))
y_pred = best_knn_classifier.predict(x_test)
print(y_pred)
print(accuracy_score(y_test,y_pred))

0.9157030958597538
[1 1 1 ... 0 1 0]
0.9157030958597538
CPU times: user 1.22 s, sys: 0 ns, total: 1.22 s
Wall time: 1.22 s


In [56]:
%%time

best_score_r = 0.0
best_params_r = {}

for i in range(1, 51):
    r = i / 20
    for w in ["uniform", "distance"]:
        for p in [1, 2]:
            classifier = RadiusNeighborsClassifier(radius=r, p=p, weights=w)
            try:
                scores = cross_val_score(classifier, x_train, y_train, cv=5)
                score = scores.mean()
            except ValueError:
                score = 0.0
            if score > best_score_r:
                best_score_r = score
                best_params_r["radius"] = r
                best_params_r["weights"] = w
                best_params_r["p"] = p
print(best_score_r)
print(best_params_r)

0.9053842232485281
{'radius': 1.45, 'weights': 'distance', 'p': 2}
CPU times: user 2min 55s, sys: 600 ms, total: 2min 55s
Wall time: 2min 55s


In [57]:
%%time

classifier_r = RadiusNeighborsClassifier(radius=best_params_r["radius"],
                                       p=best_params_r["p"],
                                       weights=best_params_r["weights"])
classifier_r.fit(x_train, y_train)
y_predr = classifier_r.predict(x_test)
print(y_predr)
print(accuracy_score(y_test, y_predr))

[1 1 1 ... 0 1 0]
0.917195076464006
CPU times: user 1.19 s, sys: 3.99 ms, total: 1.2 s
Wall time: 1.2 s
