In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [2]:
# Read the mushroom dataset from its CSV file into a DataFrame and display it
mushrooms_csv = "../data/mushrooms.csv"
data = pd.read_csv(mushrooms_csv)
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [3]:
# Summarize every column: non-null count, number of distinct values,
# the most frequent value and how often it occurs
data.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [4]:
# Every column is categorical, so list the distinct values found in each one.
# The frame is built with a single row (one list per column) and transposed
# for display so that each column name becomes a row label.
uniques = pd.DataFrame(
    {column: [list(data[column].unique())] for column in data.columns}
)
uniques.T

Unnamed: 0,0
class,"[p, e]"
cap-shape,"[x, b, s, f, k, c]"
cap-surface,"[s, y, f, g]"
cap-color,"[n, y, w, g, e, p, b, u, c, r]"
bruises,"[t, f]"
odor,"[p, a, l, n, f, c, y, s, m]"
gill-attachment,"[f, a]"
gill-spacing,"[c, w]"
gill-size,"[n, b]"
gill-color,"[k, n, g, p, w, h, u, e, b, r, y, o]"


In [5]:
# "stalk-root" appears to mark missing values with "?"; keep only the rows without it
filtered_data = data.loc[data["stalk-root"].ne("?")]

# Re-list the distinct values per column to see what remains after filtering
filtered_data_uniques = pd.DataFrame(
    {column: [list(filtered_data[column].unique())] for column in filtered_data.columns}
)
filtered_data_uniques.T

Unnamed: 0,0
class,"[p, e]"
cap-shape,"[x, b, s, f, k, c]"
cap-surface,"[s, y, f, g]"
cap-color,"[n, y, w, g, e, p, b, c]"
bruises,"[t, f]"
odor,"[p, a, l, n, f, c, m]"
gill-attachment,"[f, a]"
gill-spacing,"[c, w]"
gill-size,"[n, b]"
gill-color,"[k, n, g, p, w, h, u, r, y]"


In [6]:
# Separate the target column ("class") from the feature columns
y = filtered_data["class"]
X = filtered_data.drop(columns=["class"])

# One-hot encode every categorical feature into a dense array.
# NOTE(review): the encoder is fitted on all rows, i.e. before the train/test
# split performed in a later cell — consider fitting on the training rows only;
# confirm whether this is intended.
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)

In [7]:
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a k-nearest-neighbours classifier on the training rows;
# fit() returns the estimator itself, so the call can be chained
k = 5
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
knn

In [8]:
# Predict the held-out rows and report the fraction classified correctly
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [9]:
# Repeat the experiment with a much larger neighbourhood (k = 3333) for comparison,
# reusing the same train/test split as before
k = 3333
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6244464127546502


Therefore, using more neighbors does not always improve accuracy: raising k from 5 to 3333 dropped the test accuracy from 1.0 to about 0.62.