# **Baseline Comparison:** k-Nearest Neighbor Algorithms

### **Part 1:** Load In and Prepare the Data

In [2]:
# Classification Examples

# Ignore warnings: swap warnings.warn for a do-nothing stand-in so that
# calls routed through warnings.warn emit nothing
import warnings


def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing."""
    return None


warnings.warn = warn

# Imports
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Hex colour palette (defined here, not used in this cell)
colors = [
    '#E69F00',
    '#56B4E9',
    '#009E73',
    '#0072B2',
    '#D55E00',
]

# Read the mushroom CSV into a DataFrame
data = pd.read_csv('data/mushrooms.csv')

# Show column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [3]:
# Work on a copy so the raw `data` frame stays untouched
clean_data = data.copy()

# Rename the outcome column to poisonous and non-poisonous
clean_data = clean_data.rename(columns={'class': 'poisonous'})
clean_data['poisonous'] = clean_data['poisonous'].replace({'p': 'poisonous', 'e': 'non-poisonous'})

# Remove any columns that have only one value type.
# The count uses dropna=False so missing values count as a value, matching
# len(Series.unique()).
print('How many different values do the columns have?')
unique_counts = clean_data.nunique(dropna=False)
for column, n_values in unique_counts.items():
    print(f'   {column}: {n_values}')

# Drop exactly the columns whose count is 1 instead of a hard-coded name:
# a single-valued column carries no information for a classifier.
constant_columns = unique_counts[unique_counts == 1].index.tolist()
clean_data = clean_data.drop(columns=constant_columns)
print()

# Final input data
print('The final categories are:')
clean_data.info()

How many different values do the columns have?
   poisonous: 2
   cap-shape: 6
   cap-surface: 4
   cap-color: 10
   bruises: 2
   odor: 9
   gill-attachment: 2
   gill-spacing: 2
   gill-size: 2
   gill-color: 12
   stalk-shape: 2
   stalk-root: 5
   stalk-surface-above-ring: 4
   stalk-surface-below-ring: 4
   stalk-color-above-ring: 9
   stalk-color-below-ring: 9
   veil-type: 1
   veil-color: 4
   ring-number: 3
   ring-type: 5
   spore-print-color: 9
   population: 6
   habitat: 7

The final categories are:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   poisonous                 8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   obje

In [4]:
# Separate out classfiers
X = data.copy().drop(columns=['class'])
y = data.copy()['class']

# Create dummy variables for all columns in X
X = pd.get_dummies(X, drop_first=False)

# Generate training and testing set: 25%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

### **Part 2:** Create a k-NN with Baseline Parameters

In [7]:
# Build k-NN baseline using standard parameters

# Form the model with the library's default hyperparameters
knn_base = KNeighborsClassifier()

# Train the model on the training split
knn_base.fit(X_train, y_train)

# Predict the test-split labels with the fitted baseline model
# (use `knn_base` itself; no other model object is defined in this notebook's visible cells)
knn_base_pred = knn_base.predict(X_test)

### **Part 3:** Create a k-NN with Optimized Parameters

In [15]:
# Source: https://medium.datadriveninvestor.com/k-nearest-neighbors-in-python-hyperparameters-tuning-716734bc557f

# Candidate values for each hyperparameter to tune
leaf_size = [*range(1, 51, 5)]
n_neighbors = [*range(2, 11)]
p = [1, 2]

# Parameter grid: one key per tuned hyperparameter
hyperparameters = {
    'leaf_size': leaf_size,
    'n_neighbors': n_neighbors,
    'p': p,
}

# Fresh k-NN estimator that uses the ball-tree search structure
knn_opt = KNeighborsClassifier(algorithm='ball_tree')

# Exhaustive grid search over the hyperparameters with 10-fold cross-validation
grid_search = GridSearchCV(
    knn_opt,
    hyperparameters,
    cv=10,
    verbose=10,
)

# Run the search on the training split.
# NOTE(review): `fit` returns the fitted search object, so despite its name
# `knn_opt_pred` is not an array of predictions — confirm downstream usage.
knn_opt_pred = grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits
[CV 1/10; 1/180] START leaf_size=1, n_neighbors=2, p=1..........................
[CV 1/10; 1/180] END leaf_size=1, n_neighbors=2, p=1;, score=1.000 total time=   0.3s
[CV 2/10; 1/180] START leaf_size=1, n_neighbors=2, p=1..........................
[CV 2/10; 1/180] END leaf_size=1, n_neighbors=2, p=1;, score=1.000 total time=   0.2s
[CV 3/10; 1/180] START leaf_size=1, n_neighbors=2, p=1..........................
[CV 3/10; 1/180] END leaf_size=1, n_neighbors=2, p=1;, score=1.000 total time=   0.2s
[CV 4/10; 1/180] START leaf_size=1, n_neighbors=2, p=1..........................
[CV 4/10; 1/180] END leaf_size=1, n_neighbors=2, p=1;, score=1.000 total time=   0.2s
[CV 5/10; 1/180] START leaf_size=1, n_neighbors=2, p=1..........................
[CV 5/10; 1/180] END leaf_size=1, n_neighbors=2, p=1;, score=1.000 total time=   0.2s
[CV 6/10; 1/180] START leaf_size=1, n_neighbors=2, p=1..........................
[CV 6/10; 1/180] EN