In [8]:
# EDA and data handling
import numpy as np
import pandas as pd
import pickle

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

## Get the data

In [11]:
# Load the pickled iris table and preview its first rows.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
iris_path = 'resources/iris.pkl'
df = pd.read_pickle(iris_path)
df.head()

Unnamed: 0,sl,sw,pl,pw,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [12]:
# how many do we have of each species?


In [13]:
# describe the data - no need for standardization!


## a simple KNN model (with only 2 predictors)
While in practice a 2-predictor model is typically too simple (i.e., high bias), for the purposes of building a visualization it's simpler to map a scatterplot when there are only two dimensions to deal with.

In [12]:
# Predictors are the two columns 'sl' and 'pl'; the target is the 'species' column.
predictor_cols = ['sl', 'pl']
X = df[predictor_cols]
y = df['species']

In [13]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=47,
)

In [14]:
# Build (but do not yet fit) a 5-nearest-neighbour classifier.
# weights='distance' weights each neighbour's vote by the inverse of its distance;
# distances are measured with the euclidean metric.
mymodel = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='euclidean',
)

In [15]:
# Fit the classifier on the training split only; the test split stays unseen.
# The fitted estimator is the cell's last expression, so its repr is displayed.
mymodel.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

In [16]:
# Predict a species label for every row of the held-out test split.
y_preds = mymodel.predict(X=X_test)

In [17]:
# Accuracy: the fraction of test rows whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_preds)

0.9333333333333333

In [18]:
# Confusion matrix for the test split: rows are the true classes, columns the predicted ones.
metrics.confusion_matrix(y_true=y_test, y_pred=y_preds)

array([[18,  0,  0],
       [ 0,  9,  0],
       [ 0,  3, 15]])

## Predict for a new observation

In [19]:
# Make up one new data point. Values follow the predictor column order used for X: ['sl', 'pl'].
new_sl, new_pl = 4.9, 2.7
# Nested list: a single row with two features, the 2-D shape the model's predict expects.
new_obs = [[new_sl, new_pl]]

In [20]:
# Ask the fitted model which species label it assigns to the made-up observation.
mymodel.predict(X=new_obs)

array([1])

In [21]:
# Find the 5 training points nearest to the new observation.
# Returns a (distances, indices) pair. The indices are row positions within the
# training data passed to fit (X_train), not index labels of the original df.
mymodel.kneighbors(X=new_obs)

(array([[0.36055513, 0.6       , 0.60827625, 0.80622577, 1.0198039 ]]),
 array([[37, 70, 18,  6, 98]]))

In [22]:
# Create multiple KNN models and pickle for use in the plotly dash app.
# One classifier is fitted per neighbourhood size k and written to
# resources/model_k<k>.pkl.
# NOTE: `mymodel` and `y_preds` are rebound on every pass, so after this cell
# they refer to the k=25 model and its predictions, not the k=5 model above.
for k in [5, 10, 15, 20, 25]:
    mymodel = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='euclidean')
    mymodel.fit(X_train, y_train)
    # Not used inside this cell; kept so the notebook-level variable ends up as before.
    y_preds = mymodel.predict(X_test)
    # The context manager closes the file even if pickling raises, so no handle leaks.
    with open(f'resources/model_k{k}.pkl', 'wb') as file:
        pickle.dump(mymodel, file)