## Inspecting and setting up data

In [1]:
# EDA and data handling
import numpy as np
import pandas as pd
import pickle

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [3]:
# check folder: list the working directory to confirm `resources/` is present before loading data
# (a bare `ls` relies on IPython automagic; it is not plain Python)
ls

app.py   iris-classification-starter.ipynb  README.md         resources/
assets/  Procfile                           requirements.txt  solutions/


## Get the data

In [4]:
# Load the iris DataFrame from the pickle stored under resources/.
# Note: the file extension is 'pkl' and ends in a lowercase 'L', which some fonts display ambiguously.
df = pd.read_pickle('resources/iris.pkl')
# preview ten randomly selected rows
df.sample(n=10)

Unnamed: 0,sl,sw,pl,pw,species
99,5.7,2.8,4.1,1.3,1
52,6.9,3.1,4.9,1.5,1
24,4.8,3.4,1.9,0.2,0
94,5.6,2.7,4.2,1.3,1
107,7.3,2.9,6.3,1.8,2
95,5.7,3.0,4.2,1.2,1
148,6.2,3.4,5.4,2.3,2
32,5.2,4.1,1.5,0.1,0
22,4.6,3.6,1.0,0.2,0
55,5.7,2.8,4.5,1.3,1


In [6]:
# how many rows do we have for each species label? (a quick class-balance check)
df['species'].value_counts()

2    50
1    50
0    50
Name: species, dtype: int64

In [7]:
# Summary statistics for every column.
# The author judges standardization unnecessary here. KNN is distance-based, so that
# relies on the feature columns sharing a comparable scale — compare min/max in the output.
df.describe()

Unnamed: 0,sl,sw,pl,pw,species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## A simple KNN model (with only 2 predictors)
While in practice a 2-predictor model is typically too simple (i.e., high bias, prone to underfitting), for the purposes of building a visualization it's simpler to map a scatterplot when there are only two dimensions to deal with.

In [8]:
# establish the predictors and the target
# predictors: only the 'sl' and 'pl' columns (presumably sepal length and petal length — confirm)
X = df[['sl', 'pl']]
# target: the integer-coded 'species' column
y = df['species']

In [9]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Build the classifier: a local instance of sklearn's KNeighborsClassifier
# using 5 neighbours, the Euclidean metric and distance-based neighbour weighting.
mymodel = KNeighborsClassifier(
    metric='euclidean',
    weights='distance',
    n_neighbors=5,
)


In [11]:
# fit on the training dataset only; the test split is kept aside for evaluation
mymodel.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

In [12]:
# predict species labels for the held-out testing dataset
y_preds = mymodel.predict(X_test)

In [13]:
# Accuracy: the share of test rows whose predicted label equals the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_preds)

0.9777777777777777

In [14]:
# Confusion matrix for the test split: rows are the true labels, columns the predicted labels.
metrics.confusion_matrix(y_true=y_test, y_pred=y_preds)

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  1, 12]])

## Predict for a new observation

In [15]:
# Create a fake new data point.
# It is a one-row DataFrame with the same column names the model was fitted on ('sl', 'pl'),
# so predict()/kneighbors() receive named features just like X_train. Recent scikit-learn
# versions warn when a model fitted with feature names is handed an unnamed list or array.
new_obs = pd.DataFrame([[4.9, 2.7]], columns=['sl', 'pl'])

In [16]:
# predict the species label for our new observation
mymodel.predict(new_obs)

array([1])

In [17]:
# Which 5 training points are nearest to that new observation?
# kneighbors returns a pair: (distances, indices). The indices are positions within
# the data the model was fitted on (X_train), not labels from df's index.
mymodel.kneighbors(new_obs)

(array([[0.36055513, 0.6       , 0.60827625, 0.80622577, 0.80622577]]),
 array([[45, 84, 34, 14, 16]]))

In [18]:
# Create multiple KNN models and pickle them for use in the plotly dash app.
# One model is fitted per neighbourhood size k and written to resources/model_k{k}.pkl.
# NOTE: `mymodel` is rebound on every pass, so after this cell it refers to the k=25 model.
for k in [5,10,15,20,25]:
    mymodel = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='euclidean')
    mymodel.fit(X_train, y_train)
    # the context manager closes the file even if pickling raises
    with open(f'resources/model_k{k}.pkl', 'wb') as file:
        # dump my model into the file specified
        pickle.dump(mymodel, file)