## KNN on Iris Dataset

In [1]:
import pandas as pd
import plotly.express as px
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the Iris dataset object; the cells below read its .data, .target and
# .target_names attributes.
dfload = load_iris()

# Build the feature frame with short column names in one step (previously the
# columns were set from feature_names and then immediately overwritten).
df = pd.DataFrame(dfload.data,
                  columns=['sep_len',
                           'sep_wid',
                           'pet_len',
                           'pet_wid'])

# Integer class code for each row
df['target'] = dfload.target

# Map every integer class code to its name. The codes are derived from the
# position of each name in target_names rather than a hard-coded [0, 1, 2],
# so the mapping cannot drift out of sync with the dataset.
targetmap = dict(enumerate(dfload.target_names.tolist()))

# Human-readable class name for each row (used to colour plots)
df['targetn'] = df['target'].map(targetmap)

In [3]:
# Scatter petal length against petal width, one colour per flower name.
# Columns are referenced by name so Plotly Express looks them up in `df`;
# `labels` swaps the terse column names for readable axis/legend titles.
fig = px.scatter(df,
                 x='pet_len',
                 y='pet_wid',
                 color='targetn',
                 labels={'pet_len': 'Petal length',
                         'pet_wid': 'Petal width',
                         'targetn': 'Species'},
                 title='Iris petal length vs. petal width by species',
                 template='seaborn')
fig.show()

In [5]:
# Labels are the integer class codes; features are every remaining column
# once both label columns are removed.
y = df['target']
X = df.drop(columns=['target', 'targetn'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split (and therefore the score below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Build a 10-nearest-neighbours classifier and fit it on the training split
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# Score the fitted model on the held-out split (shown as the cell output)
knn.score(X_test, y_test)

0.9666666666666667

In [6]:
# Create confusion matrix.
# `confusion_matrix` is imported in the notebook's import cell rather than
# here, so all dependencies are visible in one place.

# Predicted class for every held-out sample
y_pred = knn.predict(X_test)

# Tabulate true classes (y_test) against predicted classes (y_pred)
conf_mtx = confusion_matrix(y_test, y_pred)

In [7]:
# Show the raw counts: rows follow y_test (true class), columns follow y_pred
conf_mtx

array([[11,  0,  0],
       [ 0, 12,  1],
       [ 0,  0,  6]])

In [8]:
# Class names in code order (position i names integer class i)
dfload.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [9]:
# Tick labels come from the dataset's own class names (capitalised for
# display) instead of two hand-typed copies, so they always match the class
# order used by the confusion matrix.
class_labels = [name.capitalize() for name in dfload.target_names]

# Heatmap of the confusion matrix. The colour scale is a sample count per
# (true, predicted) pair -- off-diagonal cells are misclassifications, so the
# colour bar is labelled as a count rather than "# Correct".
fig = px.imshow(conf_mtx,
                labels=dict(x='Prediction',
                            y='Test True',
                            color='# Samples'),
                x=class_labels,
                y=class_labels,
                template='ggplot2')

# Put the predicted-class axis on top
fig.update_xaxes(side="top")

# Fixed square canvas so the cells render as squares
fig.update_layout(autosize=False,
                  width=600,
                  height=600)

fig.show()

In [10]:
# Per-class precision, recall and F1 for the test-split predictions.
# `classification_report` is imported in the notebook's import cell.
# The report is preformatted text, so it is printed rather than left as a
# bare expression (which would show the escaped string repr).
print(classification_report(y_test,
                            y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      0.92      0.96        13
           2       0.86      1.00      0.92         6

    accuracy                           0.97        30
   macro avg       0.95      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30

