In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the tab-separated fruit measurements file into a DataFrame.
fruits = pd.read_csv('fruit_data_with_colors.txt', sep='\t')

In [2]:
# Preview the first five rows to check that the columns were parsed as expected.
fruits.head(n=5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [3]:
# Map each numeric fruit label to its readable name so predictions can be
# translated back to fruit names.
# De-duplicate (label, name) *row pairs* rather than taking unique() of each
# column separately: independent unique() calls are matched only by order of
# first appearance, and zip() would silently truncate if the counts differed.
label_name_pairs = fruits[['fruit_label', 'fruit_name']].drop_duplicates()
lookup_fruit_name = dict(zip(label_name_pairs['fruit_label'],
                             label_name_pairs['fruit_name']))
lookup_fruit_name

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

## Create train-test split

We will usually put the feature columns, without the labels, into a DataFrame `X`
and the column of labels into a Series `y`.

In [46]:
# Features are the three physical measurements; the target is the numeric label.
feature_columns = ['mass', 'width', 'height']
X = fruits[feature_columns]
y = fruits['fruit_label']

# A fixed random_state keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [47]:
# create pair plot: scatter matrix of the training features, colored by label
from matplotlib import cm  # retained in case other cells still reference `cm`
# plt.get_cmap is available across matplotlib versions; cm.get_cmap is
# deprecated and removed in recent releases.
cmap = plt.get_cmap('gnuplot')
# The marker must be the letter 'o' (circle); the digit '0' is not a valid
# marker string.
scatter = pd.plotting.scatter_matrix(X_train, c=y_train, marker='o', s=40,
                                     hist_kwds={'bins': 15}, figsize=(8, 8),
                                     cmap=cmap)

<IPython.core.display.Javascript object>

In [29]:
# 3-D scatter plot of the training samples, colored by fruit label
from mpl_toolkits.mplot3d import Axes3D  # import needed by older matplotlib to enable projection='3d'
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    X_train['width'], X_train['height'], X_train['mass'],
    c=y_train, marker='o', s=100,
)
# Label each axis with the feature it shows.
ax.set(xlabel='width', ylabel='height', zlabel='mass')
plt.show()

<IPython.core.display.Javascript object>

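k-NN is an instance-based (also called memory-based) supervised learning method: it stores the labeled training examples and classifies a new point from the labels of its nearest stored neighbors.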

## K-nn

### Create classifier object

In [48]:
from sklearn.neighbors import KNeighborsClassifier

# Number of nearest training samples consulted for each prediction.
k = 5
knn = KNeighborsClassifier(n_neighbors=k)

### Train the classifier (fit the estimator) using the training data

In [49]:
# `knn` is the classifier (estimator) object. Fitting it on the training
# features and labels is what turns it into a trained model; fit() returns the
# estimator itself, so its parameters are displayed as the cell output.
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

### Estimate the accuracy of the classifier on future data, using the test data

In [50]:
# With the classifier trained on the training split, measure its accuracy on
# the held-out test split using the score method.
knn.score(X=X_test, y=y_test)

0.53333333333333333

### Use the trained k-NN classifier model to classify new, previously unseen objects

In [56]:
# Classify a new fruit. Feature order must match the training columns:
# mass, width, height (here mass=20, width=4.3, height=5.5).
# Passing a DataFrame with named columns makes that order explicit instead of
# relying on the position of values in a bare list.
new_fruit = pd.DataFrame([[20, 4.3, 5.5]], columns=['mass', 'width', 'height'])
fruit_prediction = knn.predict(new_fruit)
lookup_fruit_name[fruit_prediction[0]]

'mandarin'

In [57]:
# Classify a second new fruit. Feature order must match the training columns:
# mass, width, height (here mass=100, width=6.3, height=8.5).
# Named columns keep the sample aligned with the features the model was fit on.
new_fruit = pd.DataFrame([[100, 6.3, 8.5]], columns=['mass', 'width', 'height'])
fruit_prediction = knn.predict(new_fruit)
lookup_fruit_name[fruit_prediction[0]]

'lemon'

### Plot the decision boundaries of the k-NN classifier

In [60]:
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'graphviz'", so this import apparently
# requires the `graphviz` package to be installed — confirm and install it
# before running.
from adspy_shared_utilities import plot_fruit_knn

# Arguments appear to be the training features, training labels, the number of
# neighbors (5) and the weighting scheme ('uniform') — TODO confirm against the
# helper's signature.
plot_fruit_knn(X_train, y_train, 5, 'uniform')

ModuleNotFoundError: No module named 'graphviz'