In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn import metrics
import numpy as np

In [2]:
# Load dataset from sklearn.
iris = datasets.load_iris()

In [3]:
# Data in x and y below represented as numpy arrays
x = iris.data
y = iris.target

In [4]:
# x is the data/features of the dataset - all sepal and petal measurements in cm.
# Represented here as a numpy array of arrays.
x

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [5]:
# y is the labels - iris species.
# Represented here as a numpy array: 0 = setosa; 1 = versicolor; 2 = virginica.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
# The dataset is split to provide a portion to train the model and a portion to test the model.
# 80% training and 20% testing (determined to be marginally more accurate than a 70/30 split - see section 5.6).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [7]:
# K-Nearest Neighbour model.
# Set it to examine the 13 closest neighbours (determined to be most accurate in testing - see section 5.6)
knn = knc(n_neighbors = 13)

In [8]:
# Pass the x and y training values through the KNN fit function to train the model.
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=13, p=2,
           weights='uniform')

In [9]:
# Testing the model with the test data.
test_prediction = knn.predict(x_test)
test_prediction # predictions listed as array representation                

array([0, 1, 2, 2, 1, 0, 0, 2, 0, 2, 2, 2, 0, 1, 2, 0, 2, 0, 0, 1, 1, 1,
       2, 0, 1, 2, 2, 0, 2, 1])

In [10]:
# Check the accuracy of the above test predictions.
metrics.accuracy_score(y_test, test_prediction)

1.0

In [11]:
# Create new data array to further test the model. 
# numpy array of 4 measurements for 6 hypothetical iris plants.
# First 3 made up of means and second 3 made up of maximum values.
new_data = np.array([[5, 3.4, 1.5, 0.2], [5.9, 2.8, 4.2, 1.3], [6.6, 3, 5.5, 2], [5.8, 4.4, 1.9, 0.6], [7, 3.4, 5.1, 1.8], [7.9, 3.8, 6.9, 2.5]]) 

In [12]:
# Apply the new data to the prediction function.
new_data_prediction = knn.predict(new_data) 
new_data_prediction # returns array representation 

array([0, 1, 2, 0, 2, 2])

In [13]:
# To see species names in the prediction instead of the array.
new_data_prediction = iris.target_names[knn.predict(new_data)]
new_data_prediction

array(['setosa', 'versicolor', 'virginica', 'setosa', 'virginica',
       'virginica'], dtype='<U10')