In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mglearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris # load_iris is a classical dataset in machine learning and statistics

# Load the iris dataset. The returned object is dict-like, so start by listing
# the fields it exposes.
iris_dataset = load_iris()
print("Keys of iris dataset: \n{}".format(iris_dataset.keys()))

# 'DESCR' is a free-text description of the dataset; show only its first
# 193 characters to keep the output short.
print(iris_dataset['DESCR'][:193] + "\n...")

# Class names and the names of the measured features.
print("Target names: {}".format(iris_dataset['target_names']))
print("Feature names: {}".format(iris_dataset['feature_names']))

# Container type and shape of the feature matrix.
print("data: {}".format(type(iris_dataset['data'])))
print("Shape of data: {}".format(iris_dataset["data"].shape))

# [:5] slices along the first axis, i.e. it selects the first five rows
# (one row per sample), so the label says "rows", not "columns".
print("First five rows of data:\n{}".format(iris_dataset['data'][:5]))

# This prints the label values themselves (not their type).
# 0 means setosa, 1 means versicolor, and 2 means virginica
print("Target:\n{}".format(iris_dataset['target']))

Keys of iris dataset: 
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive 
...
Target names: ['setosa' 'versicolor' 'virginica']
Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
data: <class 'numpy.ndarray'>
Shape of data: (150, 4)
First five columns of data:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
Type of target: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [2]:
# Hold out part of the data for evaluation: X_* are the feature rows, y_* the
# matching labels. random_state=0 fixes the seed passed to the splitter.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    random_state=0,
)

# Report the shape of each piece of the split, one line per array.
print("\n".join(
    "{} shape: {}".format(label, part.shape)
    for label, part in (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
))

X_train shape: (112, 4)
y_train shape: (112,)
X_test shape: (38, 4)
y_test shape: (38,)


In [5]:

# create a dataframe from the data in X_train
# label the columns using the strings in iris_dataset.feature_names
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)

# Pair plot of every feature against every other one, colored by class label
# (y_train). The function lives under `pd.plotting`; the top-level
# `pd.scatter_matrix` alias is not available in current pandas releases.
# Binding the returned axes to a name keeps the notebook from echoing the
# array repr underneath the figure.
scatter_axes = pd.plotting.scatter_matrix(
    iris_dataframe,
    c=y_train,
    figsize=(15, 15),
    marker='o',
    hist_kwds={'bins': 20},
    s=60,
    alpha=.8,
    cmap=mglearn.cm3,
)

# create a model
# number of neighbours = 1, because for now we're using just one neighbour.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

# build a new sample to classify later: a single row holding four feature
# values (no prediction is made in this cell)
X_new = np.array([[5, 2.9, 1, 0.2]])
print("Shape of X_new: {}".format(X_new.shape))


Shape of X_new: (1, 4)


In [8]:

# Fit a 1-nearest-neighbour classifier on the training split. fit() hands back
# the estimator itself, so construction and training chain into one statement.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# A single new sample, laid out as one row of four feature values.
X_new = np.array([5, 2.9, 1, 0.2]).reshape(1, -1)
print("Shape of X_new: {}".format(X_new.shape))

# Classify the sample, then use the predicted class index to look up the
# corresponding entry in target_names.
prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))
print("Predicted target name: {}".format(iris_dataset.target_names[prediction]))


Shape of X_new: (1, 4)
Prediction: [0]
Predicted target name: ['setosa']


In [9]:

# Evaluate the model: predict a label for every row of the held-out test set.
y_pred = knn.predict(X_test)
print("Test set predictions:\n {}".format(y_pred))

# Accuracy is the fraction of predictions equal to the true label; * 100 shows
# it as a percentage. Note the dot in the format spec: ".2f" means two decimal
# places, whereas "2f" would mean "minimum width 2" with the default six
# decimals.
print("Test score: {:.2f}".format(np.mean(y_pred == y_test) * 100))

Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
Test score: 97.368421
