## Load and inspect the dataset

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
# Ask scikit-learn to render estimators as diagrams (instead of plain text)
# when they are displayed as a cell result.
from sklearn import set_config
set_config(display="diagram")

In [None]:
# Import the loader from scikit-learn and load the iris dataset.
# Install with: conda install scikit-learn
# Optional:     conda install scikit-learn-intelex
from sklearn.datasets import load_iris
iris_dataset = load_iris()

In [None]:
# Show which container type load_iris() returned.
type(iris_dataset)

In [None]:
# List the entries available in the loaded dataset container.
print(f"Keys of iris_dataset: \n{iris_dataset.keys()}")

In [None]:
# Peek at the raw contents of the dataset: a sample of the measurements,
# the encoded targets, and the metadata needed to interpret them.
# Slicing 'data' with [:5] selects the first five ROWS (samples); the columns
# are the features listed under 'feature_names'.
print("First five rows of data:\n{}".format(iris_dataset['data'][:5]))
print("\n")
# Integer class label for every sample.
print("Targets:\n{}".format(iris_dataset['target'][:]))
print("\n")
# Human-readable name for each integer class label.
print("Target names:\n{}".format(iris_dataset['target_names']))
print("\n")
# Name of each column in 'data'.
print("Feature names:\n{}".format(iris_dataset['feature_names']))
print("\n")
# File the dataset was loaded from, as recorded under the 'filename' key.
print("Dataset location:\n{}".format(iris_dataset['filename']))

In [None]:
# Print the descriptive text stored under the dataset's 'DESCR' key.
print(iris_dataset['DESCR'])

## Prepare data for training the model

In [None]:
# Hold out part of the data so the model can later be scored on samples
# it did not see during training.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    test_size=0.25,   # proportion of the dataset placed in the test split
    random_state=42,  # fixed seed so the split is reproducible across runs
)

In [None]:
# Confirm the container type and the size of each of the four splits.
print("X_train type: ", type(X_train))
for split_name, split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split.shape}")

In [None]:
# Plotting through pandas needs matplotlib (conda install matplotlib).
import pandas as pd

# Wrap the training features in a DataFrame so every column carries its feature name.
iris_dataframe = pd.DataFrame(data=X_train, columns=iris_dataset.feature_names)

print(iris_dataframe)
print(y_train)

# Options for the pairwise scatter plots of the features.
scatter_options = {
    "c": y_train,               # colour each point by its class label
    "figsize": (15, 15),
    "marker": 'o',
    "hist_kwds": {'bins': 20},  # forwarded to the histograms on the diagonal
    "s": 60,                    # marker size in points**2
    "alpha": .8,
}
grr = pd.plotting.scatter_matrix(iris_dataframe, **scatter_options)


## Build the model

In [None]:
# Import one of many classification algorithms.
from sklearn.neighbors import KNeighborsClassifier

# Number of nearest training samples consulted for each prediction.
n_neighbors = 1

# Estimator object holding the algorithm; the model itself is built when fit() is called.
# Pass the variable instead of a literal so that changing n_neighbors above
# really changes the classifier (and stays in sync with anything that reports it).
knn = KNeighborsClassifier(n_neighbors=n_neighbors)

In [None]:
# Fit the classifier on the last two feature columns only.
# NumPy slice [:, 2:] keeps every row and the columns from index 2 onward.
knn.fit(X_train[:,2:], y_train)

![Choosing the right estimator](ml_map.svg)

[https://scikit-learn.org/stable/machine_learning_map.html](https://scikit-learn.org/stable/machine_learning_map.html)

## Visualize our model

In [None]:
import numpy as np

# np.arange(start, stop, step): evenly spaced values, stop value excluded.
print(np.arange(1, 2, 0.2))


# ravel(): flatten a multi-dimensional array into a single dimension.
a = np.array([[1, 2], [3, 4]])
b = a.ravel()
print(b)

# np.c_[...]: concatenate the given pieces along the second axis (column stacking).
first_col, second_col = np.array([1, 2, 3]), np.array([4, 5, 6])
stack_a = np.c_[first_col, second_col]
stack_b = np.c_[np.array([[1, 2, 3]]), 0, 0, np.array([[4, 5, 6]])]
for stacked in (stack_a, stack_b):
    print(stacked)

In [None]:
# To visualize the trained model we colour a dense grid of points by predicted
# class (the decision regions) and later draw the training data on top.
# This cell builds that grid over the two features the model was fitted on.
import numpy as np

# Pad the observed range by 1 on each side so no point sits on the plot edge.
petal_length, petal_width = X_train[:, 2], X_train[:, 3]
x_min, x_max = petal_length.min() - 1, petal_length.max() + 1
y_min, y_max = petal_width.min() - 1, petal_width.max() + 1

print("Our boundaries")
print(x_min,x_max)
print(y_min,y_max)

h = 0.02  # spacing between neighbouring grid values
x_axis = np.arange(x_min, x_max, h)  # evenly spaced values within the interval
y_axis = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_axis, y_axis)

print("\nResult of meshgrid based on our boundaries")
print(xx)
print(yy)

In [None]:
# Flatten the grid into a two-column array of (x, y) points, classify every
# point, then fold the labels back into the shape of the grid.
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = knn.predict(grid_points).reshape(xx.shape)

print(type(Z))
print(Z.shape)
print(Z)

In [None]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

# One light colour per class for the decision regions and a matching bold
# colour per class for the training points drawn on top of them.
cmap_bold = ListedColormap(['darkorange', 'blue', 'darkblue'])
cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])

# Plot setup.
fig = plt.figure()
ax1 = fig.add_subplot(111) # (nrows, ncols, index)
# Colour the grid by predicted class to show the decision regions.
ax1.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='gouraud')

# Plot our observations, one scatter call per class so each gets a legend entry.
# The position of a name in target_names is the integer label used in y_train.
for index, target in enumerate(iris_dataset.target_names):
    in_class = y_train == index
    ax1.scatter(
        X_train[in_class, 2],
        X_train[in_class, 3],
        # Give the colour explicitly: a `cmap` passed without `c` is ignored by
        # scatter, which left the points in the default colour cycle.
        color=cmap_bold.colors[index],
        edgecolor='k',
        s=20,
        label=target
    )

# Some housekeeping on the plot.
ax1.set_xlim(x_min,x_max)
ax1.set_ylim(y_min,y_max)
ax1.legend()
ax1.set_xlabel("petal length (cm)")
ax1.set_ylabel("petal width (cm)")
# Read the settings from the estimator itself so the title always describes
# the model that produced the regions.
ax1.set_title("3-Class classification (k = %i, weights = '%s')"
              % (knn.n_neighbors, knn.weights))
plt.show()

## Use the model

In [None]:
# Classify a single new flower.
# Column order: sepal length, sepal width, petal length, petal width (all in cm).
new_data = np.array([[4, 3.5, 1.2, 0.5]])
petal_features = new_data[:, 2:]  # the model was fitted on the last two columns only
prediction = knn.predict(petal_features)
print(f"Prediction: {prediction}")
print(f"Predicted target name: {iris_dataset['target_names'][prediction]}")

## Validate the model

In [None]:
# Run the model on the held-out test data prepared earlier.
# Only the last two columns are passed, matching what the model was fitted on.
test_features = X_test[:, 2:]
y_pred = knn.predict(test_features)
print(f"Test set predictions:\n {y_pred}")

In [None]:
# Score the model on the held-out test data (last two feature columns only).
test_score = knn.score(X_test[:, 2:], y_test)
print(f"Test set score: {test_score:.2f}")

## Summary

k-Nearest Neighbors is a simple classification algorithm that predicts the class of a new data point from the closest data points in the training dataset.

It is not necessary to use all the features in our training dataset. We can try different combinations of features to achieve better results.