# Simple classification of the Iris dataset using a K-Nearest Neighbors classifier

## Import packages

In [1]:
from collections import Counter

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Load data

In [2]:
# Read the Iris table from the CSV file in the working directory and display it.
# The rendered table shows four measurement columns (sl, sw, pl, pw) followed by
# the class name in `iris`.
data = pd.read_csv(filepath_or_buffer='iris.data')
data

Unnamed: 0,sl,sw,pl,pw,iris
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


## Create the model

In [3]:
# Fit a k-nearest-neighbours classifier on the complete data set and classify
# one hand-written flower as a quick sanity check.

# Select the columns by position *before* converting to NumPy: `data.values` on
# the whole frame yields a single object-dtype array because the label column
# holds strings, whereas this keeps the feature matrix as real floats.
samples = data.iloc[:, 0:4].to_numpy(dtype=float)  # the four measurement columns
labels = data.iloc[:, 4].to_numpy()                # the class name of each row

model = KNeighborsClassifier()  # library defaults
model.fit(samples, labels)

# The four values are given in the same column order as `samples`.
predicted = model.predict([(5.9, 3.0, 5.1, 1.8)])
print("predicted", predicted)

predicted ['Iris-virginica']


## Train-test split (with and without stratification) and model fit

In [4]:
# Hold out part of the data for testing, report how the classes are distributed
# in each part, and fit a fresh classifier on the training part only.
print("Labels distribution:", Counter(labels))

# Knobs for experimenting with the split (they replace the commented-out
# arguments that used to sit inside the call). The defaults below reproduce the
# plain, non-stratified split.
USE_STRATIFICATION = False  # True -> pass the labels as `stratify`
TRAIN_SIZE = None           # e.g. 0.5; None leaves the library's default split sizes

# `train_test_split` is imported explicitly in the import cell, so this call does
# not depend on `sklearn.model_selection` having been loaded as a side effect of
# some other import. `random_state` pins the shuffle for repeatable runs.
trainSamples, testSamples, trainLabels, testLabels = train_test_split(
    samples,
    labels,
    random_state=30,
    stratify=labels if USE_STRATIFICATION else None,
    train_size=TRAIN_SIZE,
)

print("Train labels distribution:", Counter(trainLabels))
print("Test labels distribution:", Counter(testLabels))

# Rebinds `model`: from here on it is the classifier trained on the training part.
model = KNeighborsClassifier()
model.fit(trainSamples, trainLabels)

Labels distribution: Counter({'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50})
Train labels distribution: Counter({'Iris-versicolor': 39, 'Iris-setosa': 38, 'Iris-virginica': 35})
Test labels distribution: Counter({'Iris-virginica': 15, 'Iris-setosa': 12, 'Iris-versicolor': 11})


KNeighborsClassifier()

## Check the error on the test set

In [5]:
# Classify every held-out sample, print "true -> predicted" with a verdict for
# each one, and finish with the number and fraction of correct predictions.
predictedLabels = model.predict(testSamples)
total = len(testSamples)

correct = 0
for actual, guess in zip(testLabels, predictedLabels):
    if actual == guess:
        correct += 1
        verdict = 'OK'
    else:
        verdict = 'error!!!'
    # Label pair and verdict share one line, separated by single spaces.
    print(actual, "->", guess, verdict)

# The same summary twice: raw values first, then accuracy rounded to two decimals.
print("Correct:", correct, " of ", total, " accuracy=", correct / total)
print("Correct: {} of {} accuracy = {:.2f}".format(correct, total, correct / total))

Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-virginica -> Iris-virginica OK
Iris-versicolor -> Iris-versicolor OK
Iris-versicolor -> Iris-versicolor OK
Iris-virginica -> Iris-virginica OK
Iris-virginica -> Iris-virginica OK
Iris-versicolor -> Iris-versicolor OK
Iris-virginica -> Iris-virginica OK
Iris-setosa -> Iris-setosa OK
Iris-virginica -> Iris-virginica OK
Iris-versicolor -> Iris-versicolor OK
Iris-versicolor -> Iris-versicolor OK
Iris-setosa -> Iris-setosa OK
Iris-versicolor -> Iris-versicolor OK
Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-versicolor -> Iris-virginica error!!!
Iris-virginica -> Iris-versicolor error!!!
Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-virginica -> Iris-virginica OK
Iris-virginica -> Iris-virginica OK
Iris-versicolor -> Iris-virginica error!!!
Iris-virginica -> Iris-virginica OK
Iris-setosa -> Iris-setos

## Cross validation

In [6]:
# 10-fold cross-validation of the classifier on the complete data set; the
# displayed dict holds the per-fold fit times, scoring times and test scores.
sklearn.model_selection.cross_validate(estimator=model, X=samples, y=labels, cv=10)

{'fit_time': array([0.0009985 , 0.00101089, 0.00100613, 0.        , 0.00099635,
        0.00099421, 0.00099277, 0.        , 0.        , 0.        ]),
 'score_time': array([0.00498962, 0.00397539, 0.00297356, 0.00296283, 0.00203133,
        0.00100231, 0.00099587, 0.00200629, 0.00198555, 0.00099611]),
 'test_score': array([1.        , 0.93333333, 1.        , 1.        , 0.86666667,
        0.93333333, 0.93333333, 1.        , 1.        , 1.        ])}

## Calculating evaluation measures

In [7]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    cohen_kappa_score,
    confusion_matrix,
)

# Score the classifier trained on the training split against the held-out samples.
modelResults = model.predict(testSamples)

# In the recorded output the row sums of the matrix equal the per-class support,
# i.e. each row corresponds to one true class.
print(confusion_matrix(testLabels, modelResults))
print(classification_report(testLabels, modelResults))

accuracy = accuracy_score(testLabels, modelResults)
print("Accuracy: {:.2f}".format(accuracy))

kappa = cohen_kappa_score(testLabels, modelResults)
print("Cohen's Kappa: {:.2f}".format(kappa))

[[12  0  0]
 [ 0  9  2]
 [ 0  1 14]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       0.90      0.82      0.86        11
 Iris-virginica       0.88      0.93      0.90        15

       accuracy                           0.92        38
      macro avg       0.92      0.92      0.92        38
   weighted avg       0.92      0.92      0.92        38

Accuracy: 0.92
Cohen's Kappa: 0.88


## Adding custom scorers to cross-validation

In [8]:
from sklearn.metrics import make_scorer, precision_score, recall_score

# Wrap the metric functions as scorers; the dict keys become the suffixes of the
# result entries (test_precision / test_recall in the displayed output).
# NOTE(review): with average='micro' the recorded output shows identical values
# for both scorers in every fold -- consider average='macro' if the two measures
# are meant to differ.
p_scorer = make_scorer(precision_score, average='micro')
r_scorer = make_scorer(recall_score, average='micro')
my_scorer = dict(precision=p_scorer, recall=r_scorer)

# 5-fold cross-validation on the complete data set, scored with both scorers.
sklearn.model_selection.cross_validate(
    estimator=model,
    X=samples,
    y=labels,
    cv=5,
    scoring=my_scorer,
)

{'fit_time': array([0.00099659, 0.00100303, 0.00099587, 0.        , 0.00099754]),
 'score_time': array([0.00398898, 0.00498199, 0.00199628, 0.00302553, 0.00295591]),
 'test_precision': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'test_recall': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])}