# Simple classification of the Iris dataset using a K-Nearest Neighbors classifier

## Import packages

In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

## Load data

In [2]:
# Read the Iris measurements from a file located relative to the notebook's
# working directory. The header row supplies the column names
# (sl, sw, pl, pw, iris).
data = pd.read_csv(
    "iris.data",
)

# A bare trailing expression makes the notebook render the frame as a table.
data

Unnamed: 0,sl,sw,pl,pw,iris
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


## Create the model

In [3]:
# Separate the frame into a feature matrix and a label vector.
# The frame mixes numeric columns with a string column, so `data.values`
# would yield a single object-dtype array; selecting by position and casting
# the four measurement columns to float keeps the features truly numeric.
samples = data.iloc[:, 0:4].to_numpy(dtype=float)
labels = data.iloc[:, 4].to_numpy()

# Fit a default k-NN classifier on every available sample. This is only a
# quick smoke test: no data is held out at this point.
model = KNeighborsClassifier()
model.fit(samples, labels)

# Classify one hand-written measurement given in column order (sl, sw, pl, pw).
predicted = model.predict([(5.9, 3.0, 5.1, 1.8)])
print("predicted",predicted)

predicted ['Iris-virginica']


## Train-test split (with and without stratification) and model fit

In [6]:
from collections import Counter

# Import the splitter explicitly: a bare `import sklearn` is not guaranteed on
# every scikit-learn version to expose the `sklearn.model_selection` submodule,
# so attribute access could otherwise depend on some other import having
# loaded it as a side effect.
from sklearn.model_selection import train_test_split

print("Labels distribution:",Counter(labels))

# Stratified hold-out split. `random_state` pins the shuffle so the class
# counts printed below are reproducible.
# To experiment: drop `stratify=labels` for a plain random split, or add
# e.g. `train_size=0.5` to change the train/test ratio.
trainSamples, testSamples, trainLabels, testLabels = train_test_split(
    samples,
    labels,
    random_state=30,
    stratify=labels,
)

print("Train labels distribution:", Counter(trainLabels))
print("Test labels distribution:",Counter(testLabels))

# Rebind `model` to a fresh classifier trained on the training portion only.
# The fitted estimator returned by `fit` is the cell's displayed output.
model = KNeighborsClassifier()
model.fit(trainSamples, trainLabels)

Labels distribution: Counter({'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50})
Train labels distribution: Counter({'Iris-setosa': 38, 'Iris-virginica': 37, 'Iris-versicolor': 37})
Test labels distribution: Counter({'Iris-versicolor': 13, 'Iris-virginica': 13, 'Iris-setosa': 12})


KNeighborsClassifier()

## Check the error on the test set

In [7]:
# Walk the held-out set, echo every "true -> predicted" pair and tally the hits.
predictedLabels = model.predict(testSamples)
correct = 0
for expected, guessed in zip(testLabels, predictedLabels):
    print(expected, "->", guessed, end=' ')
    if expected != guessed:
        print('error!!!')
        continue
    correct += 1
    print('OK')

# The summary is printed twice on purpose: first with the raw float,
# then formatted with the accuracy rounded to two decimals.
total = len(testSamples)
print("Correct:",correct," of ",total," accuracy=",correct/total)
print("Correct: {} of {} accuracy = {:.2f}".format(correct,total,correct/total))

Iris-versicolor -> Iris-versicolor OK
Iris-setosa -> Iris-setosa OK
Iris-virginica -> Iris-virginica OK
Iris-versicolor -> Iris-versicolor OK
Iris-virginica -> Iris-virginica OK
Iris-setosa -> Iris-setosa OK
Iris-versicolor -> Iris-virginica error!!!
Iris-setosa -> Iris-setosa OK
Iris-virginica -> Iris-virginica OK
Iris-virginica -> Iris-virginica OK
Iris-virginica -> Iris-virginica OK
Iris-setosa -> Iris-setosa OK
Iris-versicolor -> Iris-virginica error!!!
Iris-setosa -> Iris-setosa OK
Iris-versicolor -> Iris-versicolor OK
Iris-virginica -> Iris-virginica OK
Iris-virginica -> Iris-virginica OK
Iris-versicolor -> Iris-versicolor OK
Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-versicolor -> Iris-versicolor OK
Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-virginica -> Iris-virginica OK
Iris-versicolor -> Iris-versicolor OK
Iris-setosa -> Iris-setosa OK
Iris-setosa -> Iris-setosa OK
Iris-virginica -> Iris-virginica OK
Iris-versicolor -> Iris-versicol

## Cross validation

In [8]:
# 10-fold cross-validation over the complete dataset (not just the training
# split). The returned dict of per-fold arrays (fit_time, score_time,
# test_score) is shown as the cell output.
sklearn.model_selection.cross_validate(
    estimator=model,
    X=samples,
    y=labels,
    cv=10,
)

{'fit_time': array([0.00099826, 0.00099897, 0.0010283 , 0.00199771, 0.0019896 ,
        0.0010016 , 0.00203753, 0.00100064, 0.00201583, 0.00099993]),
 'score_time': array([0.00200081, 0.00199604, 0.00297332, 0.00300431, 0.00303578,
        0.00199771, 0.00196481, 0.00300336, 0.00498033, 0.00099707]),
 'test_score': array([1.        , 0.93333333, 1.        , 1.        , 0.86666667,
        0.93333333, 0.93333333, 1.        , 1.        , 1.        ])}

## Computing evaluation metrics

In [9]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    cohen_kappa_score,
    confusion_matrix,
)

# Predict the held-out samples once and reuse the result for every metric.
modelResults = model.predict(testSamples)

# Confusion matrix: rows are the true classes, columns the predicted ones.
print(confusion_matrix(testLabels, modelResults))

# Per-class precision / recall / F1 together with the averaged rows.
print(classification_report(testLabels, modelResults))

# Single-number summaries, both rounded to two decimals.
accuracy = accuracy_score(testLabels, modelResults)
kappa = cohen_kappa_score(testLabels, modelResults)
print("Accuracy: {:.2f}".format(accuracy))
print("Cohen's Kappa: {:.2f}".format(kappa))

[[12  0  0]
 [ 0 11  2]
 [ 0  0 13]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       1.00      0.85      0.92        13
 Iris-virginica       0.87      1.00      0.93        13

       accuracy                           0.95        38
      macro avg       0.96      0.95      0.95        38
   weighted avg       0.95      0.95      0.95        38

Accuracy: 0.95
Cohen's Kappa: 0.92


## Adding custom scorers to cross-validation

In [10]:
from sklearn.metrics import make_scorer, precision_score, recall_score

# Wrap the metric functions as scorers, fixing the averaging mode to 'micro'.
# NOTE(review): the recorded output shows identical precision and recall for
# every fold with this setting; another `average` value may be more
# informative - confirm what is intended.
p_scorer = make_scorer(precision_score, average='micro')
r_scorer = make_scorer(recall_score, average='micro')

# Each key becomes a `test_<key>` entry in the dict returned by cross_validate.
my_scorer = {
    'precision': p_scorer,
    'recall': r_scorer,
}

# 5-fold cross-validation on the full dataset with both custom scorers.
sklearn.model_selection.cross_validate(
    model,
    samples,
    labels,
    scoring=my_scorer,
    cv=5,
)

{'fit_time': array([0.0010035 , 0.0020082 , 0.00096321, 0.        , 0.00202179]),
 'score_time': array([0.00399566, 0.00634027, 0.00400567, 0.00498414, 0.00498056]),
 'test_precision': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'test_recall': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])}