In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one, so the cells below can display several results without print().
InteractiveShell.ast_node_interactivity = "all"

# K Nearest Neighbours

The k-nearest neighbors (kNN) algorithm classifies a data point by finding the K training points closest to it and assigning the class that is most common among them. It assumes that similar things exist in close proximity; in other words, similar points lie near each other in feature space. A fundamental weakness of kNN is that it does not handle categorical features natively, because the distance between two points is only defined for numeric values. More generally, kNN does not work well when features are on different scales, and this is especially true when one of the 'scales' is a category label. You have to decide how to convert categorical features to a numeric scale, and somehow assign inter-category distances in a way that makes sense alongside the other features.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the iris dataset and list the attributes the returned object exposes.
iris = load_iris()
dir(iris)

# Tabular view of the features with the class label attached, for inspection only;
# the model below is trained on the raw arrays.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

df.head()

# Hold out 20% of the samples for testing. The seed makes the split (and the
# score/predictions displayed below) reproducible across kernel restarts;
# stratifying keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# Reference result: scikit-learn's KNN with 11 neighbours.
model = KNeighborsClassifier(n_neighbors=11)
model.fit(X_train, y_train)
model.score(X_test, y_test)
model.predict(X_test)

['DESCR',
 'data',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


KNeighborsClassifier(n_neighbors=11)

0.9666666666666667

array([0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1, 0, 2, 0, 1, 2, 1, 2, 2, 0, 0,
       1, 2, 1, 2, 2, 1, 2, 0])

In [3]:
import numpy as np
from statistics import mode
from sklearn.model_selection import StratifiedKFold

def k_nearest_neighbors(X_train, y_train, X_test, y_test, k):
    """Classify test points by majority vote among their k nearest training points.

    Distances are Euclidean. Training points are ranked by (distance, label),
    so exact distance ties are broken by the smaller label. The vote itself is
    resolved by ``statistics.mode``; how a tied vote is handled therefore
    depends on the Python version (first value encountered on 3.8+).

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training feature vectors.
    y_train : array-like, shape (n_train,)
        Class label of each training point.
    X_test : array-like, shape (n_test, n_features)
        Feature vectors to classify.
    y_test : array-like, shape (n_test,)
        True labels of the test points, used only to compute the score.
    k : int
        Number of neighbours that vote. If k exceeds the number of training
        points, every training point votes.

    Returns
    -------
    score : float
        Fraction of test points whose predicted label equals ``y_test``.
    predict : numpy.ndarray
        Predicted label for each test point, in the order of ``X_test``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or the feature
        and label sequences differ in length.
    """
    if k < 1:
        # A negative k would otherwise slice "all but the last |k|" neighbours.
        raise ValueError(f"k must be at least 1, got {k!r}")

    # Coerce once so plain lists are accepted as well as numpy arrays.
    train_points = np.asarray(X_train)
    train_labels = np.asarray(y_train)
    test_points = np.asarray(X_test)
    test_labels = np.asarray(y_test)

    if len(train_points) == 0:
        raise ValueError("the training set is empty")
    if len(train_points) != len(train_labels):
        raise ValueError("X_train and y_train must have the same length")
    if len(test_points) != len(test_labels):
        raise ValueError("X_test and y_test must have the same length")

    decision = []
    for test_point in test_points:
        # Distance from this test point to every training point in one shot.
        distances = np.sqrt(np.sum((train_points - test_point) ** 2, axis=1))
        # Sorting (distance, label) pairs breaks distance ties by label.
        ranked = sorted(zip(distances, train_labels))
        votes = [label for _, label in ranked[:k]]
        decision.append(mode(votes))

    predict = np.array(decision)
    score = np.mean(predict == test_labels)

    return score, predict

# 5-fold stratified cross-validation of the hand-written KNN.
# shuffle=True draws different folds on every run unless the seed is fixed,
# so pin it to keep the reported score reproducible.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for train_index, test_index in kf.split(iris.data, iris.target):
    # NOTE: these rebind X_train/X_test/y_train/y_test from the earlier
    # train_test_split cell; after the loop they hold the last fold.
    X_train, X_test = iris.data[train_index], iris.data[test_index]
    y_train, y_test = iris.target[train_index], iris.target[test_index]
    score, predict = k_nearest_neighbors(X_train, y_train, X_test, y_test, 11)
    scores.append(score)

# Mean accuracy across the folds.
cvscore = sum(scores) / len(scores)
cvscore

0.9666666666666666