In [21]:
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
import time
from collections import Counter 
%matplotlib inline

In [22]:
def split_data(train_data,train_y,split_ratio):
    
    train_data['target'] = train_y
    if isinstance(split_ratio,int):
        test_size = split_ratio
    else:
        test_size = round(split_ratio * len(train_data))
    
    
    incides = train_data.index.tolist()
    test_indices = random.sample(population=incides,k=test_size)
    
    test = train_data.loc[test_indices]
    train = train_data.drop(test_indices)
    
    train_y = train['target'].reset_index(drop=True)
    train_X = train.drop(['target'],axis=1).reset_index(drop=True)
    
    test_y = test['target'].reset_index(drop=True)
    test_X = test.drop(['target'],axis=1).reset_index(drop=True)
    
    return train_X,train_y,test_X,test_y

In [23]:
def accuracy(prediction,actual):
    Accuracy= (prediction==actual)
    Accuracy = Accuracy.mean()
    
    return Accuracy

In [24]:
def RMSE(prediction,actual):
    
    actual = actual.values
    total_error = 0
    
    for i in range(len(prediction)):
        error = (prediction[i] - actual[i])**2
        total_error += error
    
    mean_error = total_error/len(prediction)
    rmse = mean_error ** (1/2)
    
    return rmse

In [25]:
iris = datasets.load_iris()
train = pd.DataFrame(iris['data'])
train_y = iris['target']
train_X,train_y,test_X,test_y = split_data(train,train_y,split_ratio=50)

In [26]:
output = []
k = 3
for i in range(len(test_X)):
    distance_tuple = []
    for ii in range(len(train_X)):
        distance_tuple.append((math.sqrt(sum([(a - b) ** 2 for a, b in zip(train_X.loc[ii], test_X.loc[i])])),train_y[ii]))
    distance_tuple.sort()
    neighbors = distance_tuple[0:k]
    neighbors_class = [i[1] for i in neighbors]
    data = Counter(neighbors_class) 
    get_mode = dict(data) 
    mode = [k for k, v in get_mode.items() if v == max(list(data.values()))] 
    output.append(mode[0])

        

In [27]:
print(accuracy(output,test_y))

0.96


In [37]:
class knnClassifier:
    def __init__(self):
        pass
    
    def fit(self,train_X,train_y,k=5):
        self.X = train_X
        self.Y = train_y
        self.k = k
        
    def predict(self,test_X):
        output = []
        for i in range(len(test_X)):
            distance_tuple = []
            for ii in range(len(self.X)):
                distance_tuple.append((math.sqrt(sum([(a-b)**2 for a,b in zip(test_X.loc[i],self.X.loc[ii])])),self.Y[ii]))
            distance_tuple.sort()
            neighbors = distance_tuple[0:k]
            neighbors_class = [i[1] for i in neighbors]
            data = Counter(neighbors_class) 
            get_mode = dict(data) 
            mode = [k for k, v in get_mode.items() if v == max(list(data.values()))] 
            output.append(mode[0])
        
        return output

        

In [38]:
iris = datasets.load_iris()
train = pd.DataFrame(iris['data'])
train_y = iris['target']
train_X,train_y,test_X,test_y = split_data(train,train_y,split_ratio=50)

In [39]:

clf = knnClassifier()
start = time.time()
clf.fit(train_X,train_y,3)
end = time.time()
print('Processing time : {}'.format(end-start))

Processing time : 0.0


In [40]:
start = time.time()
prediction = clf.predict(test_X)
end = time.time()
print('Processing time : {}'.format(end-start))

Processing time : 2.466682195663452


In [41]:
print(accuracy(prediction,test_y))

0.98


In [11]:
boston = datasets.load_boston()
train = pd.DataFrame(boston['data'])
target = boston['target']
train_X,train_y,test_X,test_y = split_data(train,target,split_ratio=0.3)

In [12]:
output = []
k = 5
for i in range(len(test_X)):
    distance_tuple = []
    for ii in range(len(train_X)):
        distance_tuple.append((math.sqrt(sum([(a - b) ** 2 for a, b in zip(train_X.loc[ii], test_X.loc[i])])),train_y[ii]))
    distance_tuple.sort()
    neighbors = distance_tuple[0:k]
    weighted_total_neighbors = sum([i[1]/i[0] for i in neighbors])
    
    total_distance = sum([1/i[0] for i in neighbors])
    
    neighbors_weighted_average = weighted_total_neighbors/total_distance
    
    output.append(neighbors_weighted_average)


In [13]:
output

[21.122726816780858,
 22.41064544441093,
 27.407120880329717,
 18.8071018994773,
 20.371410879247158,
 25.544028527962265,
 25.888656061405033,
 26.36434888100778,
 29.964749954874595,
 20.677821545690787,
 24.508748466318657,
 26.084515341738218,
 31.70541386226417,
 21.941002247142947,
 18.42858576182911,
 36.61985602309986,
 18.171341016196997,
 22.131863828583462,
 21.085130101817562,
 22.131694319671315,
 40.69134665721515,
 26.14942573177664,
 34.66064376573952,
 17.57129384603103,
 29.033447499788064,
 35.03388126867651,
 32.456428678015385,
 23.303691038239325,
 17.589040058456824,
 24.273465480030612,
 24.816103685036435,
 45.18308795723738,
 13.124209549262659,
 16.695543413848167,
 21.923373486767268,
 20.8790996835786,
 18.338285026953034,
 20.244759168749216,
 27.420269311992882,
 18.538273570906785,
 15.733223945891972,
 13.769958104942152,
 22.317012452068372,
 21.32053367420041,
 14.487057674932705,
 27.49271248598428,
 12.729211272866312,
 21.890148824964534,
 29.47532

In [14]:
class knnRegressor:
    def __init__(self):
        pass
    
    def fit(self,train_X,train_y,k=5):
        self.X = train_X
        self.Y = train_y
        self.k = k
        
    def predict(self,test_X):
        output = []
        for i in range(len(test_X)):
            distance_tuple = []
            for ii in range(len(self.X)):
                distance_tuple.append((math.sqrt(sum([(a-b)**2 for a,b in zip(test_X.loc[i],self.X.loc[ii])])),self.Y[ii]))
            distance_tuple.sort()
            neighbors = distance_tuple[0:k]
            weighted_total_neighbors = sum([i[1]/i[0] for i in neighbors])
            total_distance = sum([1/i[0] for i in neighbors])
            neighbors_weighted_average = weighted_total_neighbors/total_distance
            output.append(neighbors_weighted_average)
        
        return output


In [18]:
boston = datasets.load_boston()
train = pd.DataFrame(boston['data'])
target = boston['target']
train_X,train_y,test_X,test_y = split_data(train,target,split_ratio=0.3)

In [19]:
clf = knnRegressor()
clf.fit(train_X,train_y,3)
prediction = clf.predict(test_X)

In [20]:
print(RMSE(prediction,test_y))

6.701344999633546
