In [69]:
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
from collections import Counter 
%matplotlib inline

In [10]:
def split_data(train_data,train_y,split_ratio):
    
    train_data['target'] = train_y
    if isinstance(split_ratio,int):
        test_size = split_ratio
    else:
        test_size = round(split_ratio * len(train_data))
    
    
    incides = train_data.index.tolist()
    test_indices = random.sample(population=incides,k=test_size)
    
    test = train_data.loc[test_indices]
    train = train_data.drop(test_indices)
    
    train_y = train['target'].reset_index(drop=True)
    train_X = train.drop(['target'],axis=1).reset_index(drop=True)
    
    test_y = test['target'].reset_index(drop=True)
    test_X = test.drop(['target'],axis=1).reset_index(drop=True)
    
    return train_X,train_y,test_X,test_y

In [11]:
def accuracy(prediction,actual):
    Accuracy= (prediction==actual)
    Accuracy = Accuracy.mean()
    
    return Accuracy

In [12]:
def RMSE(prediction,actual):
    
    actual = actual.values
    total_error = 0
    
    for i in range(len(prediction)):
        error = (prediction[i] - actual[i])**2
        total_error += error
    
    mean_error = total_error/len(prediction)
    rmse = mean_error ** (1/2)
    
    return rmse

In [13]:
iris = datasets.load_iris()
train = pd.DataFrame(iris['data'])
train_y = iris['target']
train_X,train_y,test_X,test_y = split_data(train,train_y,split_ratio=50)

In [83]:
output = []
k = 2
for i in range(len(test_X)):
    distance_tuple = []
    for ii in range(len(train_X)):
        distance_tuple.append((math.sqrt(sum([(a - b) ** 2 for a, b in zip(train_X.loc[ii], test_X.loc[i])])),train_y[ii]))
    distance_tuple.sort()
    neighbors = distance_tuple[0:k]
    neighbors_class = [i[1] for i in neighbors]
    data = Counter(neighbors_class) 
    get_mode = dict(data) 
    mode = [k for k, v in get_mode.items() if v == max(list(data.values()))] 
    output.append(mode[0])

        

In [85]:
class knnClassifier:
    def __init__(self):
        pass
    
    def fit(self,train_X,train_y,k=5):
        self.X = train_X
        self.Y = train_y
        self.k = k
        
    def predict(self,test_X):
        output = []
        for i in range(len(test_X)):
            distance_tuple = []
            for ii in range(len(self.X)):
                distance_tuple.append((math.sqrt(sum([(a-b)**2 for a,b in zip(test_X.loc[i],self.X.loc[ii])])),self.Y[ii]))
            distance_tuple.sort()
            neighbors = distance_tuple[0:k]
            neighbors_class = [i[1] for i in neighbors]
            data = Counter(neighbors_class) 
            get_mode = dict(data) 
            mode = [k for k, v in get_mode.items() if v == max(list(data.values()))] 
            output.append(mode[0])
        
        return output

        

In [86]:
clf = knnClassifier()
clf.fit(train_X,train_y,5)
prediction = clf.predict(test_X)

In [88]:
print(accuracy(prediction,test_y))

0.96


In [92]:
boston = datasets.load_boston()
train = pd.DataFrame(boston['data'])
target = boston['target']
train_X,train_y,test_X,test_y = split_data(train,target,split_ratio=0.3)

In [95]:
output = []
k = 5
for i in range(len(test_X)):
    distance_tuple = []
    for ii in range(len(train_X)):
        distance_tuple.append((math.sqrt(sum([(a - b) ** 2 for a, b in zip(train_X.loc[ii], test_X.loc[i])])),train_y[ii]))
    distance_tuple.sort()
    neighbors = distance_tuple[0:k]
    weighted_total_neighbors = sum([i[1]/i[0] for i in neighbors])
    
    total_distance = sum([1/i[0] for i in neighbors])
    
    neighbors_weighted_average = weighted_total_neighbors/total_distance
    
    output.append(neighbors_weighted_average)


In [96]:
output

[16.223617731013377,
 24.327692205446287,
 20.162632765561806,
 21.761256742777324,
 30.20990000430482,
 20.640912840918798,
 26.23775771226239,
 31.611313534350387,
 21.222730996287382,
 16.455746048696604,
 22.80879828931695,
 22.815783975757544,
 22.170495983155643,
 22.434934480000265,
 21.68726059842173,
 15.591695459076977,
 18.05040523498282,
 37.87379100775568,
 18.325160912177182,
 24.308914469697303,
 20.98049969526455,
 27.216502614568007,
 22.096801454478538,
 11.197232688041314,
 28.613714379906835,
 33.75005532116085,
 27.877280028398307,
 23.26146993727667,
 13.216124063979553,
 37.4280330637936,
 9.857733350074872,
 25.92054322301247,
 26.73908115644056,
 34.417032733994546,
 11.866013244748478,
 20.859708190341372,
 20.487664746003528,
 16.69623056955291,
 35.797715957047885,
 11.640930692559412,
 17.211379584016345,
 21.480554202300354,
 15.607680000209418,
 14.343569257090797,
 21.58363870733905,
 36.50963700837855,
 30.23029228561611,
 21.979423146269653,
 10.667001

In [97]:
class knnRegressor:
    def __init__(self):
        pass
    
    def fit(self,train_X,train_y,k=5):
        self.X = train_X
        self.Y = train_y
        self.k = k
        
    def predict(self,test_X):
        output = []
        for i in range(len(test_X)):
            distance_tuple = []
            for ii in range(len(self.X)):
                distance_tuple.append((math.sqrt(sum([(a-b)**2 for a,b in zip(test_X.loc[i],self.X.loc[ii])])),self.Y[ii]))
            distance_tuple.sort()
            neighbors = distance_tuple[0:k]
            weighted_total_neighbors = sum([i[1]/i[0] for i in neighbors])
            total_distance = sum([1/i[0] for i in neighbors])
            neighbors_weighted_average = weighted_total_neighbors/total_distance
            output.append(neighbors_weighted_average)
        
        return output


In [98]:
boston = datasets.load_boston()
train = pd.DataFrame(boston['data'])
target = boston['target']
train_X,train_y,test_X,test_y = split_data(train,target,split_ratio=0.3)

In [101]:
clf = knnRegressor()
clf.fit(train_X,train_y,3)
prediction = clf.predict(test_X)

In [102]:
print(RMSE(prediction,test_y))

6.021817963789219
