In [25]:
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
from collections import Counter 
import time
%matplotlib inline

In [26]:
def split_data(train_data,train_y,split_ratio):
    
    train_data['target'] = train_y
    if isinstance(split_ratio,int):
        test_size = split_ratio
    else:
        test_size = round(split_ratio * len(train_data))
    
    
    incides = train_data.index.tolist()
    test_indices = random.sample(population=incides,k=test_size)
    
    test = train_data.loc[test_indices]
    train = train_data.drop(test_indices)
    
    train_y = train['target'].reset_index(drop=True)
    train_X = train.drop(['target'],axis=1).reset_index(drop=True)
    
    test_y = test['target'].reset_index(drop=True)
    test_X = test.drop(['target'],axis=1).reset_index(drop=True)
    
    return train_X,train_y,test_X,test_y

In [27]:
def accuracy(prediction,actual):
    Accuracy= (prediction==actual)
    Accuracy = Accuracy.mean()
    
    return Accuracy

In [28]:
def RMSE(prediction,actual):
    
    actual = actual.values
    total_error = 0
    
    for i in range(len(prediction)):
        error = (prediction[i] - actual[i])**2
        total_error += error
    
    mean_error = total_error/len(prediction)
    rmse = mean_error ** (1/2)
    
    return rmse

In [29]:
iris = datasets.load_iris()
train = pd.DataFrame(iris['data'])
train_y = iris['target']
train_X,train_y,test_X,test_y = split_data(train,train_y,split_ratio=30)

In [30]:
output = []
k = 2
for i in range(len(test_X)):
    distance_tuple = []
    for ii in range(len(train_X)):
        distance_tuple.append((math.sqrt(sum([(a - b) ** 2 for a, b in zip(train_X.loc[ii], test_X.loc[i])])),train_y[ii]))
    distance_tuple.sort()
    neighbors = distance_tuple[0:k]
    neighbors_class = [i[1] for i in neighbors]
    data = Counter(neighbors_class) 
    get_mode = dict(data) 
    mode = [k for k, v in get_mode.items() if v == max(list(data.values()))] 
    output.append(mode[0])

In [31]:
sample_train_X,sample_train_y,sample_test_X,sample_test_y = split_data(train_X,train_y,split_ratio=0.90)

In [32]:
k = 3
for i in range(len(sample_test_X)):
    distance_tuple = []
    for ii in range(len(sample_train_X)):
        distance_tuple.append((math.sqrt(sum([(a - b) ** 2 for a, b in zip(sample_train_X.loc[ii], sample_test_X.loc[i])])),sample_train_y[ii]))
    distance_tuple.sort()
    neighbors = distance_tuple[0:k]
    neighbors_class = [i[1] for i in neighbors]
    data = Counter(neighbors_class) 
    get_mode = dict(data) 
    mode = [k for k, v in get_mode.items() if v == max(list(data.values()))] 
    if mode != sample_test_y[i]:
        current_train_size = len(sample_train_X)
        sample_train_X=sample_train_X.append(sample_test_X.loc[i],ignore_index=True)
        sample_test_y_value = sample_test_y[i].item()
        sample_train_y = np.append(sample_train_y,[sample_test_y_value],axis=0)

In [33]:
sample_train_y.shape

(20,)

In [34]:
type(sample_test_y[i])

numpy.int32

In [35]:
sample_test_y[1]

1

In [41]:
class ModifiedknnClassifier:
    def __init__(self):
        pass
    
    def fit(self,train_X,train_y,k=5):
        k = 3
        sample_train_X,sample_train_y,sample_test_X,sample_test_y = split_data(train_X,train_y,split_ratio=0.70)
        for i in range(len(sample_test_X)):
            distance_tuple = []
            for ii in range(len(sample_train_X)):
                distance_tuple.append((math.sqrt(sum([(a - b) ** 2 for a, b in zip(sample_train_X.loc[ii], sample_test_X.loc[i])])),sample_train_y[ii]))
            distance_tuple.sort()
            neighbors = distance_tuple[0:k]
            neighbors_class = [i[1] for i in neighbors]
            data = Counter(neighbors_class) 
            get_mode = dict(data) 
            mode = [k for k, v in get_mode.items() if v == max(list(data.values()))] 
            if mode != sample_test_y[i]:
                current_train_size = len(sample_train_X)
                sample_train_X=sample_train_X.append(sample_test_X.loc[i],ignore_index=True)
                sample_test_y_value = sample_test_y[i].item()
                sample_train_y = np.append(sample_train_y,[sample_test_y_value],axis=0)
        
        
        self.X = sample_train_X
        self.Y = sample_train_y
        self.k = k
        
    def predict(self,test_X):
        output = []
        for i in range(len(test_X)):
            distance_tuple = []
            for ii in range(len(self.X)):
                distance_tuple.append((math.sqrt(sum([(a-b)**2 for a,b in zip(test_X.loc[i],self.X.loc[ii])])),self.Y[ii]))
            distance_tuple.sort()
            neighbors = distance_tuple[0:k]
            neighbors_class = [i[1] for i in neighbors]
            data = Counter(neighbors_class) 
            get_mode = dict(data) 
            mode = [k for k, v in get_mode.items() if v == max(list(data.values()))] 
            output.append(mode[0])
        
        return output


In [42]:
iris = datasets.load_iris()
train = pd.DataFrame(iris['data'])
train_y = iris['target']
train_X,train_y,test_X,test_y = split_data(train,train_y,split_ratio=50)

In [43]:

clf = ModifiedknnClassifier()
start = time.time()
clf.fit(train_X,train_y,5)
end = time.time()
print('Processing time : {}'.format(end-start))

Processing time : 0.9647817611694336


In [44]:
start = time.time()
prediction = clf.predict(test_X)
end = time.time()
print('Processing time : {}'.format(end-start))

Processing time : 0.8310015201568604


In [40]:
print(accuracy(prediction,test_y))

0.94


In [86]:
clf.X

Unnamed: 0,0,1,2,3
0,4.9,3.1,1.5,0.1
1,5.4,3.7,1.5,0.2
2,5.8,4.0,1.2,0.2
3,5.7,4.4,1.5,0.4
4,5.4,3.9,1.3,0.4
5,5.1,3.5,1.4,0.3
6,5.1,3.7,1.5,0.4
7,5.0,3.4,1.6,0.4
8,5.4,3.4,1.5,0.4
9,4.9,3.1,1.5,0.2
