In [1]:
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Inheriting BaseEstimator & TransformerMixin to make the class compatible with sklearn components
# NOTE(review): sklearn.base.RegressorMixin would describe this estimator better than TransformerMixin
# (no transform() is defined here); it is not imported in this notebook -- consider switching.
class KNNRegressor(BaseEstimator, TransformerMixin):
    '''
    K-nearest-neighbours regressor built from scratch.

    The prediction for a sample is the mean label of the `k` training samples
    that are closest to it in euclidean distance.

    Params:
     k (int): number of nearest neighbours averaged for each prediction
    '''
    def __init__(self, k: int=3):
        # Only the hyper-parameter is configured here; x / y are populated by fit().
        self.k = k
        self.x = None
        self.y = None


    @staticmethod
    def __euclidean_distance(x1, x2):
        '''
        Compute the euclidean distance between x1 and every row of x2.

        Params:
         x1 (numpy.ndarray): a single sample, broadcast against the rows of x2
         x2 (numpy.ndarray): 2-D array with one sample per row

        Returns:
         numpy.ndarray: one distance per row of x2
        '''
        return np.sqrt(np.sum((x1 - x2) ** 2, axis=1))


    def fit(self, x: np.ndarray, y: np.ndarray):
        '''
        Validates and stores the training data (KNN has no real training step).

        Params:
         x (numpy.ndarray): Input features, one sample per row
         y (numpy.ndarray): labels, one per row of x

        Returns:
         KNNRegressor: the fitted estimator (self)

        Raises:
         ValueError: if k is not a positive integer, x is not 2-D, or x and y
                     do not hold the same number of samples
        '''
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        # Coerce to plain arrays so that the positional indexing done in predict()
        # is never reinterpreted as label-based indexing (e.g. for a pandas Series).
        x = np.asarray(x)
        y = np.asarray(y)

        if x.ndim != 2:
            raise ValueError(f"x must be a 2-D array of shape (n_samples, n_features), got ndim={x.ndim}")
        if y.ndim == 0 or x.shape[0] != y.shape[0]:
            raise ValueError(f"x and y must contain the same number of samples, got {x.shape[0]} and {y.shape[0] if y.ndim else 0}")

        self.x = x
        self.y = y
        return self


    def predict(self, x: np.ndarray):
        '''
        Prediction function

        Params:
         x (numpy.ndarray): Input features for prediction, one sample per row

        Returns:
         numpy.ndarray: predicted labels, one per row of x

        Raises:
         ValueError: if called before fit()
        '''
        if self.x is None or self.y is None:
            raise ValueError("KNNRegressor is not fitted yet; call fit() before predict().")

        # Iterate over rows of an array (iterating a DataFrame would yield column names).
        x = np.asarray(x)

        predictions = []
        for sample in x:
            # 1. euclidean distance between this sample and every training sample
            distances = self.__euclidean_distance(sample, self.x)
            # 2-3. indices of the 'k' closest training samples (a slice longer than
            #      the training set simply selects every sample)
            nearest = np.argsort(distances)[:self.k]
            # 4-5. mean label of those neighbours is the prediction
            predictions.append(np.mean(self.y[nearest]))

        # Return an array, as documented, rather than a Python list.
        return np.array(predictions)


    def __repr__(self):
        return f"KNNRegressor(k={self.k})"

In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
# Synthetic regression problem: x1 counts up, x2 counts down, x3 is seeded random noise,
# and the target y is the row-wise sum of the three features.
np.random.seed(42)
data = pd.DataFrame(
    {
        'x1': np.arange(20000),
        'x2': np.arange(20000, 0, -1),
        'x3': np.random.randint(low=0, high=100000, size=20000),
    }
).assign(y=lambda features: features.sum(axis=1))

In [5]:
# Display the synthetic dataset (features x1, x2, x3 and target y)
data

Unnamed: 0,x1,x2,x3,y
0,0,20000,15795,35795
1,1,19999,860,20860
2,2,19998,76820,96820
3,3,19997,54886,74886
4,4,19996,6265,26265
...,...,...,...,...
19995,19995,5,39116,59116
19996,19996,4,56922,76922
19997,19997,3,56251,76251
19998,19998,2,61794,81794


In [6]:
# Separate the feature matrix from the target vector, both as NumPy arrays
x = data.drop(columns='y').to_numpy()
y = data['y'].to_numpy()
(x.shape, y.shape)

((20000, 3), (20000,))

In [7]:
# Hold out 30% of the samples for the final evaluation; the seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.3,
)

In [8]:
# Min-max scale the features before the distance-based KNN step; presumably this keeps
# features with large ranges from dominating the euclidean distance.
# Steps are (name, estimator) tuples, which is the form the Pipeline API documents.
model_pipeline = Pipeline(steps=[('minmax_scaler', MinMaxScaler()),
                                 ('knn', KNNRegressor())])

In [9]:
# Candidate neighbour counts (1..15) for the pipeline step named 'knn'
param_grid = {'knn__k': list(range(1, 16))}

In [10]:
# 3-fold cross-validated search over the k grid, scored by negative mean squared error
# and run on all available cores
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)

In [11]:
# Run the cross-validated search over the k grid on the training split
grid_search.fit(x_train, y_train)

In [12]:
# Value of k selected by the grid search
grid_search.best_params_

{'knn__k': 8}

In [13]:
# Tabulate the per-candidate cross-validation results (timings, per-split scores, ranks)
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__k,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.010414,0.007364,6.204118,0.099706,1,{'knn__k': 1},-246974.818084,-236081.288408,-239412.980069,-240823.028854,4557.662075,15
1,0.010414,0.007364,6.301046,0.127996,2,{'knn__k': 2},-187212.876527,-183631.516606,-181573.385555,-184139.259563,2330.138306,14
2,0.001224,0.00173,6.218052,0.158233,3,{'knn__k': 3},-173244.440421,-176900.863938,-162098.102467,-170747.802275,6295.783685,11
3,0.003657,0.00047,6.069857,0.026348,4,{'knn__k': 4},-163891.99238,-167971.507888,-157048.977925,-162970.826064,4506.426946,8
4,0.005538,0.002924,6.926268,0.04973,5,{'knn__k': 5},-160783.389792,-160941.856807,-151146.833596,-157624.026732,4580.52407,5
5,0.00133,0.001881,6.356477,0.393563,6,{'knn__k': 6},-158397.183505,-158968.898591,-149245.21965,-155537.100582,4455.149738,3
6,0.0,0.0,6.14816,0.065162,7,{'knn__k': 7},-161396.034458,-158681.655523,-151202.741521,-157093.477168,4310.262025,4
7,0.011729,0.005506,6.255254,0.017485,8,{'knn__k': 8},-156323.227706,-157880.113158,-152253.634236,-155485.658367,2372.123952,1
8,0.003529,0.000485,6.626652,0.484011,9,{'knn__k': 9},-152570.589786,-158932.359112,-155105.020119,-155535.989672,2614.998828,2
9,0.00638,0.006904,7.167558,0.592083,10,{'knn__k': 10},-156600.263407,-162552.81559,-156013.732015,-158388.937004,2954.027582,6


In [14]:
# Predict the held-out test split with the fitted grid search
pred = grid_search.predict(x_test)

In [15]:
# Test RMSE: square root of the mean squared error between the held-out labels and the predictions
np.sqrt(mean_squared_error(y_test, pred))

317.54669349889843

In [16]:
# Mean of the test labels, shown to put the magnitude of the test RMSE in context
y_test.mean()

69498.81383333333