In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor as knn
import pandas as pd
import random
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import GridSearchCV
# Reproducibility: seed both the NumPy and the stdlib global generators.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)
# Route NumPy floating-point errors (divide/over/under/invalid) to warnings.
np.seterr(all='warn')
# NOTE(review): this blanket filter silences every warning, including the
# ones np.seterr(all='warn') has just asked NumPy to emit -- confirm that
# hiding them is intended.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw table; column 'I' is the regression target used below.
Data = pd.read_csv('data.csv')
# Feature matrix: every column except the target, as a NumPy array.
features = Data.drop(columns=['I']).to_numpy()

In [3]:
# Feature scaler with scikit-learn's default settings. It is fitted on the
# training split only and then applied to both splits in a later cell.
scale = RobustScaler()

In [4]:
# Hold out 15% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible.
target = Data['I'].values
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.15,
    random_state=0,
    shuffle=True,
)

In [5]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in.
# NOTE(review): this cell overwrites X_train/X_test in place, so running it
# twice re-scales already scaled data -- re-run the split cell first.
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)


In [6]:
# Exhaustive hyper-parameter search for the KNN regressor.
# Grid: k in 1..4, Minkowski exponent p in 1..7, distance weighting, and
# three neighbour-search back ends.
# NOTE(review): the three `algorithm` values are all exact neighbour
# searches, so they presumably only differ in speed and triple the number
# of fits -- confirm whether searching over them is needed.
# NOTE(review): X_train was scaled with a scaler fitted on the whole training
# split, so every CV validation fold contributed to the scaling statistics;
# putting scaler + model in a Pipeline would avoid that.
k_range = list(range(1, 5))
param_grid = {
    'n_neighbors': k_range,
    'weights': ['distance'],
    'metric': ['minkowski'],
    'p': list(range(1, 8)),
    'algorithm': ['kd_tree', 'ball_tree', 'brute'],
}
model = knn()
gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=13,
    n_jobs=12,
    verbose=1,
)
gs.fit(X_train, y_train)

Fitting 13 folds for each of 84 candidates, totalling 1092 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 740 tasks      | elapsed:    4.0s
[Parallel(n_jobs=12)]: Done 1092 out of 1092 | elapsed:    4.5s finished


GridSearchCV(cv=13, estimator=KNeighborsRegressor(), n_jobs=12,
             param_grid={'algorithm': ['kd_tree', 'ball_tree', 'brute'],
                         'metric': ['minkowski'], 'n_neighbors': [1, 2, 3, 4],
                         'p': [1, 2, 3, 4, 5, 6, 7], 'weights': ['distance']},
             scoring='neg_mean_squared_error', verbose=1)

In [7]:
# Show the best-scoring configuration found by the grid search (the estimator
# GridSearchCV refitted on the full training split).
gs.best_estimator_

KNeighborsRegressor(algorithm='brute', n_neighbors=4, p=4, weights='distance')

In [8]:
# gs was built with scoring='neg_mean_squared_error', so score() returns the
# negated MSE; abs() turns it back into a plain (positive) test-set MSE.
test_neg_mse = gs.score(X_test, y_test)
abs(test_neg_mse)

2.6728772870151094e-11

In [9]:
# Build the final model from the hyper-parameters the grid search selected
# instead of re-typing them by hand: a hard-coded copy silently goes stale
# whenever the grid or the data change. (The previous explicit leaf_size=30
# was the library default and is not used by the 'brute' algorithm anyway.)
fina_model = knn(**gs.best_params_)
# Fit on the scaled training split; the fitted estimator is the cell output.
fina_model.fit(X_train, y_train)

KNeighborsRegressor(algorithm='brute', n_neighbors=4, p=4, weights='distance')

In [10]:
# Coefficient of determination of the final model on the held-out test split.
test_predictions = fina_model.predict(X_test)
print('R^2 score = ', r2(y_test, test_predictions))

R^2 score =  0.9995901070375562


In [11]:
# Mean squared error of the final model on the held-out test split.
test_predictions = fina_model.predict(X_test)
print('mean squared error on testing data = ', mse(y_test, test_predictions))

mean squared error on testing data =  2.6728772870151094e-11


In [12]:
# Mean absolute error of the final model on the held-out test split.
test_predictions = fina_model.predict(X_test)
print('mean absolute error on testing data = ', mae(y_test, test_predictions))

mean absolute error on testing data =  2.670075782006565e-06


In [13]:
# 13-fold cross-validation of the final model on the training split.
# Scores are negated MSE values (one per fold), matching the grid search.
scores = cross_val_score(
    estimator=fina_model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=13,
)

In [15]:
# Average the per-fold negated MSE values and report it as a positive MSE.
abs(np.mean(scores))

6.070675818339898e-11