### import packages

In [63]:
from sklearn import datasets, linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error

### load datasets

In [11]:
diabetes = datasets.load_diabetes()
diabetes.feature_names, diabetes.data.shape

(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], (442, 10))

### prepare training dataset

In [34]:
X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2)

In [35]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((353, 10), (89, 10), (353,), (89,))

In [36]:
X_train

array([[-0.05273755,  0.05068012, -0.01159501, ...,  0.07120998,
         0.03056649, -0.0052198 ],
       [-0.04547248, -0.04464164,  0.01535029, ..., -0.00259226,
        -0.10436482, -0.07563562],
       [-0.00551455,  0.05068012,  0.00133873, ..., -0.03949338,
        -0.04118039, -0.08806194],
       ...,
       [ 0.07076875,  0.05068012, -0.03099563, ..., -0.03949338,
        -0.01495648, -0.0010777 ],
       [-0.00914709, -0.04464164, -0.01590626, ..., -0.00259226,
        -0.03324879,  0.04862759],
       [-0.07453279, -0.04464164, -0.02345095, ..., -0.03949338,
        -0.03845911, -0.03007245]])

In [37]:
max(y_train.tolist()), min(y_train.tolist())

(346.0, 25.0)

In [25]:
pd.DataFrame(data=diabetes.data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.041180,-0.096346
6,-0.045472,0.050680,-0.047163,-0.015999,-0.040096,-0.024800,0.000779,-0.039493,-0.062913,-0.038357
7,0.063504,0.050680,-0.001895,0.066630,0.090620,0.108914,0.022869,0.017703,-0.035817,0.003064
8,0.041708,0.050680,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.014956,0.011349
9,-0.070900,-0.044642,0.039062,-0.033214,-0.012577,-0.034508,-0.024993,-0.002592,0.067736,-0.013504


### training with Regressor

In [26]:
linear_regressor = linear_model.LinearRegression()
knn_regressor = KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto')

In [41]:
linear_regressor.fit(X=X_train, y=y_train)
knn_regressor.fit(X=X_train, y=y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

### testing

In [50]:
linear_y_pred = linear_regressor.predict(X=X_test)
linear_y_pred

array([170.15146419, 180.81620528,  91.88466475, 175.27015492,
       119.75030041,  97.19782138, 256.18067486, 230.83202184,
        80.46622445,  84.11020947, 150.10165269, 144.07443601,
       170.49706475, 204.06316143,  96.67720719, 124.57487166,
       155.24029231, 198.01074672,  54.4506207 , 175.94728688,
       193.07539146, 139.94746512, 147.48737068,  96.99411642,
        42.60853448, 165.13814884, 221.61008247, 116.62195991,
       163.62447966, 256.07256886, 137.94584295, 113.02815472,
       172.92643483, 278.54775054, 198.49032416,  56.6163163 ,
       162.97184736, 231.44704341, 218.21034143, 132.70843985,
       104.91898028, 116.62566704,  71.95179704, 212.9110832 ,
       178.70028831, 155.24844853, 119.78902581, 115.21470058,
        83.84959119, 104.77244136, 136.88972836, 177.01299952,
        84.11902847, 142.30074587, 261.49878463, 153.560274  ,
       151.26454894,  51.56798634, 202.03800021, 131.2387866 ,
       260.18604997, 115.52114954,  61.04841985, 187.80

In [44]:
knn_y_pred = knn_regressor.predict(X=X_test)
knn_y_pred

array([165.2, 133.6, 119.2, 172.2, 107. , 129.2, 238.8, 184.6,  80.4,
        76.8, 126.8, 161.6, 128. , 107.8, 122.2,  79. , 125.2, 191.6,
        99. , 119.8, 140. , 152.6, 103.4, 155. ,  75.8, 134.4, 205.4,
       126. , 120.4, 275.4, 135.2, 152.6, 219. , 249.4, 148.2,  86.8,
       153. , 245.6, 220. , 118.2, 112.2,  82.4,  89.4, 198.4, 139.2,
       170.6, 147. ,  78. ,  87. , 120. , 192.6, 157.8, 100.4,  61.6,
       267. , 117.2, 209.8,  84. , 143. , 141.2, 219.6, 158.4,  82.4,
       165.8, 127.8, 153.6,  78.8,  82. , 220.8,  80.6, 163.8, 172.2,
       155.8,  86.8, 132.2, 172.2, 134.4,  70.8, 108. , 145. , 225.2,
       191.6, 126.8, 194.4, 127.2,  85. , 132. , 221. , 159.6])

In [58]:
np.sum(np.sqrt(np.square(np.abs(y_test - linear_y_pred))))/len(y_test), np.sum(np.sqrt(np.square(np.abs(y_test - knn_y_pred))))/len(y_test)

(44.95038716551058, 44.570786516853936)

In [68]:
mean_squared_log_error(y_true=y_test, y_pred=linear_y_pred), mean_squared_log_error(y_true=y_test, y_pred=knn_y_pred)

(0.1554011303456353, 0.1618959539473467)

In [73]:
plt.figure(figsize=(8,8))
# plt.plot(y_test)
# plt.plot(linear_y_pred)
# plt.plot(knn_y_pred)
# plt.legend()

<Figure size 576x576 with 0 Axes>

<Figure size 576x576 with 0 Axes>

### Grid Search

In [69]:
from itertools import combinations_with_replacement

In [75]:
limit  = 100
n_features = 10
w = np.random.uniform(-limit, limit, (n_features,)) # l2
np.linalg.norm(w), w

(182.67985570994045,
 array([ 35.91504203, -60.74970758,   4.89536754,  97.72885716,
        -76.59512552,  69.60580005,  32.17805957, -73.39773997,
        -41.00288089,   0.97245953]))