In [12]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


In [13]:
# Load the Boston housing CSV into a DataFrame and preview its first five rows.
bos = pd.read_csv(filepath_or_buffer="Datasets/Boston.csv")
bos.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [17]:
# Split the frame into the predictor matrix X and the regression target y.
# "Unnamed: 0" appears to be the row index that was saved into the CSV (it
# counts 0, 1, 2, ... in the preview); it is not a real predictor, and leaving
# it in X would let it take part in the scaled KNN distance computation.
# errors="ignore" keeps the cell working if that column is not present.
X = bos.drop(columns=["medv", "Unnamed: 0"], errors="ignore")
y = bos["medv"]


What does shuffle = True do?
https://stackoverflow.com/a/12239443

eg kfold = KFold (n_splits = 5,
               shuffle = True,
               random_state = 2022)

If shuffle is True, the whole data set is first shuffled and then split into the K folds. For repeatable behavior, you can set the random_state, for example to an integer seed (random_state=0). If the parameters you select depend on the shuffling, this means your parameter selection is very unstable. Probably you have very little training data or you use too few folds (like 2 or 3).

The "shuffle" option is mainly useful if your data is somehow sorted by class, because then each fold might contain only samples from one class (sorted classes are particularly dangerous for stochastic gradient descent classifiers). For other classifiers, it should make no difference. If the results are very unstable under shuffling, your parameter selection is likely to be uninformative (aka garbage).


In [21]:
# Model: standardise every feature, then fit a k-nearest-neighbours regressor.
scaler = StandardScaler()
knn = KNeighborsRegressor()
pipe = Pipeline(steps=[("STD", scaler), ("KNN", knn)])

# Evaluation: 5 shuffled folds with a fixed seed so the splits are repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=2022)

# Search space: n_neighbors = 1..15 for the pipeline step named "KNN".
params = {"KNN__n_neighbors": np.arange(1, 16)}

In [22]:
# Grid-search n_neighbors for the KNN step of `pipe` over the values in `params`,
# scoring each candidate by R^2 on the folds produced by `kfold`.
# The regressor being tuned is the "KNN" step already inside `pipe`, so no
# separate KNeighborsRegressor needs to be created in this cell.
gcv = GridSearchCV(estimator=pipe, param_grid=params, scoring="r2", cv=kfold)
gcv.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=2022, shuffle=True),
             estimator=Pipeline(steps=[('STD', StandardScaler()),
                                       ('KNN', KNeighborsRegressor())]),
             param_grid={'KNN__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])},
             scoring='r2')

In [24]:
# Show the best n_neighbors found by the search and its cross-validated R^2,
# one value per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'KNN__n_neighbors': 2}
0.7811680749769405
