Activate the venv with
> source venv/bin/activate // to activate venv for terminal session
> jupyter notebook // to start the jupyter notebook

In [3]:
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

X,y = load_iris(return_X_y=True, as_frame=True)

# return as (x,y) not as a bunch object, and returns as a pd dataframe

In [4]:
skl.set_config(print_changed_only=False)
# default config settings now changed, st False setting is needed for parameters to show in pipe.get_params()

At this point you can inspect the data of X and y

In [5]:
pipe = Pipeline([("scale", StandardScaler()),
                 ("model", KNeighborsClassifier(n_neighbors=1))
])

# build a ML pipeline with the necessary steps
# run pipe.get_params() for subsequent steps

pipe.get_params(deep=True)

{'memory': None,
 'steps': [('scale', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('model',
   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                        metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                        weights='uniform'))],
 'verbose': False,
 'scale': StandardScaler(copy=True, with_mean=True, with_std=True),
 'model': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                      weights='uniform'),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 1,
 'model__p': 2,
 'model__weights': 'uniform'}

In [6]:
X = X.to_numpy() # convert df to numpy for skl model to work with
y = y.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [7]:
# GridSearch implements cross-validation
model = GridSearchCV(estimator=pipe,
                     param_grid={'model__n_neighbors':[i for i in range(1,10)]},
                     cv=5)

In [8]:
model.fit(X_train, y_train)

In [10]:
pd.DataFrame(model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000511,0.000193,0.000883,0.000135,1,{'model__n_neighbors': 1},0.958333,0.916667,0.833333,0.958333,0.958333,0.925,0.048591,5
1,0.000445,5e-05,0.000889,9.2e-05,2,{'model__n_neighbors': 2},0.875,0.875,0.833333,0.958333,0.916667,0.891667,0.042492,9
2,0.000368,2.7e-05,0.000781,0.000123,3,{'model__n_neighbors': 3},0.833333,0.916667,0.875,0.958333,0.958333,0.908333,0.048591,8
3,0.000377,6.7e-05,0.000871,0.000206,4,{'model__n_neighbors': 4},0.916667,0.916667,0.833333,0.958333,0.958333,0.916667,0.045644,7
4,0.000426,9e-05,0.000782,6.3e-05,5,{'model__n_neighbors': 5},0.875,1.0,0.875,0.958333,0.958333,0.933333,0.05,1
5,0.00044,7.5e-05,0.000828,5.5e-05,6,{'model__n_neighbors': 6},0.875,1.0,0.833333,0.958333,0.958333,0.925,0.061237,5
6,0.000364,2.8e-05,0.000939,0.00038,7,{'model__n_neighbors': 7},0.875,1.0,0.833333,0.958333,1.0,0.933333,0.0677,1
7,0.000345,9e-06,0.000724,1.7e-05,8,{'model__n_neighbors': 8},0.875,1.0,0.833333,0.958333,1.0,0.933333,0.0677,1
8,0.000437,5.2e-05,0.000862,8.4e-05,9,{'model__n_neighbors': 9},0.875,1.0,0.833333,0.958333,1.0,0.933333,0.0677,1
