In [1]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [2]:
data.head()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3


In [3]:
target.head()

0    Adelie Penguin (Pygoscelis adeliae)
1    Adelie Penguin (Pygoscelis adeliae)
2    Adelie Penguin (Pygoscelis adeliae)
4    Adelie Penguin (Pygoscelis adeliae)
5    Adelie Penguin (Pygoscelis adeliae)
Name: Species, dtype: object

In [4]:
target.unique()

array(['Adelie Penguin (Pygoscelis adeliae)',
       'Gentoo penguin (Pygoscelis papua)',
       'Chinstrap penguin (Pygoscelis antarctica)'], dtype=object)

In [5]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [6]:
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [8]:
from sklearn.model_selection import cross_validate
accuracy = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")

In [9]:
accuracy['test_score'].mean()

0.9521978021978021

In [10]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [11]:
accuracy['test_score']

array([1.        , 1.        , 1.        , 0.91880342, 0.88253968,
       0.95238095, 0.97777778, 0.93015873, 0.90793651, 0.95238095])

In [12]:
model.set_params(classifier__n_neighbors=51)
accuracy_51 = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")
accuracy_51['test_score']

array([0.95238095, 0.97777778, 1.        , 0.86324786, 0.88253968,
       0.95238095, 0.95555556, 0.95238095, 0.93015873, 0.95238095])

In [13]:
model.set_params(classifier__n_neighbors=101)
accuracy_101 = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")
accuracy_101['test_score']

array([0.85714286, 0.95238095, 0.94444444, 0.86324786, 0.83492063,
       0.85714286, 0.83492063, 0.88253968, 0.83492063, 0.9047619 ])

In [14]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [15]:
model = Pipeline([("preprocessor", None),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [16]:
param_grid = {
    'classifier__learning_rate': (0.05, 0.5),
    'classifier__max_leaf_nodes': (10, 30),
}

In [17]:
from sklearn.model_selection import GridSearchCV
model_grid = GridSearchCV(model, param_grid={"preprocessor": all_preprocessors, "classifier__n_neighbors": [5, 51, 101]}, 
                          cv=10)

In [18]:
model_grid.fit(data, target)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor', None),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]})

In [19]:
cv_results = pd.DataFrame(model_grid.cv_results_)

In [20]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001192,0.0004020658,0.001304,0.000465,5,,"{'classifier__n_neighbors': 5, 'preprocessor':...",0.742857,0.8,0.794118,0.794118,0.647059,0.764706,0.882353,0.794118,0.911765,0.852941,0.798403,0.070751,13
1,0.001699,0.0004572472,0.001301,0.000457,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.941176,0.911765,0.970588,0.970588,0.941176,0.911765,0.970588,0.961765,0.032353,1
2,0.0016,0.0004901645,0.0013,0.000458,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.971429,1.0,0.970588,0.911765,0.941176,0.941176,0.970588,0.911765,0.970588,0.958908,0.030028,3
3,0.002,9.760613e-07,0.0014,0.000489,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.971429,0.942857,1.0,0.941176,0.941176,1.0,0.970588,0.941176,0.911765,0.970588,0.959076,0.026888,2
4,0.003196,0.0004027681,0.001304,0.00046,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.971429,1.0,0.911765,0.911765,0.970588,0.941176,0.941176,0.911765,1.0,0.955966,0.035452,5
5,0.001004,1.168533e-05,0.001295,0.000461,51,,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.742857,0.685714,0.735294,0.705882,0.647059,0.764706,0.735294,0.735294,0.764706,0.764706,0.728151,0.036402,15
6,0.001496,0.0005000241,0.001401,0.00049,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.971429,0.971429,1.0,0.911765,0.911765,0.970588,0.941176,0.970588,0.941176,0.970588,0.95605,0.027209,4
7,0.001801,0.0003875573,0.001597,0.000488,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.942857,0.971429,1.0,0.911765,0.882353,0.970588,0.911765,0.970588,0.941176,0.941176,0.94437,0.033461,7
8,0.002003,1.245987e-05,0.001597,0.000488,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.914286,0.971429,1.0,0.911765,0.941176,0.941176,0.970588,0.941176,0.941176,0.970588,0.950336,0.026181,6
9,0.003,1.655252e-05,0.001699,0.000457,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.942857,0.971429,1.0,0.911765,0.882353,0.970588,0.911765,0.970588,0.941176,0.941176,0.94437,0.033461,7


In [21]:
cv_results_eval = cross_validate(
    model_grid, data, target, cv=10, n_jobs=2, return_estimator=True
)

In [22]:
cv_results_eval = pd.DataFrame(cv_results_eval)
cv_test_scores = cv_results_eval['test_score']
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{cv_test_scores.mean():.3f} +/- {cv_test_scores.std():.3f}"
)

Generalization score with hyperparameters tuning:
0.950 +/- 0.034


In [23]:
for cv_fold, estimator_in_fold in enumerate(cv_results_eval["estimator"]):
    print(
        f"Best hyperparameters for fold #{cv_fold + 1}:\n"
        f"{estimator_in_fold.best_params_}"
    )

Best hyperparameters for fold #1:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #2:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #3:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #4:
{'classifier__n_neighbors': 51, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #5:
{'classifier__n_neighbors': 51, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #6:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #7:
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best hyperparameters for fold #8:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #9:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #10:
{'classifier__