In [1]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [2]:
target.value_counts()

Species
Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: count, dtype: int64

In [3]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [5]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(
    model,
    data,
    target,
    cv=10,
    scoring="balanced_accuracy"
)

print(cv_results['test_score'])
cv_results['test_score'].mean()

[1.         1.         1.         0.91880342 0.88253968 0.95238095
 0.97777778 0.93015873 0.90793651 0.95238095]


0.9521978021978021

In [7]:

model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [9]:
model.set_params(classifier__n_neighbors=51)
cv_results = cross_validate(
    model,
    data,
    target,
    cv=10,
    scoring="balanced_accuracy"
)

print(cv_results['test_score'])
cv_results['test_score'].mean()

[0.95238095 0.97777778 1.         0.86324786 0.88253968 0.95238095
 0.95555556 0.95238095 0.93015873 0.95238095]


0.9418803418803419

In [11]:
model.set_params(classifier__n_neighbors=101)
cv_results = cross_validate(
    model,
    data,
    target,
    cv=10,
    scoring="balanced_accuracy"
)

print(cv_results['test_score'])
cv_results['test_score'].mean()

[0.85714286 0.95238095 0.94444444 0.86324786 0.83492063 0.85714286
 0.83492063 0.88253968 0.83492063 0.9047619 ]


0.8766422466422465

In [12]:
model.set_params(preprocessor=StandardScaler(), classifier__n_neighbors=51)
cv_results_ss_51 = cross_validate(
    model, data, target, cv=10, scoring="balanced_accuracy"
)
cv_results_ss_51["test_score"].mean(), cv_results_ss_51["test_score"].std()
      
print(
    "5-NN is strictly better than 51-NN for "
    f"{sum(cv_results_ss_5['test_score'] > cv_results_ss_51['test_score'])}"
    " CV iterations out of 10."
  )

model.set_params(preprocessor=StandardScaler(), classifier__n_neighbors=101)
cv_results_ss_101 = cross_validate(
    model, data, target, cv=10, scoring="balanced_accuracy"
)
cv_results_ss_101["test_score"].mean(), cv_results_ss_101["test_score"].std()

print(
    "5-NN is strictly better than 101-NN for "
    f"{sum(cv_results_ss_5['test_score'] > cv_results_ss_101['test_score'])}"
    "CV iterations out of 10."
  )


model.set_params(preprocessor=None, classifier__n_neighbors=5)
cv_results_none_5 = cross_validate(
    model, data, target, cv=10, scoring="balanced_accuracy"
)
cv_results_none_5["test_score"].mean(), cv_results_none_5["test_score"].std()

print(
    "NN with scaling is better NN without scaling for "
    f"{sum(cv_results_ss_5['test_score'] > cv_results_none_5['test_score'])}"
    "CV iterations out of 10."
  )

NameError: name 'cv_results_ss_5' is not defined

In [10]:
# Grid search
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

param_grid= {
        "preprocessor": all_preprocessors, 
        "classifier__n_neighbors": [5, 51, 101]
    }

model_grid_search = GridSearchCV(
    model, 
    param_grid=param_grid,
    cv=10,
    n_jobs=2,
    scoring='balanced_accuracy'
)
model_grid_search.fit(data, target)

model_grid_search.best_params_

{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}

In [11]:
cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)
cv_results.head()


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.004046,0.000183,0.005663,0.000341,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381,0.952198,0.039902,1
2,0.003792,0.000128,0.005508,0.000122,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381,0.947778,0.034268,2
3,0.005308,0.000174,0.005779,0.000174,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381,0.947094,0.033797,3
4,0.007845,0.000558,0.005939,0.000282,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0,0.94696,0.047387,4
6,0.003923,0.000271,0.005629,0.000204,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381,0.94188,0.038905,5


In [12]:
# get the parameter names
column_results = [f"param_{name}" for name in param_grid.keys()]
column_results += ["mean_test_score", "std_test_score", "rank_test_score"]
cv_results = cv_results[column_results]

def shorten_param(param_name):
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name


cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

Unnamed: 0,param_preprocessor,n_neighbors,mean_test_score,std_test_score,rank_test_score
1,StandardScaler(),5,0.952198,0.039902,1
2,MinMaxScaler(),5,0.947778,0.034268,2
3,QuantileTransformer(n_quantiles=100),5,0.947094,0.033797,3
4,PowerTransformer(method='box-cox'),5,0.94696,0.047387,4
6,StandardScaler(),51,0.94188,0.038905,5
8,QuantileTransformer(n_quantiles=100),51,0.927277,0.043759,6
9,PowerTransformer(method='box-cox'),51,0.922833,0.047883,7
7,MinMaxScaler(),51,0.920293,0.045516,8
11,StandardScaler(),101,0.876642,0.041618,9
12,MinMaxScaler(),101,0.862357,0.046244,10


In [15]:
cv_results = cross_validate(
    model_grid_search,
    data,
    target,
    return_estimator=True,
    cv=10,
    scoring='balanced_accuracy'
)



In [19]:
cv_results['test_score'].mean()

0.9426495726495727

In [25]:
cv_results['estimator']

[GridSearchCV(cv=10,
              estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                        ('classifier', KNeighborsClassifier())]),
              n_jobs=2,
              param_grid={'classifier__n_neighbors': [5, 51, 101],
                          'preprocessor': [None, StandardScaler(),
                                           MinMaxScaler(),
                                           QuantileTransformer(n_quantiles=100),
                                           PowerTransformer(method='box-cox')]},
              scoring='balanced_accuracy'),
 GridSearchCV(cv=10,
              estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                        ('classifier', KNeighborsClassifier())]),
              n_jobs=2,
              param_grid={'classifier__n_neighbors': [5, 51, 101],
                          'preprocessor': [None, StandardScaler(),
                                           MinMaxScaler(),
  

In [26]:
cv_results = pd.DataFrame(cv_results)
cv_test_scores = cv_results["test_score"]
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{cv_test_scores.mean():.3f} ± {cv_test_scores.std():.3f}"
)

Generalization score with hyperparameters tuning:
0.943 ± 0.038


In [27]:
for cv_fold, estimator_in_fold in enumerate(cv_results["estimator"]):
    print(
        f"Best hyperparameters for fold #{cv_fold + 1}:\n"
        f"{estimator_in_fold.best_params_}"
    )

Best hyperparameters for fold #1:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #2:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #3:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #4:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #5:
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best hyperparameters for fold #6:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #7:
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best hyperparameters for fold #8:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #9:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #10:
{'classifier__n_ne