In [1]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

Inspect the target variable and select the correct assertion.

In [2]:
target.value_counts(normalize=True)

Adelie Penguin (Pygoscelis adeliae)          0.441520
Gentoo penguin (Pygoscelis papua)            0.359649
Chinstrap penguin (Pygoscelis antarctica)    0.198830
Name: Species, dtype: float64

The problem to be solved is a multiclass classification problem (more than two possible classes).

The proportions of the classes are imbalanced.

In [3]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


Input features have different scales. 

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [5]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, data, target, cv=10, scoring='balanced_accuracy')

In [6]:
cv_res_df = pd.DataFrame(cv_results)

cv_res_df.rename(columns={'test_score': 'test_score_n5'}, inplace=True)

cv_res_df

Unnamed: 0,fit_time,score_time,test_score_n5
0,0.004124,0.003824,1.0
1,0.003905,0.003464,1.0
2,0.003919,0.003549,1.0
3,0.003507,0.003619,0.918803
4,0.003472,0.00349,0.88254
5,0.003568,0.00345,0.952381
6,0.004356,0.003541,0.977778
7,0.00347,0.003486,0.930159
8,0.003721,0.00334,0.907937
9,0.003873,0.003736,0.952381


In [7]:
print(f'The average cross validation test score balanced accuracy: {cv_res_df.test_score_n5.mean():.2f} +/- {cv_res_df.test_score_n5.std():.2f}')

The average cross validation test score balanced accuracy: 0.95 +/- 0.04


Repeat the evaluation by setting the parameters in order to select the correct statements in the list below. We recall that you can use model.get_params() to list the parameters of the pipeline and use model.set_params(param_name=param_value) to update them. Remember that one way to compare two models is comparing the cross-validation test scores of both models fold-to-fold, i.e. counting the number of folds where one model has a better test score than the other.

In [8]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [9]:
model.set_params(classifier__n_neighbors=51)

Pipeline(steps=[('preprocessor', StandardScaler()),
                ('classifier', KNeighborsClassifier(n_neighbors=51))])

In [10]:
cv_results = cross_validate(model, data, target, cv=10, scoring='balanced_accuracy')

In [11]:
pd.DataFrame(cv_results)['test_score']

0    0.952381
1    0.977778
2    1.000000
3    0.863248
4    0.882540
5    0.952381
6    0.955556
7    0.952381
8    0.930159
9    0.952381
Name: test_score, dtype: float64

In [12]:
cv_res_df = cv_res_df.join(pd.DataFrame(cv_results)['test_score']).rename(columns={'test_score': 'test_score_n51'})

In [13]:
model.set_params(classifier__n_neighbors=101)

Pipeline(steps=[('preprocessor', StandardScaler()),
                ('classifier', KNeighborsClassifier(n_neighbors=101))])

In [14]:
cv_results = cross_validate(model, data, target, cv=10, scoring='balanced_accuracy')

In [15]:
cv_res_df = cv_res_df.join(pd.DataFrame(cv_results)['test_score']).rename(columns={'test_score': 'test_score_n101'})

In [16]:
cv_results = cross_validate(KNeighborsClassifier(n_neighbors=5), data, target, cv=10, scoring='balanced_accuracy')

In [17]:
cv_res_df = cv_res_df.join(pd.DataFrame(cv_results)['test_score']).rename(columns={'test_score': 'test_score_no_scaling'})

In [18]:
cv_res_df

Unnamed: 0,fit_time,score_time,test_score_n5,test_score_n51,test_score_n101,test_score_no_scaling
0,0.004124,0.003824,1.0,0.952381,0.857143,0.664683
1,0.003905,0.003464,1.0,0.977778,0.952381,0.73602
2,0.003919,0.003549,1.0,1.0,0.944444,0.741026
3,0.003507,0.003619,0.918803,0.863248,0.863248,0.704274
4,0.003472,0.00349,0.88254,0.88254,0.834921,0.584127
5,0.003568,0.00345,0.952381,0.952381,0.857143,0.669841
6,0.004356,0.003541,0.977778,0.955556,0.834921,0.834921
7,0.00347,0.003486,0.930159,0.952381,0.88254,0.742857
8,0.003721,0.00334,0.907937,0.930159,0.834921,0.88254
9,0.003873,0.003736,0.952381,0.952381,0.904762,0.838095


In [19]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [20]:
from sklearn.model_selection import GridSearchCV


model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

param_grid = {'classifier__n_neighbors': [5, 51, 101],
               'preprocessor': all_preprocessors}

model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=10, scoring='balanced_accuracy')

In [21]:
model_grid_search.fit(data, target)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier', KNeighborsClassifier())]),
             n_jobs=2,
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]},
             scoring='balanced_accuracy')

In [22]:
results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    by="rank_test_score", ascending=True
)
# convert the name of the preprocessor for later display
results["param_preprocessor"] = results["param_preprocessor"].apply(
    lambda x: x.__class__.__name__ if x is not None else "None"
)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.003591,0.000203,0.003386,0.000257,5,StandardScaler,"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381,0.952198,0.039902,1
2,0.003303,0.000155,0.003266,0.00024,5,MinMaxScaler,"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381,0.947778,0.034268,2
3,0.004505,0.000163,0.003562,0.000283,5,QuantileTransformer,"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381,0.947094,0.033797,3
4,0.008047,0.002558,0.003613,0.000556,5,PowerTransformer,"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0,0.94696,0.047387,4
6,0.003435,0.000159,0.003551,0.000246,51,StandardScaler,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381,0.94188,0.038905,5
8,0.004329,0.000109,0.00367,0.000172,51,QuantileTransformer,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.857143,0.952381,1.0,0.863248,0.904762,0.904762,0.977778,0.930159,0.930159,0.952381,0.927277,0.043759,6
9,0.006297,0.000499,0.003306,8.5e-05,51,PowerTransformer,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.977778,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.922833,0.047883,7
7,0.003399,0.000164,0.003653,0.000164,51,MinMaxScaler,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.952381,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.920293,0.045516,8
11,0.003646,0.000126,0.004112,0.000158,101,StandardScaler,"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.952381,0.944444,0.863248,0.834921,0.857143,0.834921,0.88254,0.834921,0.904762,0.876642,0.041618,9
12,0.003348,0.000205,0.003799,0.000321,101,MinMaxScaler,"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.857143,0.944444,0.863248,0.834921,0.857143,0.765079,0.904762,0.834921,0.904762,0.862357,0.046244,10


In [23]:
reference_model = results.iloc[0]
other_models = results.iloc[1:4]
cv_score_columns = results.columns[results.columns.str.startswith("split")]
for idx, other_model in other_models.iterrows():
    score_reference_model = reference_model[cv_score_columns]
    score_other_model = other_model[cv_score_columns]
    print(
        f"{reference_model['param_classifier__n_neighbors']}-NN with "
        f"{reference_model['param_preprocessor']} is strictly better than "
        f"{other_model['param_classifier__n_neighbors']}-NN with "
        f"{other_model['param_preprocessor']} for "
        f"{sum(score_reference_model > score_other_model)} CV iterations "
        f"out of 10."
    )

5-NN with StandardScaler is strictly better than 5-NN with MinMaxScaler for 3 CV iterations out of 10.
5-NN with StandardScaler is strictly better than 5-NN with QuantileTransformer for 2 CV iterations out of 10.
5-NN with StandardScaler is strictly better than 5-NN with PowerTransformer for 3 CV iterations out of 10.


In [24]:
import numpy as np

reference_model = results.iloc[0][cv_score_columns]
other_model = results.iloc[4][cv_score_columns]
print(
    f"5-NN with StandardScaler is strictly better 51-NN with StandardScaler for "
    f"{np.sum(reference_model.to_numpy() > other_model.to_numpy())} "
    "CV iterations out of 10."
)

5-NN with StandardScaler is strictly better 51-NN with StandardScaler for 4 CV iterations out of 10.


In [25]:
reference_model = results.iloc[0][cv_score_columns]
other_model = results.iloc[8][cv_score_columns]
print(
    f"51-NN with StandardScaler is strictly better than 101-NN with StandardScaler for "
    f"{np.sum(reference_model.to_numpy() > other_model.to_numpy())} "
    "CV iterations out of 10."
)

51-NN with StandardScaler is strictly better than 101-NN with StandardScaler for 10 CV iterations out of 10.


In [26]:
nested_cv = cross_validate(model_grid_search, data, target,
                           cv=10, return_estimator=True, 
                           scoring='balanced_accuracy', n_jobs=2)

In [27]:
cv_results = pd.DataFrame(nested_cv)
cv_test_scores = cv_results['test_score']
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{cv_test_scores.mean():.3f} +/- {cv_test_scores.std():.3f}"
)

Generalization score with hyperparameters tuning:
0.943 +/- 0.038


In [28]:
for cv_fold, estimator_in_fold in enumerate(cv_results["estimator"]):
    print(
        f"Best hyperparameters for fold #{cv_fold + 1}:\n"
        f"{estimator_in_fold.best_params_}"
    )

Best hyperparameters for fold #1:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #2:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #3:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #4:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #5:
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best hyperparameters for fold #6:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #7:
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best hyperparameters for fold #8:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #9:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #10:
{'classifier__n_ne