In [1]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

Multiclassification problem with 3 species of penguin - but the classes are imbalanced (Chinstrap penguin has ~1/2 samples compared to others)

In [4]:
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

Input features need scaling

In [6]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [9]:
from sklearn.model_selection import cross_validate

results = cross_validate(model, data, target, cv=10, scoring='balanced_accuracy')

In [11]:
results['test_score'].mean()

0.9521978021978021

In [12]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [14]:
model.set_params(classifier__n_neighbors = 5)
results_5nn = cross_validate(model, data, target, cv=10, scoring='balanced_accuracy')

In [17]:
model.set_params(classifier__n_neighbors = 51)
results_51nn = cross_validate(model, data, target, cv=10, scoring='balanced_accuracy')

model.set_params(classifier__n_neighbors = 101)
results_101nn = cross_validate(model, data, target, cv=10, scoring='balanced_accuracy')

model.set_params(classifier__n_neighbors = 5,  preprocessor__with_mean= False,
 preprocessor__with_std= False)
results_5nn_no_scale = cross_validate(model, data, target, cv=10, scoring='balanced_accuracy')



Evaluating models

In [26]:
print(f"{sum([i>0 for i in results_5nn['test_score'] - results_51nn['test_score']])}" 
"/10 test scores higher for 5nn vs 51nn")

4/10 test scores higher for 5nn vs 51nn


In [27]:
print(f"{sum([i>0 for i in results_5nn['test_score'] - results_101nn['test_score']])}" 
"/10 test scores higher for 5nn vs 101nn")

10/10 test scores higher for 5nn vs 51nn


In [29]:
print(f"{sum([i>0 for i in results_5nn['test_score'] - results_5nn_no_scale['test_score']])}" 
"/10 test scores higher for 5nn scaled vs 5nn raw")

10/10 test scores higher for 5nn scaled vs 5nn raw


In [30]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [31]:
from sklearn.model_selection import GridSearchCV

n_neighbour_params = [5,51,101]

param_grid = {'preprocessor' : all_preprocessors, 
                'classifier__n_neighbors': n_neighbour_params}

model_grid_search = GridSearchCV(model, param_grid=param_grid, 
                                n_jobs=-1, 
                                cv=10, 
                                scoring='balanced_accuracy')

In [32]:
model_grid_search.fit(data, target)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        StandardScaler(with_mean=False,
                                                       with_std=False)),
                                       ('classifier', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]},
             scoring='balanced_accuracy')

In [37]:
results_df = pd.DataFrame(model_grid_search.cv_results_)

In [39]:
results_df.sort_values(
    "mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.008198,0.003805,0.011364,0.009257,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381,0.952198,0.039902,1
2,0.00871,0.004947,0.009225,0.002345,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381,0.947778,0.034268,2
3,0.012802,0.008006,0.008811,0.001337,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381,0.947094,0.033797,3
4,0.024206,0.005899,0.010977,0.006827,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0,0.94696,0.047387,4
6,0.008265,0.00275,0.009316,0.000945,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381,0.94188,0.038905,5
8,0.013446,0.005645,0.009221,0.001841,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.857143,0.952381,1.0,0.863248,0.904762,0.904762,0.977778,0.930159,0.930159,0.952381,0.927277,0.043759,6
9,0.023205,0.004059,0.009114,0.001047,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.977778,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.922833,0.047883,7
7,0.009036,0.003372,0.011025,0.004838,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.952381,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.920293,0.045516,8
11,0.007359,0.000709,0.009561,0.001182,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.952381,0.944444,0.863248,0.834921,0.857143,0.834921,0.88254,0.834921,0.904762,0.876642,0.041618,9
12,0.008583,0.003765,0.01059,0.002115,101,MinMaxScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.857143,0.944444,0.863248,0.834921,0.857143,0.765079,0.904762,0.834921,0.904762,0.862357,0.046244,10


In [67]:
cv_cols = [col for col in results_df.columns if 'split' in col]

In [84]:
nn5_mask = results_df['param_classifier__n_neighbors'] == 5 
nn51_mask = results_df['param_classifier__n_neighbors'] == 51 
nn101_mask = results_df['param_classifier__n_neighbors'] == 101
ss_mask = results_df['param_preprocessor'].apply(str) == 'StandardScaler()'

diff_5nn_51_nn = results_df[nn5_mask & ss_mask][cv_cols].iloc[0] - results_df[nn51_mask & ss_mask][cv_cols].iloc[0]
diff_5nn_51_nn.apply(lambda x:x>0).sum()

4

In [86]:
diff_51nn_101_nn = results_df[nn51_mask & ss_mask][cv_cols].iloc[0] - results_df[nn101_mask & ss_mask][cv_cols].iloc[0]
diff_51nn_101_nn.apply(lambda x:x>0).sum()

9

In [87]:
cv_results = cross_validate(model_grid_search, data, target, 
                            return_estimator=True, 
                            cv=10, 
                            scoring="balanced_accuracy")

In [89]:
cv_results_df = pd.DataFrame(cv_results)

In [92]:
cv_results_df['test_score'].mean()

0.9426495726495727

In [96]:
cv_results_df['estimator'].apply(lambda x:x.best_params_.values())

0    (5, QuantileTransformer(n_quantiles=100))
1    (5, QuantileTransformer(n_quantiles=100))
2                        (5, StandardScaler())
3                        (5, StandardScaler())
4                          (5, MinMaxScaler())
5    (5, QuantileTransformer(n_quantiles=100))
6                          (5, MinMaxScaler())
7                        (5, StandardScaler())
8                        (5, StandardScaler())
9    (5, QuantileTransformer(n_quantiles=100))
Name: estimator, dtype: object