In [None]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): presumably intended to hide scikit-learn's metric warnings for
# classes that receive no predictions — confirm. A blanket 'ignore' also hides
# deprecation and data-quality warnings; consider narrowing it to a category.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
import requests
import pandas as pd

In [None]:
# Source location of the raw UCI yeast dataset.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"

# Fetch the file; the timeout keeps the cell from hanging forever on a dead connection.
response = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as the dataset.
response.raise_for_status()

# Persist the raw bytes next to the notebook so the loading cell can read them.
with open("yeast.data", "wb") as f:
    f.write(response.content)

In [None]:
# Labels for the ten columns of the raw file, in file order.
column_names = [
    'SEQUENCE_NAME',
    'MCG', 'GVH', 'ALM', 'MIT', 'ERL', 'POX', 'VAC', 'NUC',
    'CLASSES',
]

# The downloaded file is parsed as fixed-width text and has no header row,
# so the column labels are attached after parsing.
data = pd.read_fwf('yeast.data', header=None)
data.columns = column_names

In [None]:
# Keep the eight attribute columns plus the label; SEQUENCE_NAME is dropped here.
feature_columns = ['MCG', 'GVH', 'ALM', 'MIT', 'ERL', 'POX', 'VAC', 'NUC']
target_column = 'CLASSES'
data = data[feature_columns + [target_column]]

# X holds every remaining column except the label; y is the label column.
X = data.drop(columns=[target_column])
y = data[target_column]

In [None]:
# Settings shared by both splits: hold out 20%, shuffle, fixed seed.
split_options = dict(test_size=0.2, random_state=12345, shuffle=True)

# First split: separate the test set from everything else.
# NOTE: X_train / y_train still contain the validation rows at this point,
# i.e. they are the train+validation pool.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, **split_options
)

# Second split: carve the validation set out of that pool.
# NOTE: despite its name, X_trainval / y_trainval is the pure training portion.
X_trainval, X_valid, y_trainval, y_valid = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

print(f"Size of training set: {X_trainval.shape[0]}. \nSize of validation set: {X_valid.shape[0]}. \nSize of test set: {X_test.shape[0]}.")

Size of training set: 949. 
Size of validation set: 238. 
Size of test set: 297.


In [None]:
# KNN classifier with default hyperparameters.
# NOTE(review): the grid-search cell that follows builds its own estimator and
# the cross-validation cell rebinds `model_knn`, so this instance looks unused
# in the visible notebook — confirm before removing.
model_knn = KNeighborsClassifier()

In [None]:
# Metrics recorded for every candidate during the search.
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

# Candidates: odd k from 1 to 37, Euclidean distance only.
param_grid = [{'n_neighbors': range(1, 38, 2), 'metric': ['euclidean']}]

# 5-fold stratified CV repeated 10 times with a fixed seed.
cv_strategy = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=12345)

grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring=scoring_metrics,
    refit='precision_macro',  # choose the refit parameter according to metric you want to tune
    cv=cv_strategy,
)

# Tune on the training portion only; the validation rows stay unseen.
grid_search.fit(X_trainval, y_trainval)

# Score the refitted best estimator on the held-out validation rows.
y_true = y_valid
y_pred = grid_search.predict(X_valid)
print(classification_report(y_true, y_pred))
print(grid_search.best_estimator_)
print(grid_search.best_params_)

              precision    recall  f1-score   support

         CYT       0.50      0.59      0.54        74
         ERL       1.00      1.00      1.00         1
         EXC       0.50      0.33      0.40         6
         ME1       0.67      0.86      0.75         7
         ME2       0.67      0.75      0.71         8
         ME3       0.91      0.77      0.83        26
         MIT       0.65      0.56      0.60        39
         NUC       0.54      0.52      0.53        69
         POX       0.67      0.67      0.67         3
         VAC       0.00      0.00      0.00         5

    accuracy                           0.58       238
   macro avg       0.61      0.61      0.60       238
weighted avg       0.58      0.58      0.58       238

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')
{'metric': 'euclidean', 'n_neighbors': 7}


In [None]:
# Build the final model from the hyperparameters the grid search selected,
# rather than hard-coding them, so this cell stays in sync if the parameter
# grid or the data changes.
model_knn = KNeighborsClassifier(**grid_search.best_params_)

# Cross-validate on X_train / y_train, which is the train+validation pool
# produced by the first split (the test set is not touched here).
scores_knn = cross_validate(
    model_knn,
    X_train,
    y_train,
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=12345),
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    return_train_score=True,
)

# One line per recorded quantity (timings and train/test metrics):
# mean and standard deviation across all CV fits.
for name, values in scores_knn.items():
    print(f'{name}: Mean: {values.mean():.5f} - Standard Deviation: {values.std():.5f}')

fit_time: Mean: 0.00343 - Standard Deviation: 0.00034
score_time: Mean: 0.01723 - Standard Deviation: 0.00135
test_accuracy: Mean: 0.57700 - Standard Deviation: 0.02523
train_accuracy: Mean: 0.67013 - Standard Deviation: 0.00851
test_precision_macro: Mean: 0.60144 - Standard Deviation: 0.04694
train_precision_macro: Mean: 0.66236 - Standard Deviation: 0.03248
test_recall_macro: Mean: 0.57635 - Standard Deviation: 0.04646
train_recall_macro: Mean: 0.64413 - Standard Deviation: 0.01049
test_f1_macro: Mean: 0.57181 - Standard Deviation: 0.04254
train_f1_macro: Mean: 0.63909 - Standard Deviation: 0.01925


In [None]:
# Same summary as the printed loop, shown as a table (one row per quantity).
# The std values differ slightly from the printed ones because pandas uses the
# sample standard deviation (ddof=1) while numpy's .std() defaults to ddof=0.
scores_frame = pd.DataFrame(scores_knn)
scores_frame.agg(['mean', 'std']).transpose()

Unnamed: 0,mean,std
fit_time,0.003427,0.000345
score_time,0.017227,0.001363
test_accuracy,0.577,0.025484
train_accuracy,0.670135,0.008601
test_precision_macro,0.601437,0.047412
train_precision_macro,0.662356,0.032808
test_recall_macro,0.576353,0.046928
train_recall_macro,0.644126,0.010594
test_f1_macro,0.571811,0.042967
train_f1_macro,0.63909,0.01945
