In [1]:
import pandas as pd
from sklearn import model_selection as ms
from sklearn import neighbors as nb
from sklearn import preprocessing as pc

In [2]:
# The raw file has no header row: the first column is the class label and the
# remaining 13 feature columns are simply named '1'..'13'.
col_names = ['class', *map(str, range(1, 14))]
ds = pd.read_csv('data/wine.data', names=col_names)  # comma is read_csv's default separator
ds

Unnamed: 0,class,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


### Найдём точность классификации на кросс-валидации и оптимальное k без нормировки признаков

In [3]:
# Target vector: the 'class' column.
y = ds['class']
# Feature matrix: every column except 'class'.
X = ds.drop(columns='class')
# Shuffled 5-fold splitter with a fixed seed, shared by every evaluation below.
cv = ms.KFold(n_splits=5, shuffle=True, random_state=42)

def get_classification_accuracy(X, y: pd.Series, cv, max_k: int = 50) -> tuple[float, int]:
    """Find the k for k-NN that gives the best mean cross-validated accuracy.

    Every k from 1 to ``max_k`` (inclusive) is evaluated with
    ``cross_val_score(..., scoring="accuracy")`` and the per-fold scores are
    averaged.

    Args:
        X: Feature matrix, forwarded unchanged to ``cross_val_score``. It is
            deliberately not annotated as a DataFrame: this notebook also
            passes the result of ``pc.scale``, which is not a DataFrame.
        y: Target labels, forwarded unchanged to ``cross_val_score``.
        cv: Cross-validation splitter/spec, forwarded unchanged to
            ``cross_val_score``.
        max_k: Largest number of neighbours to try. The default of 50 keeps
            the original search range of 1..50.

    Returns:
        ``(best_score, opt_k)``: the highest mean accuracy and the k that
        achieved it. When several k tie for the best score, the largest of
        them is returned.

    Raises:
        ValueError: If ``max_k`` is smaller than 1 (nothing to evaluate).
    """
    if max_k < 1:
        raise ValueError(f"max_k must be at least 1, got {max_k}")

    best_score, opt_k = None, None
    for k in range(1, max_k + 1):
        model = nb.KNeighborsClassifier(n_neighbors=k)
        fold_scores = ms.cross_val_score(estimator=model, X=X, y=y, cv=cv, scoring="accuracy")
        score = fold_scores.mean()
        # `>=` rather than `>`: on equal scores the later (larger) k replaces
        # the earlier one. Kept as-is so previously recorded results still hold.
        if best_score is None or score >= best_score:
            best_score, opt_k = score, k
    return best_score, opt_k

# Baseline: search for the best k on the raw (unscaled) features.
score, k = get_classification_accuracy(X=X, y=y, cv=cv)

(f"{score:.2f}", k)

('0.73', 1)

### Приведём признаки к одному масштабу

In [4]:
# Same search as above, but on features standardised with pc.scale.
# NOTE(review): pc.scale is applied to the whole of X before the CV split, so
# the held-out rows of every fold contribute to the scaling statistics.
# Fitting the scaler inside each fold (e.g. a scaler + k-NN pipeline) would
# avoid that — confirm whether the current setup is intentional.
scale_score, scale_k = get_classification_accuracy(X=pc.scale(X), y=y, cv=cv)
(f"{scale_score:.2f}", scale_k)

('0.98', 29)