In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("./data/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [6]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [7]:
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [14]:
from sklearn import datasets
X, y  = datasets.load_wine(return_X_y=True)

In [15]:
X.shape,y.shape

((178, 13), (178,))

In [19]:
# Check the label distribution
df_labels = pd.Series(y)
df_labels.value_counts()

1    71
0    59
2    48
dtype: int64

In [20]:
df_features = pd.DataFrame(X)
df_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [21]:
# Prepare train test
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state= 49, stratify = y)

In [23]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((124, 13), (54, 13), (124,), (54,))

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [28]:
knn = KNeighborsClassifier(5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [33]:
# Set average param for multi class
accuracy_score(y_test, y_pred), precision_score(y_test, y_pred,  average='weighted'), recall_score(y_test, y_pred, average='weighted')

(0.7592592592592593, 0.7528860028860028, 0.7592592592592593)

In [35]:
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()
hyper_params = {
    'n_neighbors': [5,7,10],
    'p': [1,2],
    'algorithm': ["auto", "ball_tree", "kd_tree", "brute"]
}

for folds in range(3,6):
    grid = GridSearchCV(
        knn,
        param_grid = hyper_params,
        scoring = 'accuracy',
        n_jobs =-1,
        cv = folds
    )
    
    grid.fit(X_train, y_train)
    score = grid.score(X_test, y_test)
    print("Folds ", folds, " Score: ", score, " best_params: ", grid.best_params_)

Folds  3  Score:  0.7962962962962963  best_params:  {'algorithm': 'auto', 'n_neighbors': 7, 'p': 1}
Folds  4  Score:  0.7962962962962963  best_params:  {'algorithm': 'auto', 'n_neighbors': 7, 'p': 1}
Folds  5  Score:  0.7777777777777778  best_params:  {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1}


In [36]:
knn_best_model = KNeighborsClassifier(n_neighbors=7, p=1)
knn_best_model.fit(X_train, y_train)
y_pred_best_model = knn_best_model.predict(X_test)
accuracy_score(y_test, y_pred_best_model), precision_score(y_test, y_pred_best_model,  average='weighted'), recall_score(y_test, y_pred_best_model, average='weighted')

(0.7962962962962963, 0.7958152958152959, 0.7962962962962963)