In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
from scipy import stats
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
def load_file(path, names):
    """Read a comma-delimited, header-less text file into a DataFrame.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the file. Plain strings are accepted as well as
        ``pathlib.Path`` objects.
    names : list of str
        Column labels to assign, in file order.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns labelled by ``names``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not point to an existing regular file.
    """
    # Normalise first so string paths work too; calling .is_file() directly
    # on a str would raise AttributeError instead of a helpful error.
    path = Path(path)
    if not path.is_file():
        raise FileNotFoundError(str(path))
    # header=None: the first line of the file is read as data, not as labels.
    return pd.read_csv(path, sep=",", names=names, header=None)

def load_df():
    """Load ``wine_quality.csv`` from the current working directory.

    The file is read through ``load_file`` with twelve column labels,
    the last of which is ``"Quality"``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``load_file`` returns for that file and those labels.
    """
    column_names = [
        "Fixed Acidity",
        "Volatile Acidity",
        "Citric Acid",
        "Residual Sugar",
        "Chlorides",
        "Free Sulfur Dioxide",
        "Total Sulfur Dioxide",
        "Density",
        "pH",
        "Sulphates",
        "Alcohol",
        "Quality",
    ]
    csv_path = Path.cwd().joinpath("wine_quality.csv")
    return load_file(csv_path, column_names)

def remove_empty_rows(data):
    """Drop every row that contains at least one missing (null) value.

    The frame is modified in place and the same object is returned, so a
    caller may use the return value or keep its existing reference.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; it is mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now without rows holding nulls.
    """
    # Pass only `how`: current pandas raises TypeError when `how` and
    # `thresh` are both supplied, even when thresh is None. The remaining
    # arguments that used to be spelled out were just the defaults.
    data.dropna(axis=0, how="any", inplace=True)
    return data

def remove_outliers(data, threshold=3):
    """Keep only the rows whose absolute z-score is below ``threshold`` in every column.

    Z-scores are computed per column with ``scipy.stats.zscore`` over all
    rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to filter. It is not modified.
    threshold : float, default 3
        A row is kept only if ``abs(z) < threshold`` holds for each of its
        values.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` that passes the filter, original index kept.

    Notes
    -----
    A NaN z-score never satisfies ``< threshold``, so a row with an
    undefined z-score in any column is dropped.
    """
    # Coerce to a plain ndarray so the row mask is positional regardless of
    # whether scipy hands back an array or a DataFrame.
    z_scores = np.abs(np.asarray(stats.zscore(data)))
    keep = (z_scores < threshold).all(axis=1)
    return data[keep]

In [4]:
# Load the raw table and show it for a first look.
data = load_df()
# Optional exploratory plot, left disabled:
#sns.boxplot(x=data["Fixed Acidity"])
data

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol,Quality
0,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6.0
1,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
2,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
3,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6.0
4,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4896,4.9,0.235,0.27,11.75,0.030,34.0,118.0,0.99540,3.07,0.50,9.4,6.0
4897,6.1,0.340,0.29,2.20,0.036,25.0,100.0,0.98938,3.06,0.44,11.8,6.0
4898,5.7,0.210,0.32,0.90,0.038,38.0,121.0,0.99074,3.24,0.46,10.6,6.0
4899,6.5,0.230,0.38,1.30,0.032,29.0,112.0,0.99298,3.29,0.54,9.7,5.0


In [5]:
# Drop rows that contain nulls, then show the remaining (rows, columns).
data = remove_empty_rows(data)
data.shape

(4900, 12)

In [6]:
# Drop rows flagged as outliers by the z-score filter, then show the new shape.
# NOTE(review): `data` still includes the "Quality" column at this point, so the
# target takes part in the filter and rare quality grades may be removed —
# confirm this is intended.
# NOTE(review): this cell reassigns `data` from itself; re-running it filters the
# already-filtered frame again, so the row count can shrink on every run.
data = remove_outliers(data)
data.shape

(4558, 12)

In [7]:
# Separate the predictor columns (X) from the target column (y).
X = data.drop(columns=["Quality"])
y = data["Quality"]

In [8]:
# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# the same in both parts. The fixed random_state makes the split, and every
# score computed from it, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Fit the scaler on the training part only and reuse its statistics for the
# test part, so nothing from the test rows influences the preprocessing.
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

In [9]:
# Number of neighbours used by the first, hand-picked KNN model below.
k = 4

In [10]:
# Baseline k-nearest-neighbours classifier, fitted on the scaled training split.
model = KNeighborsClassifier(n_neighbors=k)
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [11]:
# Predicted quality labels for the held-out test rows.
model.predict(X_test)

array([5., 6., 6., 7., 7., 7., 7., 5., 7., 5., 7., 5., 7., 5., 6., 6., 4.,
       6., 6., 5., 6., 7., 5., 7., 6., 7., 5., 5., 6., 6., 6., 5., 5., 5.,
       5., 5., 6., 5., 6., 6., 6., 6., 6., 5., 5., 6., 6., 5., 5., 6., 6.,
       6., 7., 6., 5., 7., 5., 6., 6., 6., 6., 6., 7., 5., 5., 5., 6., 6.,
       6., 6., 5., 6., 8., 7., 5., 8., 4., 6., 7., 5., 5., 6., 5., 7., 5.,
       6., 6., 5., 5., 6., 8., 6., 4., 5., 5., 5., 6., 5., 5., 7., 5., 6.,
       6., 5., 5., 6., 5., 6., 5., 6., 5., 7., 5., 6., 6., 6., 4., 5., 7.,
       5., 6., 5., 5., 5., 6., 5., 5., 6., 5., 6., 7., 6., 5., 7., 6., 5.,
       7., 5., 6., 5., 5., 6., 7., 6., 5., 5., 5., 5., 5., 6., 6., 5., 5.,
       7., 5., 6., 5., 6., 7., 5., 6., 6., 5., 4., 5., 6., 7., 5., 6., 6.,
       5., 6., 7., 6., 6., 6., 6., 6., 5., 6., 5., 6., 5., 5., 6., 6., 5.,
       7., 6., 6., 7., 4., 5., 5., 4., 6., 5., 5., 5., 5., 6., 6., 6., 7.,
       6., 5., 5., 7., 6., 6., 6., 6., 5., 6., 5., 7., 7., 8., 7., 6., 5.,
       6., 5., 5., 6., 5.

In [12]:
# Score the fitted model on the rows it was trained on and on the held-out
# rows; a large gap between the two suggests over-fitting.
train_score = model.score(X_train, y_train)
print(f"Train score: {train_score}")

test_score = model.score(X_test, y_test)
print(f"Test score: {test_score}")

Train score: 0.7174986286341196
Test score: 0.5723684210526315


In [13]:
# 5-fold cross-validation of the baseline model on the training split only;
# `score` holds one value per fold.
score = cross_val_score(model, X_train, y_train, cv=5)
score

array([0.55479452, 0.53223594, 0.5473251 , 0.55144033, 0.54458162])

In [63]:
# Preview of the candidate n_neighbors values (1 through 25) used in the grid search.
np.arange(1, 26, 1) 

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25])

In [64]:
# Candidate values for n_neighbors: every integer from 1 to 25.
grid_params = {"n_neighbors": np.arange(1, 26)}

# Exhaustive search with 5-fold cross-validation; n_jobs=-1 runs the fits in
# parallel on all available cores and verbose=1 prints progress.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the fitted search object, so gs_results refers to the same
# object as gs.
gs_results = gs.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    4.8s finished


In [65]:
# Mean cross-validated score of the best parameter setting found by the search.
gs_results.best_score_

0.6220515633571037

In [66]:
# Parameter setting that achieved the best mean cross-validated score.
# NOTE(review): if the search picks n_neighbors=1, check for duplicated rows
# (e.g. data.duplicated().sum()); identical rows landing in different folds
# would favour a 1-nearest-neighbour model — confirm before trusting the result.
gs_results.best_params_

{'n_neighbors': 1}