In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# load a file that uses , as delimiter
def load_file(path, names):
    if not path.is_file():
        raise FileNotFoundError(str(path))
    data = pd.read_csv(path, sep=",", names=names, header=None)
    return data

# load data for hmm
def load_df():
    cols = ["Fixed Acidity", "Volatile Acidity","Citric Acid", "Residual Sugar", "Chlorides", "Free Sulfur Dioxide", "Total Sulfur Dioxide", "Density","pH", "Sulphates", "Alcohol" , "Quality"]
    file = Path.cwd() / "wine_quality.csv"
    return load_file(file, cols)

# removes any rows that have empty data slots ie null
def remove_empty_rows(data):
    data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
    return data
#removes outliers based on zscore greater than 3
def remove_outliers_Zscore(data):
    z = np.abs(stats.zscore(data))
    data_df_z_out =data[(z<3).all(axis=1)]
    return data_df_z_out
#romves outliers based on IQR
def remove_outliers_IQR(data):
    Q1 = data.quantile(.25)
    Q3 = data.quantile(.75)
    IQR = Q3-Q1
    data_df_IQR_out = data[~((data < (Q1 - 1.5 * IQR)) |(data > (Q3 + 1.5 * IQR))).any(axis=1)]
    return data_df_IQR_out
    
#graphs histogram
def plotHistogram(data):
    data.hist(bins=50, color='steelblue', edgecolor='black', linewidth=1.0,
        xlabelsize=8, ylabelsize=8, grid=False)    
    return plt.tight_layout(rect=(0, 0, 1.2, 1.2)) 

#makes a 2d graph comparing all pairs of features in dataframe
def plot2dScatter(data):
    return sns.pairplot(data,diag_kind="kde")

    

In [3]:
data = load_df()

In [4]:
#uncomment to see histogram of data
#hist = plotHistogram(data)
print("Current datasize before cleaning" , data.shape[0])
data = remove_empty_rows(data)


#uncomment to see data b4 cleaning in 2d graph
#plot2d = plot2dScatter(data)
classes = data.groupby('Quality')
print("Number of classes before cleaning data" , classes.sum().shape[0])


print("Removing outliers using z-score")

#data = remove_outliers_IQR(data)
data = remove_outliers_Zscore(data)
print("Current data size after removing outliers", data.shape[0])

#Uncomment this to look at 2d plot of cleaned data
#plot2d = plot2dScatter(data)

classes = data.groupby('Quality')
print("Number of classes after cleaning data", classes.count().shape[0])


Current datasize before cleaning 4901
Number of classes before cleaning data 9
Removing outliers using z-score
Current data size after removing outliers 4558
Number of classes after cleaning data 5


In [5]:

X = data.drop(columns=["Quality"])
y = data["Quality"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

In [7]:
k = 4

In [8]:
model = KNeighborsClassifier(n_neighbors=k)
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [9]:
model.predict(X_test)

array([6., 5., 7., 6., 6., 6., 6., 7., 7., 6., 6., 6., 6., 6., 6., 6., 6.,
       5., 5., 5., 6., 6., 5., 6., 6., 7., 7., 6., 5., 6., 6., 6., 5., 5.,
       6., 6., 7., 5., 5., 6., 5., 7., 5., 6., 6., 7., 5., 8., 6., 6., 6.,
       5., 6., 5., 5., 6., 6., 7., 6., 6., 6., 6., 5., 6., 6., 6., 5., 6.,
       5., 5., 7., 5., 5., 7., 6., 6., 6., 5., 5., 6., 5., 7., 5., 5., 5.,
       6., 6., 6., 7., 6., 7., 6., 6., 5., 5., 5., 5., 7., 5., 6., 5., 6.,
       7., 6., 5., 8., 5., 6., 6., 6., 6., 5., 6., 7., 5., 7., 6., 6., 6.,
       6., 6., 6., 5., 7., 5., 5., 5., 6., 6., 7., 6., 5., 6., 5., 5., 6.,
       7., 5., 5., 5., 6., 5., 5., 7., 5., 8., 5., 6., 5., 7., 5., 5., 6.,
       7., 5., 6., 6., 5., 6., 7., 7., 5., 5., 6., 6., 6., 7., 6., 6., 6.,
       6., 7., 5., 5., 7., 5., 4., 5., 6., 5., 7., 5., 6., 5., 7., 5., 5.,
       6., 6., 5., 6., 7., 6., 6., 8., 5., 7., 6., 7., 6., 5., 5., 6., 7.,
       7., 6., 5., 5., 5., 6., 6., 6., 6., 6., 5., 6., 6., 6., 8., 6., 6.,
       6., 5., 6., 6., 5.

In [10]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.727921009325288
Test score: 0.5756578947368421


In [11]:
score = cross_val_score(model, X_train, y_train, cv=5)
score

array([0.55068493, 0.57064472, 0.56241427, 0.53772291, 0.55281207])

In [12]:
np.arange(1, 26, 1) 

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25])

In [13]:
grid_params = {
    'n_neighbors': np.arange(1,26,1),
}

gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose=1, cv = 5, n_jobs=-1)

gs_results = gs.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    6.0s finished


In [14]:
gs_results.best_score_

0.6173889193636862

In [15]:
gs_results.best_params_

{'n_neighbors': 1}