# Import Libraries

In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np

In [None]:
# Relative paths of the red and white wine-quality CSV files
red_csv, white_csv = (
    "Data/winequality-red.csv",
    "Data/winequality-white.csv",
)

In [None]:
# Read each CSV into its own DataFrame
red, white = (pd.read_csv(csv_path) for csv_path in (red_csv, white_csv))

# Show the first rows of the red-wine table as the cell output
red.head()

# Scale Red Wine Data 

In [None]:
# Split the red-wine table into a feature matrix X and a target y
red_X = red[
    [
        "fixed acidity",
        "volatile acidity",
        "citric acid",
        "residual sugar",
        "chlorides",
        "free sulfur dioxide",
        "total sulfur dioxide",
        "density",
        "pH",
        "sulphates",
        "alcohol",
    ]
]

# Target: the "quality" column reshaped to a single-column 2-D array
red_quality = red["quality"]
red_y = red_quality.values.reshape(-1, 1)

print(red_X.shape, red_y.shape)

In [None]:
# Train/test split for the red-wine data
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split proportion applies.
# stratify=red_y asks for the quality-class proportions to be preserved in
# both parts; random_state=42 makes the shuffle reproducible.
redX_train, redX_test, redy_train, redy_test = train_test_split(
    red_X,
    red_y,
    random_state=42,
    stratify=red_y,
)

In [None]:
# Scale the red-wine features
from sklearn.preprocessing import StandardScaler

# Build a StandardScaler, then fit it on the training features only
redX_scaler = StandardScaler()
redX_scaler.fit(redX_train)

In [None]:
# Apply the scaler fitted on the training features to both the training and
# the testing features (the target y is not scaled)
redX_train_scaled, redX_test_scaled = (
    redX_scaler.transform(features) for features in (redX_train, redX_test)
)

In [None]:
# Count how many red-wine training samples fall into each quality class
number_list = np.array(redy_train)

unique, counts = np.unique(number_list, return_counts=True)
# One row per class: [class label, number of samples]
frequencies = np.column_stack((unique, counts))

frequencies

In [None]:
# Count how many red-wine test samples fall into each quality class
number_list = np.array(redy_test)

unique, counts = np.unique(number_list, return_counts=True)
# One row per class: [class label, number of samples]
frequencies = np.column_stack((unique, counts))

frequencies

# K Nearest Neighbors - Red

In [None]:
# Loop through different k values to see which has the highest accuracy.
# Note: only odd k values are tried (step of 2 starting at 1). An odd k rules
# out a tied vote between two classes; with more than two quality classes a
# tie can still occur.
red_k_values = range(1, 50, 2)

# scikit-learn expects 1-D targets; the y arrays were stored as column vectors
redy_train_1d = redy_train.ravel()
redy_test_1d = redy_test.ravel()

train_scores = []
test_scores = []
for k in red_k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(redX_train_scaled, redy_train_1d)
    # score() returns the mean accuracy on the data it is given
    train_score = knn.score(redX_train_scaled, redy_train_1d)
    test_score = knn.score(redX_test_scaled, redy_test_1d)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Plot both accuracy curves against the same k values used in the loop
plt.plot(red_k_values, train_scores, marker='o', label="train")
plt.plot(red_k_values, test_scores, marker="x", label="test")
plt.xlabel("k neighbors")
# The plotted values are accuracy scores, not wine-quality values
plt.ylabel("Accuracy")
plt.legend()

plt.savefig("Red Wine KNN plot"+'.jpg')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Choose k explicitly here instead of reusing whatever value the loop variable
# `k` was left holding by an earlier cell (its last candidate, not a selected
# one). Candidates are compared by 3-fold cross-validated accuracy on the
# training data only, so the test set is not used for model selection.
red_train_labels = redy_train.ravel()
red_test_labels = redy_test.ravel()
red_candidate_ks = range(1, 50, 2)
red_cv_accuracy = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=candidate),
        redX_train_scaled,
        red_train_labels,
        cv=3,
    ).mean()
    for candidate in red_candidate_ks
]
# argmax returns the first maximum, so ties resolve to the smallest k
red_best_k = red_candidate_ks[int(np.argmax(red_cv_accuracy))]
print(f"Selected k = {red_best_k} (mean CV accuracy {max(red_cv_accuracy):.3f})")

# Refit on the full training set with the selected k
knn = KNeighborsClassifier(n_neighbors=red_best_k)
knn.fit(redX_train_scaled, red_train_labels)

# Confusion matrix of the test-set predictions, one row/column per class
# seen during fitting
predictions = knn.predict(redX_test_scaled)
cm = confusion_matrix(red_test_labels, predictions, labels=knn.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=knn.classes_)
disp.plot()

# Save before show() so the figure is written while it is still current
plt.savefig("Red Wine KNN Matrix"+'.jpg')

plt.show()

# Scale White Wine Data

In [None]:
# Split the white-wine table into a feature matrix X and a target y
white_X = white[
    [
        "fixed acidity",
        "volatile acidity",
        "citric acid",
        "residual sugar",
        "chlorides",
        "free sulfur dioxide",
        "total sulfur dioxide",
        "density",
        "pH",
        "sulphates",
        "alcohol",
    ]
]

# Target: the "quality" column reshaped to a single-column 2-D array
white_quality = white["quality"]
white_y = white_quality.values.reshape(-1, 1)

print(white_X.shape, white_y.shape)

In [None]:
# Train/test split for the white-wine data
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split proportion applies.
# stratify=white_y asks for the quality-class proportions to be preserved in
# both parts; random_state=42 makes the shuffle reproducible.
whiteX_train, whiteX_test, whitey_train, whitey_test = train_test_split(
    white_X,
    white_y,
    random_state=42,
    stratify=white_y,
)

In [None]:
# Scale the white-wine features
from sklearn.preprocessing import StandardScaler

# Build a StandardScaler, then fit it on the training features only
whiteX_scaler = StandardScaler()
whiteX_scaler.fit(whiteX_train)

In [None]:
# Apply the scaler fitted on the training features to both the training and
# the testing features (the target y is not scaled)
whiteX_train_scaled, whiteX_test_scaled = (
    whiteX_scaler.transform(features) for features in (whiteX_train, whiteX_test)
)

In [None]:
# Count how many white-wine training samples fall into each quality class
number_list = np.array(whitey_train)

unique, counts = np.unique(number_list, return_counts=True)
# One row per class: [class label, number of samples]
frequencies = np.column_stack((unique, counts))

frequencies

In [None]:
# Count how many white-wine test samples fall into each quality class
number_list = np.array(whitey_test)

unique, counts = np.unique(number_list, return_counts=True)
# One row per class: [class label, number of samples]
frequencies = np.column_stack((unique, counts))

frequencies

# K Nearest Neighbors - White

In [None]:
# Loop through different k values to see which has the highest accuracy.
# Note: only odd k values are tried (step of 2 starting at 1). An odd k rules
# out a tied vote between two classes; with more than two quality classes a
# tie can still occur.
white_k_values = range(1, 50, 2)

# scikit-learn expects 1-D targets; the y arrays were stored as column vectors
whitey_train_1d = whitey_train.ravel()
whitey_test_1d = whitey_test.ravel()

train_scores = []
test_scores = []
for k in white_k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(whiteX_train_scaled, whitey_train_1d)
    # score() returns the mean accuracy on the data it is given
    train_score = knn.score(whiteX_train_scaled, whitey_train_1d)
    test_score = knn.score(whiteX_test_scaled, whitey_test_1d)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Plot both accuracy curves against the same k values used in the loop
plt.plot(white_k_values, train_scores, marker='o', label="train")
plt.plot(white_k_values, test_scores, marker="x", label="test")
plt.xlabel("k neighbors")
# The plotted values are accuracy scores, not wine-quality values
plt.ylabel("Accuracy")
plt.legend()

plt.savefig("White Wine KNN plot"+'.jpg')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Choose k explicitly here instead of reusing whatever value the loop variable
# `k` was left holding by an earlier cell (its last candidate, not a selected
# one). Candidates are compared by 3-fold cross-validated accuracy on the
# training data only, so the test set is not used for model selection.
white_train_labels = whitey_train.ravel()
white_test_labels = whitey_test.ravel()
white_candidate_ks = range(1, 50, 2)
white_cv_accuracy = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=candidate),
        whiteX_train_scaled,
        white_train_labels,
        cv=3,
    ).mean()
    for candidate in white_candidate_ks
]
# argmax returns the first maximum, so ties resolve to the smallest k
white_best_k = white_candidate_ks[int(np.argmax(white_cv_accuracy))]
print(f"Selected k = {white_best_k} (mean CV accuracy {max(white_cv_accuracy):.3f})")

# Refit on the full training set with the selected k
knn = KNeighborsClassifier(n_neighbors=white_best_k)
knn.fit(whiteX_train_scaled, white_train_labels)

# Confusion matrix of the test-set predictions, one row/column per class
# seen during fitting
predictions = knn.predict(whiteX_test_scaled)
cm = confusion_matrix(white_test_labels, predictions, labels=knn.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=knn.classes_)
disp.plot()

# Save before show() so the figure is written while it is still current
plt.savefig("White Wine KNN Matrix"+'.jpg')

plt.show()

In [None]:
# used https://www.scikit-yb.org/en/latest/api/classifier/confusion_matrix.html as a reference
#from sklearn.datasets import load_digits
#from sklearn.model_selection import train_test_split as tts
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import ConfusionMatrixDisplay
#from sklearn import svm, datasets

In [None]:
#list(red)

In [None]:
#assign data to X and y
#from sklearn.metrics import confusion_matrix

#assign data to X and y
#X = red[["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol"]]
#y = red["quality"].values.reshape(-1, 1)
#class_names = list(red)


In [None]:
# Split the data into a training set and a test set
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Run classifier, using a model that is too regularized (C too low) to see
# the impact on the results
#classifier = svm.SVC(kernel="linear", C=0.01).fit(X_train, y_train.ravel())


In [None]:
#np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
#titles_options = [
    #("Confusion matrix, without normalization", None),
    #("Normalized confusion matrix", "true"),
#]
#for title, normalize in titles_options:
    #disp = ConfusionMatrixDisplay.from_estimator(
       # classifier,
       # X_test,
       # y_test,
       # display_labels=class_names,
       # cmap=plt.cm.Blues,
       # normalize=normalize,
   # )
    #disp.ax_.set_title(title)

   # print(title)
    #print(disp.confusion_matrix)
#
#plt.show()