# Import libraries

In [None]:
# Update scikit-learn to prevent version mismatches
# !pip install scikit-learn --upgrade

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Import data

In [None]:
# Read the red and white wine-quality CSV files into separate DataFrames
df_red, df_white = map(
    pd.read_csv,
    ("Data/winequality-red.csv", "Data/winequality-white.csv"),
)


In [None]:
# Preview the first five rows of the red wine data
df_red.head(n=5)

In [None]:
# Preview the first five rows of the white wine data
df_white.head(n=5)

# clean data 
(turns out no cleaning was really needed)

In [None]:
# Report the (rows, columns) shape of each frame before any cleaning
print(
    f'red wine DF initial: {df_red.shape}',
    f'white wine DF initial: {df_white.shape}',
    sep="\n",
)

In [None]:
#drop any null rows
# Discard every row that contains at least one missing value
df_red, df_white = df_red.dropna(), df_white.dropna()

In [None]:
# Report the shapes again so they can be compared with the pre-cleaning shapes
print(
    f'red wine DF after dropNA: {df_red.shape}',
    f'white wine DF dropNA: {df_white.shape}',
    sep="\n",
)

In [None]:

# Collect the distinct quality ratings that occur in each dataset
# (despite the "_count" suffix these hold the sorted unique ratings, not counts)
red_labels_count, white_labels_count = (
    np.unique(frame['quality']) for frame in (df_red, df_white)
)
print(
    f'red wine unique quality ratings recorded: {red_labels_count}',
    f'white wine unique quality ratings recorded: {white_labels_count}',
    sep="\n",
)

#### Interesting to note that neither the red nor the white datasets contain wines with all possible ratings (0 to 10)

# train_test_split

In [None]:
#red wine y-values
# Target vectors (y-values): the quality rating of each red and white wine
red_targets, white_targets = df_red["quality"], df_white["quality"]

In [None]:
#red wine x-values
# Feature matrices (x-values): every column except the quality rating
red_features = df_red.drop("quality", axis=1)
white_features = df_white.drop("quality", axis=1)


In [None]:
# Split each dataset into train and test partitions; the fixed random_state
# makes the split repeatable across runs.
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(
    red_features,
    red_targets,
    random_state=43,
)

X_train_white, X_test_white, y_train_white, y_test_white = train_test_split(
    white_features,
    white_targets,
    random_state=43,
)

In [None]:
#checking category freqencies
# Checking category frequencies: tabulate how often each quality rating occurs
# in the red-wine training and test targets as (rating, count) rows.
train_number_list = np.array(y_train_red)
unique, counts = np.unique(train_number_list, return_counts=True)
train_frequencies = np.column_stack((unique, counts))

test_number_list = np.array(y_test_red)
unique_test, counts_test = np.unique(test_number_list, return_counts=True)
test_frequencies = np.column_stack((unique_test, counts_test))

print("train category freqencies", train_frequencies, sep="\n")
print("test category frequencies", test_frequencies, sep="\n")


# One hot encode y-values

Note that the red one-hot encoded values have 9 positions and the white ones have 10 positions. This is because the encoding reserves one position for every integer from 0 up to the largest rating: the max quality value in the red data set is 8 and the max value in the white set is 9. See above for an example of this.

In [None]:
# to_categorical infers the number of columns from the largest value in the
# array it is given. If a split happens to miss the highest rating, the train
# and test encodings (and the model's output layer) would end up with different
# widths. Pin the width per wine type to "highest rating seen in either split + 1"
# so all encodings for the same dataset always line up.
red_num_classes = int(max(y_train_red.max(), y_test_red.max())) + 1
white_num_classes = int(max(y_train_white.max(), y_test_white.max())) + 1

oh_y_train_red = to_categorical(y_train_red, num_classes=red_num_classes)
oh_y_test_red = to_categorical(y_test_red, num_classes=red_num_classes)
oh_y_train_white = to_categorical(y_train_white, num_classes=white_num_classes)
oh_y_test_white = to_categorical(y_test_white, num_classes=white_num_classes)

# Show one encoded sample from each array as a sanity check on the widths
print(oh_y_train_red[1])
print(oh_y_test_red[1])
print(oh_y_train_white[1])
print(oh_y_test_white[1])

# scale data

In [None]:
# Fit one min-max scaler per wine type on the training features only, then
# apply that same fitted scaler to both the training and the test features.
X_red_scaler = MinMaxScaler().fit(X_train_red)
X_white_scaler = MinMaxScaler().fit(X_train_white)

X_train_red_scaled, X_test_red_scaled = (
    X_red_scaler.transform(part) for part in (X_train_red, X_test_red)
)
X_train_white_scaled, X_test_white_scaled = (
    X_white_scaler.transform(part) for part in (X_train_white, X_test_white)
)

# instantiate and train the model

accuracy results without specifying a "stratify" parameter:  <br>
red model accuracy: 0.6399999856948853 <br>
white model accuracy: 0.5493877530097961 <br>

accuracy results with "stratify" parameter:  <br>
red model accuracy: 0.4399999976158142   <br>
white model accuracy: 0.44897958636283875  <br>

Surprisingly, running the model with the stratify parameter returns much worse results for both red wine and white wine data.


In [None]:
# Number of input features (columns) fed to the red-wine network
red_features_count = red_features.shape[1]


# The output layer needs 9 units because the max score in the red dataset is "8"

def create_red_model():
    """Build and compile the red-wine classifier network.

    The network is a 20-unit ReLU hidden layer taking ``red_features_count``
    inputs, followed by a 9-unit softmax output layer. It is compiled with the
    "adam" optimizer, categorical cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Dense(units=20, activation="relu", input_dim=red_features_count),
        Dense(units=9, activation="softmax"),
    ])
    network.compile(
        optimizer="adam",
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Wrap the builder in KerasClassifier so the network can be used through the
# scikit-learn estimator interface (fit / score / predict).
red_model = KerasClassifier(
    build_fn=create_red_model,
    epochs=100,
    batch_size=10,
)




In [None]:
# Train the red-wine classifier on the scaled features and one-hot labels
red_model.fit(
    X_train_red_scaled,
    oh_y_train_red,
    epochs=100,
    shuffle=True,
    verbose=4,
)

In [None]:
# Number of input features (columns) fed to the white-wine network
white_features_count = white_features.shape[1]

# The output layer needs 10 units because the max quality score in the white dataset is "9"

def create_white_model():
    """Build and compile the white-wine classifier network.

    The network is a 20-unit ReLU hidden layer taking ``white_features_count``
    inputs, followed by a 10-unit softmax output layer. It is compiled with the
    "adam" optimizer, categorical cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Dense(units=20, activation="relu", input_dim=white_features_count),
        Dense(units=10, activation="softmax"),
    ])
    network.compile(
        optimizer="adam",
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Wrap the builder in KerasClassifier so the network can be used through the
# scikit-learn estimator interface (fit / score / predict).
white_model = KerasClassifier(
    build_fn=create_white_model,
    epochs=100,
    batch_size=10,
)





In [None]:
# Train the white-wine classifier on the scaled features and one-hot labels
white_model.fit(
    X_train_white_scaled,
    oh_y_train_white,
    epochs=100,
    shuffle=True,
    verbose=4,
)

# test models

In [None]:
# Score the trained red-wine classifier on the held-out test data
red_model_accuracy = red_model.score(
    X_test_red_scaled,
    oh_y_test_red,
    verbose=3,
)

print(f'red model accuracy: {red_model_accuracy}')

In [None]:
# Score the trained white-wine classifier on the held-out test data
white_model_accuracy = white_model.score(
    X_test_white_scaled,
    oh_y_test_white,
    verbose=3,
)

print(f'white model accuracy: {white_model_accuracy}')

# output confusion matrices

In [None]:
# Red confusion matrix: compare the test-set quality ratings with the classes
# predicted by the trained red-wine model, then plot and save the result.
# matplotlib.pyplot (plt), confusion_matrix and ConfusionMatrixDisplay are
# already imported in the notebook's import cell, so they are not re-imported here.

red_predictions = red_model.predict(X_test_red_scaled)
cm = confusion_matrix(y_test_red, red_predictions, labels=red_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=red_model.classes_)

disp.plot()

# Saves the figure that disp.plot() just drew (pyplot's current figure)
plt.savefig('model_figures/red_neural_network_confusion.jpg')


In [None]:
# White confusion matrix: compare the test-set quality ratings with the classes
# predicted by the trained white-wine model, then plot and save the result.

white_predictions = white_model.predict(X_test_white_scaled)
cm = confusion_matrix(
    y_test_white,
    white_predictions,
    labels=white_model.classes_,
)
disp = ConfusionMatrixDisplay(cm, display_labels=white_model.classes_)

disp.plot()

# Saves the figure that disp.plot() just drew (pyplot's current figure)
plt.savefig('model_figures/white_neural_network_confusion.jpg')
