# Import libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Import data
Import the red wine and white wine data.

In [None]:
# Load the raw white- and red-wine tables from the local Data/ directory.
# NOTE(review): relies on pandas' default comma delimiter — confirm the CSV files are comma-separated.
white_df = pd.read_csv(filepath_or_buffer="Data/winequality-white.csv")
red_df = pd.read_csv(filepath_or_buffer="Data/winequality-red.csv")

In [None]:
# Separate each table into its target ("quality") and the remaining feature columns.
# Targets are reshaped into (n_samples, 1) column vectors.
red_y = red_df["quality"].values.reshape(-1, 1)
red_X = red_df.drop(columns=["quality"])

white_y = white_df["quality"].values.reshape(-1, 1)
white_X = white_df.drop(columns=["quality"])

In [None]:
# Number of non-missing values in each white-wine feature column.
# (count must be called; the bare attribute only displays the bound method.)
white_X.count()

In [None]:
# Keep the feature column labels (taken from the red-wine feature table);
# they are reused later as the index for the coefficient weights.
featureNames = red_X.columns
featureNames

# Train test split

Stratify the split so that the training and test sets contain the same proportion of each quality score.

In [None]:
from sklearn.model_selection import train_test_split

# Split each dataset into train/test parts, stratified on the quality label
# so both parts keep the same mix of scores; the seed is fixed for reproducibility.
redX_train, redX_test, redy_train, redy_test = train_test_split(
    red_X,
    red_y,
    stratify=red_y,
    random_state=42,
)
whiteX_train, whiteX_test, whitey_train, whitey_test = train_test_split(
    white_X,
    white_y,
    stratify=white_y,
    random_state=42,
)

In [None]:
# Sanity check: tabulate how often each quality score occurs in the training
# labels, to compare against the distribution of scores in the full data.
number_list = np.array(redy_train)
# number_list = np.array(whitey_train)  # switch to inspect the white-wine split instead

unique, counts = np.unique(number_list, return_counts=True)
# One (score, count) row per distinct quality value.
frequencies = np.column_stack((unique, counts))

frequencies

# Scale data

Function that fits either a MinMaxScaler or a StandardScaler to the training features.

In [None]:
def scaleData(XTrain, scaler):
    """Fit the requested feature scaler on the training features.

    Args:
        XTrain: Training feature matrix the scaler is fitted on.
        scaler: Name of the scaler to fit, either "MinMaxScaler" or
            "StandardScaler".

    Returns:
        The fitted MinMaxScaler or StandardScaler instance.

    Raises:
        ValueError: If ``scaler`` is not one of the two supported names.
    """
    if scaler == "MinMaxScaler":
        return MinMaxScaler().fit(XTrain)

    if scaler == "StandardScaler":
        return StandardScaler().fit(XTrain)

    # Fail at the point of the mistake instead of returning None, which would
    # only surface later as an AttributeError when the caller uses the result.
    raise ValueError(
        "Choose MinMaxScaler or StandardScaler (got %r)" % (scaler,)
    )

In [None]:
# Pick the scaler type by name (swap the two assignments to change it).
scaler = "MinMaxScaler"
# scaler = "StandardScaler"

# Fit one scaler per wine colour on the training features only, then apply
# that same fitted scaler to both the training and the test features.
X_red_scaler = scaleData(redX_train, scaler)
X_white_scaler = scaleData(whiteX_train, scaler)

redX_train, redX_test = (
    X_red_scaler.transform(redX_train),
    X_red_scaler.transform(redX_test),
)
whiteX_train, whiteX_test = (
    X_white_scaler.transform(whiteX_train),
    X_white_scaler.transform(whiteX_test),
)

# SVM Model on red wine data

In [None]:
from sklearn.svm import SVC

# Baseline linear-kernel SVM for the red-wine data; the column-vector labels
# are flattened to 1-D before fitting.
red_model = SVC(kernel="linear")
red_model.fit(X=redX_train, y=redy_train.ravel())

# Accuracy of SVM Model on red wine data

In [None]:
# Score of the baseline red-wine SVM on the held-out test split, printed to three decimals
print('Test Accuracy on red %.3f' % red_model.score(redX_test, redy_test))

# SVM Model on white wine data

In [None]:
# Baseline linear-kernel SVM for the white-wine data; the column-vector labels
# are flattened to 1-D before fitting.
white_model = SVC(kernel="linear")
white_model.fit(X=whiteX_train, y=whitey_train.ravel())

# Accuracy of SVM Model on white wine data

In [None]:
# Score of the baseline white-wine SVM on the held-out test split, printed to three decimals
print('Test Accuracy on white %.3f' % white_model.score(whiteX_test, whitey_test))

# Classification Report on red wine data

In [None]:
# Display names for the classification report rows of the red-wine models.
# NOTE(review): assumes the red data contains exactly quality scores 3-8 — confirm against the data.
red_quality_scores = ["Quality 3", "Quality 4", "Quality 5", "Quality 6", "Quality 7", "Quality 8"]

In [None]:
from sklearn.metrics import classification_report

# Per-class report for the baseline red-wine model on the test split.
predictions = red_model.predict(redX_test)
red_report = classification_report(
    redy_test,
    predictions,
    target_names=red_quality_scores,
)
print(red_report)

# Classification Report on white wine data

In [None]:
# Display names for the classification report rows of the white-wine models.
# NOTE(review): assumes the white data contains exactly quality scores 3-9 — confirm against the data.
white_quality_scores = ["Quality 3", "Quality 4", "Quality 5", "Quality 6", "Quality 7", "Quality 8", "Quality 9"]

In [None]:
# Per-class report for the baseline white-wine model on the test split.
predictions = white_model.predict(whiteX_test)
white_report = classification_report(
    whitey_test,
    predictions,
    target_names=white_quality_scores,
)
print(white_report)

# Grid Search Parameter for red wine data

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values of the SVM regularisation parameter C for the red-wine model.
# An earlier, coarser grid was: {'C': [3, 4, 5, 6, 7]}
param_grid = {"C": [4, 4.5, 5.5, 6, 6.5]}

# verbose=3 makes the search log its progress while fitting.
red_grid = GridSearchCV(estimator=red_model, param_grid=param_grid, verbose=3)

In [None]:
# Run the grid search on the red-wine training split. The labels are stored as
# an (n, 1) column vector, so flatten them to 1-D (as done when fitting
# red_model) to avoid a DataConversionWarning on every fit.
red_grid.fit(redX_train, redy_train.ravel())

In [None]:
# Parameter setting selected by the red-wine grid search.
print(red_grid.best_params_)

In [None]:
# Best score reached by the red-wine grid search during the search itself.
print(red_grid.best_score_)

In [None]:
# Score of the tuned red-wine search object on the held-out test split.
print(red_grid.score(redX_test,redy_test))

In [None]:
# Test-split predictions from the tuned red-wine search object (used by the report below).
predictions = red_grid.predict(redX_test)

In [None]:

# Per-class report for the current `predictions` against the red-wine test labels.
print(classification_report(redy_test, predictions,
                            target_names=red_quality_scores))

# Grid Search Parameter for white wine data

In [None]:
# Candidate values of the SVM regularisation parameter C for the white-wine model.
param_grid = {"C": [3, 4, 5, 6, 7]}

# verbose=3 makes the search log its progress while fitting.
white_grid = GridSearchCV(estimator=white_model, param_grid=param_grid, verbose=3)

In [None]:
# Run the grid search on the white-wine training split. The labels are stored
# as an (n, 1) column vector, so flatten them to 1-D (as done when fitting
# white_model) to avoid a DataConversionWarning on every fit.
white_grid.fit(whiteX_train, whitey_train.ravel())

In [None]:
# Parameter setting selected by the white-wine grid search.
print(white_grid.best_params_)

In [None]:
# Best score reached by the white-wine grid search during the search itself.
print(white_grid.best_score_)

In [None]:
# Score of the tuned white-wine search object on the held-out test split.
print(white_grid.score(whiteX_test,whitey_test))

In [None]:
# Per-class report for the tuned white-wine search object on the test split.
predictions = white_grid.predict(whiteX_test)

white_tuned_report = classification_report(
    whitey_test,
    predictions,
    target_names=white_quality_scores,
)
print(white_tuned_report)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the tuned red-wine model on the test split, with rows
# and columns ordered by the classes the model was trained on.
red_predictions = red_grid.predict(redX_test)

cm = confusion_matrix(redy_test, red_predictions, labels=red_grid.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=red_grid.classes_)
disp.plot()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("model_figures", exist_ok=True)
plt.savefig("model_figures/SVM_conf_matrix_red.jpg")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the tuned white-wine model on the test split, with rows
# and columns ordered by the classes the model was trained on.
white_predictions = white_grid.predict(whiteX_test)

cm = confusion_matrix(whitey_test, white_predictions, labels=white_grid.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=white_grid.classes_)
disp.plot()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("model_figures", exist_ok=True)
plt.savefig("model_figures/SVM_conf_matrix_white.jpg")

# Weights of coefficients

Red wine

In [None]:
# A multiclass linear SVC stores one weight row per pair of classes in coef_.
# Average the absolute weights over all rows so the chart reflects every
# pairwise classifier rather than only the first one (coef_[0]).
red_feature_weights = pd.Series(np.abs(red_model.coef_).mean(axis=0), index=featureNames)
red_feature_weights.nlargest(11).plot(kind='barh')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("model_figures", exist_ok=True)
plt.savefig("model_figures/SVM_ftr_weight_red.jpg")

In [None]:
# A multiclass linear SVC stores one weight row per pair of classes in coef_.
# Rank features by their mean absolute weight over all rows instead of using
# only the first pairwise classifier (coef_[0]).
redCoef = pd.Series(np.abs(red_model.coef_).mean(axis=0), index=featureNames).nlargest(11)
redCoef

White wine

In [None]:
# A multiclass linear SVC stores one weight row per pair of classes in coef_.
# Average the absolute weights over all rows so the chart reflects every
# pairwise classifier rather than only the first one (coef_[0]).
white_feature_weights = pd.Series(np.abs(white_model.coef_).mean(axis=0), index=featureNames)
white_feature_weights.nlargest(11).plot(kind='barh')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("model_figures", exist_ok=True)
plt.savefig("model_figures/SVM_ftr_weight_white.jpg")

In [None]:
# A multiclass linear SVC stores one weight row per pair of classes in coef_.
# Rank features by their mean absolute weight over all rows instead of using
# only the first pairwise classifier (coef_[0]).
whiteCoef = pd.Series(np.abs(white_model.coef_).mean(axis=0), index=featureNames).nlargest(11)
whiteCoef

Combine the red and white data into a single dataset containing all wines.

In [None]:
# Tag every row with its wine colour. Assigning a scalar broadcasts to all
# rows whatever the frame's index is, whereas assigning a freshly built
# Series aligns on index labels and can leave NaN in unmatched rows.
red_df['type'] = "red"
white_df['type'] = "white"

# Preview the result (only the last expression of a cell is displayed).
white_df.head()

In [None]:
# Stack the red and white tables into one dataset. ignore_index=True gives the
# combined frame a fresh 0..n-1 index, so row labels from the two sources do
# not collide in the frame or in the index column written to the CSV.
redwhite_df = pd.concat([red_df, white_df], ignore_index=True)
redwhite_df.to_csv("red_white_data.csv")