In [86]:
import pandas as pd
import numpy as np
import random
import numpy as np

from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
csv_path = "/content/winequality-red.csv"

In [98]:
df = pd.read_csv(csv_path)

In [5]:
# index_names = wines[wines["quality"] == 3].index
# wines.drop(index_names, inplace=True)

In [99]:
wines = df.sample(frac=1).copy()
train_dataset = wines.sample(frac=0.8, random_state=0)
wines = wines.drop(train_dataset.index)
validation_dataset = wines.sample(frac=0.5, random_state=0)
test_dataset = wines.drop(validation_dataset.index)
# test_dataset = wines.drop(train_dataset.index)

In [100]:
print(train_dataset)
print(validation_dataset)
print(test_dataset)

assert len(train_dataset) + len(validation_dataset) + len(test_dataset) == len(df)

      fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
869             7.6              0.63         0.03  ...       0.64     10.9        6
1216            7.9              0.57         0.31  ...       0.69      9.5        6
935             8.8              0.30         0.38  ...       0.72     11.8        6
798             9.4              0.50         0.34  ...       0.52     10.7        6
1351            7.2              0.62         0.01  ...       0.51     11.8        6
...             ...               ...          ...  ...        ...      ...      ...
236             7.2              0.63         0.00  ...       0.58      9.0        6
760             9.0              0.58         0.25  ...       0.57      9.7        5
95              4.7              0.60         0.17  ...       0.60     12.9        6
1143            7.0              0.22         0.30  ...       0.82     10.0        6
837             6.7              0.28         0.28  ...       0.3

In [101]:
# Train data
X_train = train_dataset.values[:, :11]
Y_train = train_dataset.values[:, 11]

# # # Validation data
X_val = validation_dataset.values[:, :11]
Y_val = validation_dataset.values[:, 11]

# Test data
X_test = test_dataset.values[:, :11]
Y_test = test_dataset.values[:, 11]

In [102]:
rbf = svm.SVC(kernel='rbf', gamma="auto", C=0.8).fit(X_train, Y_train)
poly = svm.SVC(kernel='poly', C=0.8, coef0=2).fit(X_train, Y_train)
linear = svm.SVC(kernel='linear', C=0.8).fit(X_train, Y_train)
random_forest_classifier = RandomForestClassifier(n_estimators=150).fit(X_train, Y_train)

In [69]:
forrest_cv = cross_val_score(random_forest_classifier, X_val, Y_val, cv=5)



In [25]:
print(f"Random forest CV accuracy: {forrest_cv.mean():.2f} % with a standard deviation of: {forrest_cv.std():.2f} ")

Random forest CV accuracy: 0.63 % with a standard deviation of: 0.07 


In [79]:
rbf_cv = cross_val_score(rbf, X_val, Y_val, cv=5)
poly_cv = cross_val_score(poly, X_val, Y_val, cv=5)
linear_cv = cross_val_score(linear, X_val, Y_val, cv=5)



In [26]:
print(f"Rbf CV accuracy: {rbf_cv.mean():.2f} % with a standard deviation of: {rbf_cv.std():.2f} ")

Rbf CV accuracy: 0.48 % with a standard deviation of: 0.04 


In [27]:
print(f"Polynomial CV accuracy: {poly_cv.mean():.2f} % with a standard deviation of: {poly_cv.std():.2f} ")

Polynomial CV accuracy: 0.51 % with a standard deviation of: 0.02 


In [28]:
print(f"Linear CV accuracy: {linear_cv.mean():.2f} % with a standard deviation of: {linear_cv.std():.2f} ")

Linear CV accuracy: 0.54 % with a standard deviation of: 0.08 


In [103]:
poly_pred = poly.predict(X_test)
rbf_pred = rbf.predict(X_test)
linear_pred = linear.predict(X_test)
random_forest_pred = random_forest_classifier.predict(X_test)

In [104]:
random_forest_acc = accuracy_score(Y_test, random_forest_pred)
random_forest_recall = recall_score(Y_test, random_forest_pred, average="weighted")
random_forest_f1 = f1_score(Y_test, random_forest_pred, average="weighted")
random_forest_conf = confusion_matrix(Y_test, random_forest_pred)
print("Random forest:")
print(f'Accuracy: {random_forest_acc*100:.2f} %')
print(f'Recall: {random_forest_recall*100:.2f} %')
print(f'F1: {random_forest_f1*100:.2f} %')
print('Confusion Matrix:')
print(random_forest_conf)

Random forest:
Accuracy: 69.38 %
Recall: 69.38 %
F1: 67.16 %
Confusion Matrix:
[[ 0  0  2  0  0  0]
 [ 0  0  2  4  0  0]
 [ 0  0 57 13  0  0]
 [ 0  0 16 44  2  0]
 [ 0  0  1  6  9  0]
 [ 0  0  0  2  1  1]]


In [92]:
poly_accuracy = accuracy_score(Y_test, poly_pred)
poly_recall = recall_score(Y_test, poly_pred, average="weighted")
poly_f1 = f1_score(Y_test, poly_pred, average='weighted')
poly_conf = confusion_matrix(Y_test, poly_pred)
print("Polynomial Kernel:")
print(f'Accuracy: {poly_accuracy*100:.2f} %')
print(f'Recall: {poly_recall*100:.2f} %')
print(f'F1: {poly_f1*100:.2f} %')
print('Confusion Matrix:')
print(poly_conf)

Polynomial Kernel:
Accuracy: 53.75 %
Recall: 53.75 %
F1: 48.77 %
Confusion Matrix:
[[ 0  0  0  1  0  0]
 [ 0  0  4  3  0  0]
 [ 0  0 45 21  0  0]
 [ 0  0 25 41  0  0]
 [ 0  0  0 18  0  0]
 [ 0  0  0  2  0  0]]


In [93]:
rbf_accuracy = accuracy_score(Y_test, rbf_pred)
rbf_recall = recall_score(Y_test, rbf_pred, average="weighted")
rbf_f1 = f1_score(Y_test, rbf_pred, average='weighted')
rbf_conf = confusion_matrix(Y_test, poly_pred)
print("RBF Kernel:")
print(f'Accuracy:  {rbf_accuracy*100:.2f} %')
print(f'Recall: {rbf_recall*100:.2f} %')
print(f'F1: {rbf_f1*100:.2f} %')
print('Confusion Matrix:')
print(rbf_conf)

RBF Kernel:
Accuracy:  53.12 %
Recall: 53.12 %
F1: 49.50 %
Confusion Matrix:
[[ 0  0  0  1  0  0]
 [ 0  0  4  3  0  0]
 [ 0  0 45 21  0  0]
 [ 0  0 25 41  0  0]
 [ 0  0  0 18  0  0]
 [ 0  0  0  2  0  0]]


In [94]:
linear_acc = accuracy_score(Y_test, linear_pred)
linear_recall = recall_score(Y_test, linear_pred, average="weighted")
linear_f1 = f1_score(Y_test, linear_pred, average='weighted')
linear_conf = confusion_matrix(Y_test, linear_pred)
print("Linear Kernel:")
print(f'Accuracy: {linear_acc*100:.2f} %')
print(f'Recall: {linear_recall*100:.2f} %')
print(f'F1: {linear_f1*100:.2f} %')
print('Confusion Matrix:')
print(linear_conf)

Linear Kernel:
Accuracy: 56.88 %
Recall: 56.88 %
F1: 51.42 %
Confusion Matrix:
[[ 0  0  1  0  0  0]
 [ 0  0  6  1  0  0]
 [ 0  0 49 17  0  0]
 [ 0  0 24 42  0  0]
 [ 0  0  0 18  0  0]
 [ 0  0  0  2  0  0]]
