In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from sklearn.preprocessing import RobustScaler, PolynomialFeatures, StandardScaler
from time import time
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline

In [2]:
# Red-wine file of the UCI wine-quality data set; fields are separated by ';'.
# (The URL must not carry trailing whitespace inside the quotes.)
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
)

# Feature matrix: every column except the target 'quality'.
# It is extracted before 'binary_quality' is added to df below, so the derived
# label column never ends up among the features.
features = df.drop(columns='quality').values
def isGood(quality, threshold=6):
    """Binarise a quality score.

    Parameters
    ----------
    quality : number
        The score to classify.
    threshold : number, optional
        Smallest score that counts as "good". Defaults to 6, the cut-off that
        was previously hard-coded, so ``isGood(q)`` behaves as before.

    Returns
    -------
    int
        1 if ``quality >= threshold``, otherwise 0.
    """
    return 1 if quality >= threshold else 0
# Derive the binary target by passing each 'quality' score through isGood,
# keep it as a new column on df, and expose it as a plain array.
df['binary_quality'] = df['quality'].map(isGood)
label = df['binary_quality'].to_numpy()

In [3]:
# Hold out 20% of the rows as a test set. The split is stratified on the label
# and seeded so that it can be re-created exactly.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    stratify=label,
    random_state=42,
)

In [7]:
def rescale():
    """Robust-scale the global train/test feature matrices.

    A ``RobustScaler`` is fitted on the global ``X_train`` only and then used
    to transform both ``X_train`` and ``X_test``; both globals are rebound to
    the transformed arrays. Nothing is returned.

    Note: every call fits on whatever ``X_train`` currently holds, so calling
    it again without re-creating the split rescales already-scaled data.
    """
    global X_train, X_test
    # Fit on the training split only, then apply the same transform to both.
    fitted = RobustScaler().fit(X_train)
    X_train, X_test = fitted.transform(X_train), fitted.transform(X_test)

In [None]:
# 3-nearest-neighbours classifier trained on the current X_train / y_train.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_model = knn  # fit() returns the estimator itself

In [None]:
# Test-set accuracy of the current knn_model.
knn_acc_raw = knn_model.score(X_test, y_test)
print('k-NN accuracy for test set before rescaling: %f' % knn_acc_raw)

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
# X_train / X_test are overwritten, so re-running this cell without
# re-creating the split scales the data a second time.
scaler = RobustScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Refit the 3-NN classifier on the current X_train and report its test accuracy.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_model = knn  # fit() returns the estimator itself
knn_acc_scaled = knn_model.score(X_test, y_test)
print('k-NN accuracy for test set after rescaling: %f' % knn_acc_scaled)

In [None]:
# Per-class precision / recall / F1 of the k-NN model on the test split.
y_true = y_test
y_pred = knn_model.predict(X_test)
knn_report = classification_report(y_true, y_pred)
print(knn_report)

In [None]:
# Support-vector classifier with scikit-learn's default settings.
svm_model = SVC().fit(X_train, y_train)
svm_model  # trailing expression so the notebook still displays the fitted estimator

In [None]:
# Test-set accuracy of the current svm_model.
svm_accuracy = svm_model.score(X_test, y_test)
print("SVM accuracy: " + str(svm_accuracy))

In [None]:
# Without Rescaling
# Re-create the split from the raw feature matrix with the same seed and
# stratification, which replaces X_train / X_test with unscaled data.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    stratify=label,
    random_state=42,
)
svm_model = SVC().fit(X_train, y_train)
svm_acc_unscaled = svm_model.score(X_test, y_test)
print("SVM accuracy without rescaling: " + str(svm_acc_unscaled))

In [None]:
# Degree-2 polynomial feature expansion -> standardisation -> linear SVM.
# Steps are passed as a list of (name, estimator) pairs, which is the type the
# Pipeline API documents; LinearSVC is seeded like the other models here so the
# score is repeatable.
polynomial_svm_model = Pipeline([
    ("poly_features", PolynomialFeatures(degree=2)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(random_state=42)),
])
polynomial_svm_model.fit(X_train, y_train)
print("2-dimensional Nonlinear SVM accuracy: "+str(polynomial_svm_model.score(X_test, y_test)))

In [None]:
# Degree-5 polynomial feature expansion -> standardisation -> linear SVM.
# Steps are passed as a list of (name, estimator) pairs, which is the type the
# Pipeline API documents; LinearSVC is seeded so the score is repeatable.
polynomial_svm_model = Pipeline([
    ("poly_features", PolynomialFeatures(degree=5)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(random_state=42)),
])
polynomial_svm_model.fit(X_train, y_train)
# The printed label now names the degree actually used (5); it previously said 3.
print("5-dimensional Nonlinear SVM accuracy: "+str(polynomial_svm_model.score(X_test, y_test)))

In [None]:
# Standardise, then fit an SVM with a degree-3 polynomial kernel.
# Steps are passed as a list of (name, estimator) pairs, which is the type the
# Pipeline API documents.
polynomial_svm_model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", degree=3)),
])
polynomial_svm_model.fit(X_train, y_train)
print("3-dimensional SVM with polynomial kernel accuracy: "+str(polynomial_svm_model.score(X_test, y_test)))

In [None]:
# Standardise, then fit an SVM with a degree-5 polynomial kernel.
# Steps are passed as a list of (name, estimator) pairs, which is the type the
# Pipeline API documents.
polynomial_svm_model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", degree=5)),
])
polynomial_svm_model.fit(X_train, y_train)
print("5-dimensional SVM with polynomial kernel accuracy: "+str(polynomial_svm_model.score(X_test, y_test)))

In [None]:
# Plot the relationship between the polynomial-kernel degree of the SVM and
# its accuracy on the test split (degrees 1 through 9).
dimension = []
accuracy = []
for x in range(1, 10):
    dimension.append(x)
    # Steps are a list of (name, estimator) pairs, as the Pipeline API documents.
    polynomial_svm_model = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=x)),
    ])
    polynomial_svm_model.fit(X_train, y_train)
    accu = polynomial_svm_model.score(X_test, y_test)
    accuracy.append(accu)
# Axis labels only need to be set once, so they are applied after the loop.
plt.xlabel("Dimension", fontsize=14)
plt.ylabel("Accuracy", fontsize=14)
plt.plot(dimension, accuracy)
plt.show()

In [None]:
# Plot the relationship between tree depths and prediction power
# (max_depth 1 through 199, scored on the test split).
depth = []
accuracy = []
for x in range(1, 200):
    depth.append(x)
    # Seeded like the other models in this notebook: an unseeded tree can
    # differ between runs, which would make the curve non-reproducible.
    tree_model = DecisionTreeClassifier(max_depth=x, random_state=42)
    tree_model.fit(X_train, y_train)
    accu = tree_model.score(X_test, y_test)
    accuracy.append(accu)
# Axis labels only need to be set once, so they are applied after the loop.
plt.xlabel("Depth", fontsize=14)
plt.ylabel("Accuracy", fontsize=14)
plt.plot(depth, accuracy)
plt.show()

In [None]:
# Naive Bayes: no rescaling to avoid negative values.
# This matters for the MultinomialNB alternative, which rejects negative
# inputs; MultinomialNB() or BernoulliNB() can be swapped in for GaussianNB()
# below. NOTE(review): this assumes X_train / X_test hold unscaled data at
# this point (true when the cells run top to bottom) -- confirm before
# swapping models.
nb_model = GaussianNB().fit(X_train, y_train)
nb_accuracy = nb_model.score(X_test, y_test)
print('Naive Bayes accuracy for test set: %f' % nb_accuracy)

In [8]:
# Scale the features, then train a multi-layer perceptron with two hidden
# layers of 10 units using SGD.
# rescale() rebinds the global X_train / X_test, so running this cell again
# without re-creating the split scales the data a second time.
rescale()
mlp_model = MLPClassifier(
    hidden_layer_sizes=(10, 10),
    solver='sgd',
    learning_rate_init=0.1,
    max_iter=500,
    random_state=42,
)
mlp_model.fit(X_train, y_train)
mlp_accuracy = mlp_model.score(X_test, y_test)
print('Neural Network accuracy for test set: %f' % mlp_accuracy)

Neural Network accuracy for test set: 0.762500
