# Wine Classifier

Let's guess the quality of a wine from it's characteristics using a set of several classifiers. We'll compare them and find the best one.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
import time

from numpy.random import permutation
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
TRAIN_FILE = 'input/winequality-white.csv'

df = pd.read_csv(TRAIN_FILE, sep=';', header=0) 
df.head(n=5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
#print(df.columns.values)

X = df.drop('quality', 1).values
y = df['quality']

In [4]:
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators = 18),
    "Neural Net": MLPClassifier(alpha = 1),
    "Naive Bayes": GaussianNB()
}

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_test.shape)
print(X.shape[0])
print(X_test.shape[0])

(980, 11)
4898
980


In [12]:
no_classifiers = len(dict_classifiers.keys())

def batch_classify(X_train, y_train, X_test, y_test, verbose = True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,4)), columns = ['classifier', 'train_score', 'test_score', 'training_time'])
    count = 0
    for key, classifier in dict_classifiers.items():
        t_start = time.clock()
        classifier.fit(X_train, y_train)
        t_end = time.clock()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, y_train)
        test_score = classifier.score(X_test, y_test)
        df_results.loc[count,'classifier'] = key
        df_results.loc[count,'train_score'] = train_score
        df_results.loc[count,'test_score'] = test_score
        df_results.loc[count,'training_time'] = t_diff
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count+=1
    return df_results

In [13]:
df_results = batch_classify(X_train, y_train, X_test, y_test)
display(df_results.sort_values(by='test_score', ascending=False))

trained Logistic Regression in 0.15 s
trained Nearest Neighbors in 0.00 s
trained Linear SVM in 1.32 s
trained Gradient Boosting Classifier in 2.07 s
trained Decision Tree in 0.04 s
trained Random Forest in 0.15 s
trained Neural Net in 0.44 s
trained Naive Bayes in 0.00 s


Unnamed: 0,classifier,train_score,test_score,training_time
5,Random Forest,0.997448,0.65,0.154232
4,Decision Tree,1.0,0.578571,0.036526
3,Gradient Boosting Classifier,0.736345,0.560204,2.066237
2,Linear SVM,0.826442,0.52551,1.321954
0,Logistic Regression,0.543134,0.483673,0.145947
1,Nearest Neighbors,0.658754,0.454082,0.004732
7,Naive Bayes,0.460694,0.442857,0.003092
6,Neural Net,0.476008,0.430612,0.43814
