In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [10]:
df = pd.read_excel("red_wine_quality.xlsx", header = 0)

df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [11]:
data = np.array(df, dtype=np.float32)

y_data = data[:, [-1]]

scaler = MinMaxScaler()
data1 = scaler.fit_transform(df.values)

x_data = data1[:, 0:-1]  

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print (x_data.shape)
print (x_train.shape)
print (x_test.shape)

(1599, 11)
(1279, 11)
(320, 11)


In [12]:
x_data

array([[0.24778761, 0.39726027, 0.        , ..., 0.60629921, 0.13772455,
        0.15384615],
       [0.28318584, 0.52054795, 0.        , ..., 0.36220472, 0.20958084,
        0.21538462],
       [0.28318584, 0.43835616, 0.04      , ..., 0.40944882, 0.19161677,
        0.21538462],
       ...,
       [0.15044248, 0.26712329, 0.13      , ..., 0.53543307, 0.25149701,
        0.4       ],
       [0.11504425, 0.35958904, 0.12      , ..., 0.65354331, 0.22754491,
        0.27692308],
       [0.12389381, 0.13013699, 0.47      , ..., 0.51181102, 0.19760479,
        0.4       ]])

In [13]:
# model comparison

models = {
    'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=5, random_state=1234),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100), 
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVM': SVC(kernel='rbf', C=10, gamma = 1)
}

for key, value in models.items(): 
    model = value
    model.fit(x_train, y_train.ravel())
    y_pred = model.predict(x_test)
    
    print('%s: %.3f%%' % (key, metrics.accuracy_score(y_pred, y_test)*100))

DecisionTreeClassifier: 58.438%
RandomForestClassifier: 71.875%
KNeighborsClassifier: 58.750%
SVM: 59.688%


In [14]:
# k-fold

models = {
    'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=5, random_state=1234),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100), 
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVM': SVC(kernel='rbf', C=10, gamma = 1)
}

cv = KFold(n_splits=5, shuffle = True, random_state=2019)

for key, value in models.items(): 
    model = value
    scores = cross_val_score(model, x_data, y_data.ravel(), cv = cv, scoring="accuracy")*100
    
    print('%s: %.3f%%' % (key, np.mean(scores)))
    print (np.round(scores, 3))

DecisionTreeClassifier: 58.163%
[57.5   57.5   60.938 54.688 60.188]
RandomForestClassifier: 68.355%
[67.812 67.5   72.812 65.625 68.025]
KNeighborsClassifier: 58.413%
[56.563 56.563 60.938 57.812 60.188]
SVM: 60.726%
[59.375 61.25  62.187 60.    60.815]
