Let's create a classification model on red wine quality. Then let's apply training and validation methodology to it. The classification model will be based on Naives Bayes. 

In [36]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [37]:
datasource = "datasets/winequality-red.csv"
print(os.path.exists(datasource))

True


In [38]:
df = pd.read_csv(datasource).sample(frac = 1).reset_index(drop = True)

In [39]:
df.head()

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1194,7.8,0.41,0.68,1.7,0.467,18.0,69.0,0.9973,3.08,1.31,9.3,5
1,523,9.8,0.44,0.47,2.5,0.063,9.0,28.0,0.9981,3.24,0.65,10.8,6
2,602,6.6,0.7,0.08,2.6,0.106,14.0,27.0,0.99665,3.44,0.58,10.2,5
3,1360,6.8,0.47,0.08,2.2,0.064,18.0,38.0,0.99553,3.3,0.65,9.6,6
4,862,9.6,0.38,0.42,1.9,0.071,5.0,13.0,0.99659,3.15,0.75,10.5,6


In [40]:
del df["Unnamed: 0"]

In [41]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.8,0.41,0.68,1.7,0.467,18.0,69.0,0.9973,3.08,1.31,9.3,5
1,9.8,0.44,0.47,2.5,0.063,9.0,28.0,0.9981,3.24,0.65,10.8,6
2,6.6,0.7,0.08,2.6,0.106,14.0,27.0,0.99665,3.44,0.58,10.2,5
3,6.8,0.47,0.08,2.2,0.064,18.0,38.0,0.99553,3.3,0.65,9.6,6
4,9.6,0.38,0.42,1.9,0.071,5.0,13.0,0.99659,3.15,0.75,10.5,6


In [42]:
print(df.shape)

(1599, 12)


In [43]:
X = np.array(df.iloc[:, :-1])[:, [1, 2, 6, 9, 10]]
# remove the quality column
# only grab the following columns:
# volatile acidity, citric acid, total sulfur dioxide
# sulphates, alcohol

In [44]:
y = np.array(df["quality"])

In [45]:
print(X.shape)

(1599, 5)


In [46]:
print(y.shape)

(1599,)


In [47]:
print("Label Distribution", {i: np.sum(y == i) for i in np.unique(df["quality"])})

Label Distribution {3: 10, 4: 53, 5: 681, 6: 638, 7: 199, 8: 18}


In [48]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
m = GaussianNB()
m.fit(X_train, y_train)
m.score(X_test, y_test)

0.57499999999999996

In [50]:
actual = y[20:50]
actual

array([6, 6, 7, 5, 4, 5, 6, 6, 7, 7, 7, 5, 5, 7, 6, 5, 5, 5, 5, 4, 6, 5, 5,
       6, 5, 5, 7, 5, 5, 6], dtype=int64)

In [51]:
pred = m.predict(X[20:50])
pred

array([5, 6, 7, 5, 5, 5, 6, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5, 5, 5, 5, 6, 5, 5,
       5, 6, 5, 6, 6, 5, 6], dtype=int64)

Let's create a confusion matrix

In [52]:
pred = np.round(m.predict(X_test)).astype("i4")
cm = confusion_matrix(y_test, pred)
cm

array([[  0,   1,   2,   0,   0,   0],
       [  1,   3,   6,   2,   0,   0],
       [  1,   5, 127,  42,   2,   0],
       [  0,   4,  46,  78,  30,   0],
       [  0,   0,   4,  19,  22,   0],
       [  0,   0,   0,   2,   3,   0]])