Let's create a classification model on red wine quality, then apply a training and validation methodology to it. The classification model will be based on Naive Bayes.

In [59]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [60]:
# Path to the red-wine quality CSV, relative to the working directory.
datasource = "datasets/winequality-red.csv"

# Confirm the file is reachable before attempting to load it.
datasource_found = os.path.exists(datasource)
print(datasource_found)

True


In [61]:
# Load the CSV and shuffle the rows. A fixed random_state makes the shuffle
# (and every result derived from the row order) reproducible across kernel
# restarts; reset_index discards the pre-shuffle row labels.
df = (
    pd.read_csv(datasource)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [62]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,671,7.4,0.36,0.29,2.6,0.087,26.0,72.0,0.99645,3.39,0.68,11.0,5
1,277,8.2,1.0,0.09,2.3,0.065,7.0,37.0,0.99685,3.32,0.55,9.0,6
2,752,8.0,0.6,0.22,2.1,0.08,25.0,105.0,0.99613,3.3,0.49,9.9,5
3,570,5.6,0.62,0.03,1.5,0.08,6.0,13.0,0.99498,3.66,0.62,10.1,4
4,908,5.7,0.6,0.0,1.4,0.063,11.0,18.0,0.99191,3.45,0.56,12.2,6


In [63]:
# Drop the leftover "Unnamed: 0" column that came in from the CSV.
# Using drop(..., errors="ignore") instead of `del` keeps this cell idempotent:
# re-running it after the column is already gone no longer raises a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [64]:
# Preview the first five rows again to check the remaining columns.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.36,0.29,2.6,0.087,26.0,72.0,0.99645,3.39,0.68,11.0,5
1,8.2,1.0,0.09,2.3,0.065,7.0,37.0,0.99685,3.32,0.55,9.0,6
2,8.0,0.6,0.22,2.1,0.08,25.0,105.0,0.99613,3.3,0.49,9.9,5
3,5.6,0.62,0.03,1.5,0.08,6.0,13.0,0.99498,3.66,0.62,10.1,4
4,5.7,0.6,0.0,1.4,0.063,11.0,18.0,0.99191,3.45,0.56,12.2,6


In [65]:
# Report the frame's dimensions as (rows, columns).
row_count, column_count = df.shape
print((row_count, column_count))

(1599, 12)


In [66]:
# Feature matrix for the classifier. The columns are selected by name rather
# than by position so the selection cannot silently shift if the column order
# of the frame changes. The target column ("quality") is not included.
feature_columns = [
    "volatile acidity",
    "citric acid",
    "total sulfur dioxide",
    "sulphates",
    "alcohol",
]
X = np.array(df[feature_columns])

In [67]:
# Target vector: the quality grade of every wine, as a NumPy array.
y = np.array(df.loc[:, "quality"])

In [68]:
# Dimensions of the feature matrix: (samples, features).
print(np.shape(X))

(1599, 5)


In [69]:
# Dimensions of the target vector: one label per sample.
print(np.shape(y))

(1599,)


In [70]:
# Count how many wines fall into each quality grade.
quality_values, quality_counts = np.unique(y, return_counts=True)
print("Label Distribution", dict(zip(quality_values, quality_counts)))

Label Distribution {3: 10, 4: 53, 5: 681, 6: 638, 7: 199, 8: 18}


In [71]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [72]:
# Hold out 25% of the samples for validation. The fixed random_state makes the
# split -- and therefore the reported score -- reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit a Gaussian Naive Bayes classifier on the training portion only.
m = GaussianNB()
m.fit(X_train, y_train)

# Mean accuracy on the held-out test portion.
m.score(X_test, y_test)

0.59499999999999997

In [73]:
# Ground-truth labels for rows 20-49 of the full dataset.
# NOTE: these rows are sliced from `y`, not `y_test`, so some of them may
# belong to the training split.
actual = y[slice(20, 50)]
actual

array([5, 5, 5, 5, 6, 7, 6, 6, 5, 6, 6, 7, 5, 5, 6, 5, 7, 5, 5, 5, 5, 6, 5,
       6, 5, 6, 5, 5, 7, 7], dtype=int64)

In [74]:
# Model predictions for rows 20-49 of the full dataset.
# NOTE: these rows are sliced from `X`, not `X_test`, so some of them may
# have been seen by the model during training.
pred = m.predict(X[20:50, :])
pred

array([5, 6, 5, 5, 7, 6, 7, 6, 6, 6, 7, 6, 6, 5, 6, 6, 7, 5, 5, 6, 5, 5, 6,
       6, 5, 6, 6, 5, 7, 5], dtype=int64)

Let's create a confusion matrix

In [75]:
# Predict quality labels for the held-out test set. The classifier's predict()
# already returns integer class labels, so no rounding or casting is needed.
pred = m.predict(X_test)

# remember the labels from before:
# Label Distribution {3: 10, 4: 53, 5: 681, 6: 638, 7: 199, 8: 18}

labels = [3, 4, 5, 6, 7, 8]

# `labels` has to be passed by keyword: it is a keyword-only argument in
# current scikit-learn releases, where the positional form raises a TypeError.
cm = confusion_matrix(y_test, pred, labels=labels)

# Rows are the actual quality grades, columns the predicted ones.
cm = pd.DataFrame(cm, index=labels, columns=labels)
cm

Unnamed: 0,3,4,5,6,7,8
3,0,0,1,0,0,0
4,0,1,10,4,1,0
5,0,1,134,37,3,1
6,0,4,50,85,20,0
7,0,0,1,26,18,0
8,0,0,0,2,1,0


Let's move beyond the confusion matrix. Here are a couple additional measures for the model.

* True Positive = Correct prediction of a class
* True Negative = Correct prediction of not a class (correct rejection)
* False Positive = Incorrectly predicting a class for a sample that does not belong to it (Type 1 error)
* False Negative = Failing to predict a class for a sample that does belong to it (Type 2 error)

Recall or True Positive Rate: $$ Recall = \frac{TP}{P} = \frac{TP}{TP+FN} $$ 
Precision or Positive Predictive Value:$$ Precision = \frac{TP}{TP+FP} $$

F1 is the harmonic mean of precision and recall (recall is also called sensitivity): $$ F_{1} = 2 \cdot \frac{Precision \cdot Recall}{Precision + Recall}$$

* http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
* http://scikit-learn.org/stable/modules/model_evaluation.html

In [76]:
# Micro-averaged F1 pools true/false positives and negatives over all classes.
# With exactly one label per sample this equals plain accuracy, which is why
# it coincides with the value returned by m.score(X_test, y_test).
f1_score(y_true=y_test, y_pred=pred, average="micro")

0.59499999999999997