In [0]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [0]:
df = pd.read_csv('winequality-white.csv', sep=';')

In [0]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [0]:
print (df.shape)
df.describe()

(4898, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [0]:
df['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [0]:
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

We have to treating the problem as a classification problem, so we are going to use a decision tree to learn a classification model that predicts red wine quality based on the features. Since we have to predict the wine quality the attribute "quality" will become our label and the rest of the attributes will become the features. 

The target variable "quality" ranges from 3 to 9. We can notice that the most observations are in class 6. In class 9 we a few observations.

In [0]:
X = df.drop('quality', axis = 1)

In [0]:
Y = df.quality

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state=42)

In [0]:
from sklearn import preprocessing
X_train_scaled = preprocessing.scale(X_train)
print (X_train_scaled)

[[ 0.51511931 -1.07623315  0.22773076 ..., -0.32826101 -0.70244474
   1.54037099]
 [-0.66918809 -0.28877673  0.89583195 ..., -0.06188569  0.26607415
  -0.82171197]
 [-1.49820327  0.40024764 -0.02280718 ...,  0.40427112  0.00193263
   0.48150622]
 ..., 
 [ 0.87041153 -0.09191263  1.56393313 ..., -1.12738698  0.35412132
   0.72585963]
 [-0.66918809 -0.38720878 -0.35685777 ..., -0.12847952 -0.96658625
   0.07425053]
 [ 1.46256523 -0.09191263  0.14421812 ...,  0.20448963  1.23459303
  -0.08865174]]


In [0]:
clf = DecisionTreeClassifier(criterion = 'entropy', splitter = 'best',  max_depth = 15)
clf.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

We just stored label "quality" in Y, which is the common used to represent the labels in machine learning and in X the features. Next we split our dataset into test and train data.Training data is the data on which the machine learning programs learn to perform correlational tasks. Testing data is the data, whose outcome is already known (even the outcome of training data is known) and is used to determine the accuracy of the machine learning algorithm, based on the training data (how effectively the learning happened).  We will be using train data to train our model for predicting the quality. Also we take the 20% of the original population and use it for testing.

In [0]:
y_pred = clf.predict(X_test)

In [0]:
from sklearn.metrics import accuracy_score
accuracy_score (Y_test, y_pred)

0.59387755102040818

In [0]:
import graphviz
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph

In [0]:
prominent_features = clf.feature_importances_

In [0]:
for importance,feature in zip(prominent_features, X):
    print ('{}: {}'.format(feature, importance))

fixed acidity: 0.06557556264325042
volatile acidity: 0.10753496183975543
citric acid: 0.08086644487142618
residual sugar: 0.07928371615920106
chlorides: 0.07094039605859549
free sulfur dioxide: 0.12290418742076284
total sulfur dioxide: 0.09210864427977959
density: 0.06278396529013466
pH: 0.08893617999132633
sulphates: 0.07066157986474697
alcohol: 0.15840436158102103


We can observe that alcohol content and free sulfur dioxide play the two largest roles in the decision of classifier.