In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# Load the red-wine quality CSV; the file uses ';' as its field delimiter.
df = pd.read_csv("data/winequality-red.csv", delimiter=";")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
# Count missing values per column to spot dirty data
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [4]:
# Split the dataset from its target: 'quality' is the label, every other column is a feature
y = df['quality']
x = df.drop(['quality'], axis=1)

In [5]:
# Display the feature table (all columns of df except 'quality')
x

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [6]:
# Display the target series (the 'quality' column of df)
y

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64

In [7]:
def decisionTree(dataX, dataY, testSize, randomState=42):
    """Fit a decision tree on a train/test split and report both accuracies.

    Parameters
    ----------
    dataX
        Feature table; split by ``train_test_split`` and passed to ``fit``.
    dataY
        Target labels aligned row-for-row with ``dataX``.
    testSize
        Forwarded to ``train_test_split`` as ``test_size``.
    randomState
        Seed used for both the shuffled split and the classifier, so that
        repeated calls with the same inputs give the same scores. Defaults
        to 42, the seed the split has always used.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as computed by ``accuracy_score``
        on the training rows and on the held-out rows respectively.
    """
    # Hold out part of the data for evaluation (shuffled with a fixed seed)
    x_train, x_test, y_train, y_test = train_test_split(
        dataX, dataY, test_size=testSize, random_state=randomState, shuffle=True
    )

    # Seed the classifier too: scikit-learn documents that tree fitting is
    # only deterministic when random_state is fixed, so an unseeded tree
    # can report different accuracies on every run of the notebook.
    clf = DecisionTreeClassifier(random_state=randomState)
    clf.fit(x_train, y_train)

    # Score on the data the tree was fitted on and on the held-out rows
    train_accuracy = accuracy_score(y_train, clf.predict(x_train))
    test_accuracy = accuracy_score(y_test, clf.predict(x_test))

    # Return the two accuracy scores (not the raw predictions)
    return train_accuracy, test_accuracy

In [8]:
# Evaluate with train : test = 9 : 1 (10% of the rows held out for testing)
train1, test1 = decisionTree(dataX=x, dataY=y, testSize=0.1)
print("[Train : Test = 9 : 1]\n")
print("Accuracy for train data: ", train1)
print("Accuracy for test data: %.4f" % test1)

[Train : Test = 9 : 1]

Accuracy for train data:  1.0
Accuracy for test data: 0.5687


In [9]:
# Evaluate with train : test = 8 : 2 (20% of the rows held out for testing)
train2, test2 = decisionTree(dataX=x, dataY=y, testSize=0.2)
print("[Train : Test = 8 : 2]\n")
print("Accuracy for train data: ", train2)
print("Accuracy for test data: %.4f" % test2)

[Train : Test = 8 : 2]

Accuracy for train data:  1.0
Accuracy for test data: 0.5625


In [10]:
# Evaluate with train : test = 7 : 3 (30% of the rows held out for testing)
train3, test3 = decisionTree(dataX=x, dataY=y, testSize=0.3)
print("[Train : Test = 7 : 3]\n")
print("Accuracy for train data: ", train3)
print("Accuracy for test data: %.4f" % test3)

[Train : Test = 7 : 3]

Accuracy for train data:  1.0
Accuracy for test data: 0.5646
