# Evaluation of the model

First, we import our file with the custom functions along with other libraries useful for interacting with the data and testing:

In [1]:
import dec_tree
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import time

Now we can load the dataset, separate the features from the label, and split it into training (80%), validation (10%), and test (10%) sets:

In [2]:
# Load the dataset; the last column holds the label, every column before it is a feature
wine_data = pd.read_csv('wine_dataset.csv')
n = wine_data.shape[1] - 1
X, y = wine_data.iloc[:, :n], wine_data.iloc[:, n]

# Hold out 20% of the rows, then cut that hold-out in half: one half for
# validation (model selection), the other half for the final test
X_train, X_valtest, y_train, y_valtest = train_test_split(
    X, y, test_size=0.2, random_state=20
)
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest, y_valtest, test_size=0.5, random_state=20
)
wine_data

Unnamed: 0,citric acid,residual sugar,pH,sulphates,alcohol,type
0,0.13,1.60,3.34,0.59,9.2,1
1,0.10,2.80,3.60,0.66,10.2,1
2,0.32,1.90,3.20,0.55,9.5,1
3,0.29,13.65,3.00,0.60,9.5,0
4,0.26,2.00,3.41,0.74,9.2,1
...,...,...,...,...,...,...
3193,0.30,1.50,3.36,0.56,12.0,0
3194,0.23,6.20,2.89,0.34,10.1,0
3195,0.44,1.60,3.38,0.86,9.9,1
3196,0.36,4.50,3.40,0.57,10.4,0


Now, a quick statistical overview of the data:

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column,
# including the label column, whose mean reveals the class balance
wine_data.describe()

Unnamed: 0,citric acid,residual sugar,pH,sulphates,alcohol,type
count,3198.0,3198.0,3198.0,3198.0,3198.0,3198.0
mean,0.301776,4.449781,3.249678,0.574431,10.459725,0.5
std,0.165284,4.214445,0.163439,0.165587,1.143231,0.500078
min,0.0,0.6,2.74,0.22,8.0,0.0
25%,0.21,1.9,3.14,0.47,9.5,0.0
50%,0.3,2.4,3.24,0.55,10.2,0.5
75%,0.4,5.9375,3.36,0.65,11.2,1.0
max,1.66,65.8,4.01,2.0,14.9,1.0


The mean of the label (`type`) is 0.5, so the two classes are evenly distributed; accuracy is therefore a suitable performance measure. Now we can evaluate both our custom decision tree and Scikit-learn's decision tree classifier on this data. Let's first train our model with each combination of hyperparameters and pick the one with the best validation accuracy:

In [4]:
best_acc = 0
best_params = {}
best_model = None
# Forwarded unchanged to dec_tree.learn for every configuration
# NOTE(review): presumably the fraction of training data held out for pruning — confirm in dec_tree
prune_size = 0.2

# Iterate over all hyperparameters (impurity measure x pruning on/off)
for imp in ['entropy', 'gini']:
    for prune in [True, False]:
        # perf_counter() is a monotonic, high-resolution clock meant for measuring
        # intervals; time.time() follows the wall clock and can jump if it is adjusted
        start_time = time.perf_counter()
        # Build tree with training data
        current_tree = dec_tree.learn(X_train, y_train, imp, prune, prune_size)
        # Test performance on validation data (features and label stacked into one array)
        current_acc = dec_tree.accuracy(np.column_stack((X_val, y_val)), current_tree)
        # The timed span covers both learning the tree and scoring it on validation data
        end_time = time.perf_counter()
        current_params = {'Impurity': imp, 'Pruning': prune}
        print(f'Accuracy of tree with the following parameters: {current_params}: {current_acc} with runtime: {round(end_time-start_time, 6)} seconds')

        # Save the model with best accuracy (and its parameters);
        # the strict '>' keeps the earliest configuration on ties
        if current_acc > best_acc:
            best_acc = current_acc
            best_params = current_params
            best_model = current_tree

print(f'Best parameters: {best_params} with accuracy: {best_acc}')

Accuracy of tree with the following parameters: {'Impurity': 'entropy', 'Pruning': True}: 0.88125 with runtime: 1.458103 seconds
Accuracy of tree with the following parameters: {'Impurity': 'entropy', 'Pruning': False}: 0.878125 with runtime: 1.05296 seconds
Accuracy of tree with the following parameters: {'Impurity': 'gini', 'Pruning': True}: 0.84375 with runtime: 1.337442 seconds
Accuracy of tree with the following parameters: {'Impurity': 'gini', 'Pruning': False}: 0.903125 with runtime: 1.089802 seconds
Best parameters: {'Impurity': 'gini', 'Pruning': False} with accuracy: 0.903125


Let's evaluate the best model on the held-out test data:

In [5]:
# Score the tree selected on validation data against the untouched test split;
# features and label are stacked into a single array before scoring
test_rows = np.column_stack((X_test, y_test))
best_test_acc = dec_tree.accuracy(test_rows, best_model)
print(f'Accuracy of the best tree ({best_params["Impurity"]}, pruning = {best_params["Pruning"]}) on test data: {best_test_acc}')

Accuracy of the best tree (gini, pruning = False) on test data: 0.90625


Let's compare this with Scikit-learn's implementation:

In [6]:
best_imp = 'entropy'
best_score = 0
# NOTE: this rebinds best_model, which previously held the best custom tree;
# re-running the custom-tree test cell after this one would pick up the sklearn model
best_model = None
# Iterate over impurity hyperparameters
for imp in ['entropy', 'gini']:
    # Build tree (fixed random_state so repeated runs give the same tree)
    sk_tree = DecisionTreeClassifier(criterion=imp, random_state=20)

    # Time fitting and prediction on validation data.
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and is too coarse on some
    # platforms for runtimes in the millisecond range
    start_time_sklearn = time.perf_counter()
    sk_tree.fit(X_train, y_train)
    sk_pred = sk_tree.predict(X_val)
    end_time_sklearn = time.perf_counter()

    # Update values for best model; the strict '>' keeps the earliest criterion on ties
    sk_acc = accuracy_score(y_val, sk_pred)
    if sk_acc > best_score:
        best_score = sk_acc
        best_imp = imp
        best_model = sk_tree
    print(f'Accuracy of sklearn\'s model using {imp}: {sk_acc} with runtime: {round(end_time_sklearn - start_time_sklearn, 6)} seconds')

Accuracy of sklearn's model using entropy: 0.871875 with runtime: 0.016211 seconds
Accuracy of sklearn's model using gini: 0.890625 with runtime: 0.012122 seconds


In [7]:
# Evaluate the sklearn tree that scored best on validation data against the test split
sk_test = best_model.predict(X_test)
sk_test_acc = accuracy_score(y_true=y_test, y_pred=sk_test)
print(f'Accuracy of sklearn\'s best model on test data using {best_imp}: {sk_test_acc}')

Accuracy of sklearn's best model on test data using gini: 0.9
