# Comparing CART and MIO methods to create decision trees

Import packages needed for CART and MIO methods

In [13]:
import os
import time
from os import path

import numpy as np
import pandas as pd
import pydot
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
from ucimlrepo import fetch_ucirepo

import tree as miptree

Define a function that binarizes the target: every `num` value greater than 0 is set to 1, all other values are left unchanged.

In [14]:
def binary_features(array):
    """Return a copy of ``array`` whose ``num`` column is binarized.

    Every entry of the ``num`` column that is strictly greater than 0 is
    replaced by 1; all other entries (and all other columns) are left as they
    are. The input object is not modified.

    Parameters
    ----------
    array : pandas.DataFrame
        Frame that contains a ``num`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``array`` with the ``num`` column binarized.
    """
    output = array.copy()
    # One label-based assignment instead of a per-row chained assignment
    # (`output.num[i] = 1`): chained assignment may write to a temporary
    # object and leave `output` unchanged, and the row loop fails on
    # duplicated index labels.
    output.loc[output['num'] > 0, 'num'] = 1
    return output

Set arguments:

In [15]:
# Limit handed to every MIO tree fit as `timelimit` (presumably seconds -
# confirm against the miptree module), plus a general-purpose seed.
timelimit, seed = 600, 42

# Hyper-parameter grid explored by the experiment cells.
min_samples_split = 2
alpha, depth, seeds = [0, 0.01, 0.1], [2, 3], [37, 42]

# Fractions of the data used for training, validation and testing.
train_ratio, val_ratio, test_ratio = 0.5, 0.25, 0.25


Import and manipulate data:

In [16]:
# Import data
heart_disease = fetch_ucirepo(id=45)
x = heart_disease.data.features
y = heart_disease.data.targets
# Normalize x data to be between 0 and 1 for each feature
x = (x - x.min()) / (x.max() - x.min())
# Remove rows with NaN values and keep exactly the same rows of y.
# `x.index` holds index labels, so the selection must be label-based (.loc);
# .iloc would interpret those labels as positions.
x = x.dropna(axis=0)
y = y.loc[x.index]
# Create random samples of data
x1 = x.sample(n=75, random_state=seeds[0])
x2 = x.sample(n=75, random_state=seeds[1])
# Take the labels of exactly the sampled rows. Looking them up by index keeps
# features and labels paired by construction instead of relying on two
# independent .sample() calls drawing the same rows.
y1 = y.loc[x1.index]
y2 = y.loc[x2.index]
# Create list of datasets
datasets = ['Heart failure']
# Define features
features = x.columns
# Convert features to numpy arrays
x1 = x1.to_numpy()
x2 = x2.to_numpy()
x = x.to_numpy()
# Binarize the targets and flatten them to 1-D numpy arrays
y1 = binary_features(y1).values.flatten()
y2 = binary_features(y2).values.flatten()
y = binary_features(y).values.flatten()
# Sanity check: every feature matrix has one label per row.
assert len(x1) == len(y1) and len(x2) == len(y2) and len(x) == len(y)



In [20]:
# Create or load the result tables.
# The experiment cells write their CSV files into ./res; DataFrame.to_csv does
# not create missing parent directories, so make sure the folder exists.
os.makedirs('./res', exist_ok=True)
# CART results always start from an empty table.
res_sk = pd.DataFrame(columns=['instance', 'depth', 'seed', 'train_acc', 'val_acc', 'test_acc', 'train_time'])
# OCT results are reloaded when a saved table exists, so that the OCT cells can
# reuse rows that were already computed.
if path.isfile('./res/oct.csv'):
    res_oct = pd.read_csv('./res/oct.csv')
else:
    res_oct = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 
                                    'train_acc', 'val_acc', 'test_acc', 'train_time', 'gap'])

# Experiments on the 75-row sample (x1, y1)

In [6]:
#CART on the x1 / y1 sample
cart_rows = []
for data in datasets:
    # Tag the rows as coming from the x1 sample so they cannot be confused with
    # results computed on other data that are stored in the same table.
    instance = '{} (x1 sample)'.format(data)
    for d in depth:
        for s in seeds:
            # Split into train / validation / test according to the configured ratios.
            x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=1-train_ratio, random_state=s)
            x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                            test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
            for a in alpha:
                # Use the configured min_samples_split and fix random_state so
                # that repeated runs build the same tree.
                clf = tree.DecisionTreeClassifier(max_depth=d, min_samples_split=min_samples_split,
                                                  ccp_alpha=a, random_state=s)
                tick = time.perf_counter()
                clf.fit(x_train, y_train)
                train_time = time.perf_counter() - tick
                train_acc = accuracy_score(y_train, clf.predict(x_train))
                val_acc = accuracy_score(y_val, clf.predict(x_val))
                test_acc = accuracy_score(y_test, clf.predict(x_test))
                print(instance, 'cart-d{}-a{}-s{}'.format(d,a,s), 'train acc:', train_acc, 'val acc:', val_acc)
                # Record alpha as well; without it the rows for different
                # pruning strengths cannot be told apart.
                cart_rows.append({'instance':instance, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc,
                                  'val_acc':val_acc, 'test_acc':test_acc, 'train_time':train_time})
# Add all new rows in one step (public API, no per-row frame copies) and write
# the table once.
if cart_rows:
    new_rows = pd.DataFrame(cart_rows)
    res_sk = new_rows if res_sk.empty else pd.concat([res_sk, new_rows], ignore_index=True)
    res_sk.to_csv('./res/sk.csv', index=False)

Heart failure cart-d2-a0-s37 train acc: 0.972972972972973 val acc: 0.7368421052631579
Heart failure cart-d2-a0.01-s37 train acc: 0.972972972972973 val acc: 0.7368421052631579
Heart failure cart-d2-a0.1-s37 train acc: 0.8918918918918919 val acc: 0.6842105263157895
Heart failure cart-d2-a0-s42 train acc: 0.8648648648648649 val acc: 0.631578947368421
Heart failure cart-d2-a0.01-s42 train acc: 0.8648648648648649 val acc: 0.631578947368421
Heart failure cart-d2-a0.1-s42 train acc: 0.8108108108108109 val acc: 0.631578947368421
Heart failure cart-d3-a0-s37 train acc: 0.972972972972973 val acc: 0.7368421052631579
Heart failure cart-d3-a0.01-s37 train acc: 0.972972972972973 val acc: 0.7368421052631579
Heart failure cart-d3-a0.1-s37 train acc: 0.8918918918918919 val acc: 0.6842105263157895
Heart failure cart-d3-a0-s42 train acc: 0.918918918918919 val acc: 0.631578947368421
Heart failure cart-d3-a0.01-s42 train acc: 0.918918918918919 val acc: 0.631578947368421
Heart failure cart-d3-a0.1-s42 train

  res_sk = res_sk._append(row, ignore_index=True)


Create trees on the same sample using the MIO method (miptree):

In [7]:
for data in datasets:
    # Results are cached in res_oct under (instance, depth, alpha, seed). Use a
    # sample-specific instance label so that rows stored for other data under
    # the plain dataset name are not reused here by mistake.
    instance = '{} (x1 sample)'.format(data)
    for d in depth:
        for s in seeds:
            # Split into train / validation / test according to the configured ratios.
            x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=1-train_ratio, random_state=s)
            x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                            test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
            for a in alpha:
                # Look for an already stored result for this configuration.
                cached = res_oct[(res_oct['instance'] == instance) & (res_oct['depth'] == d) & 
                                 (res_oct['alpha'] == a) & (res_oct['seed'] == s)]
                if len(cached):
                    print(instance, 'oct-d{}-a{}-s{}'.format(cached['depth'].values[0],cached['alpha'].values[0],
                                                             cached['seed'].values[0]),
                          'train acc:', cached['train_acc'].values[0], 'val acc:', cached['val_acc'].values[0],
                          'gap:', cached['gap'].values[0])
                else:
                    octree = miptree.optimalDecisionTreeClassifier(max_depth=d, min_samples_split=min_samples_split,
                                                                   alpha=a, warmstart=True,
                                                                   timelimit=timelimit, output=False)
                    tick = time.perf_counter()
                    octree.fit(x_train, y_train)
                    train_time = time.perf_counter() - tick
                    train_acc = accuracy_score(y_train, octree.predict(x_train))
                    val_acc = accuracy_score(y_val, octree.predict(x_val))
                    test_acc = accuracy_score(y_test, octree.predict(x_test))
                    new_row = pd.DataFrame([{'instance':instance, 'depth':d, 'alpha':a, 'seed':s,
                                             'train_acc':train_acc, 'val_acc':val_acc, 'test_acc':test_acc,
                                             'train_time':train_time, 'gap':octree.optgap}])
                    # Append and save after every fit so that finished results
                    # survive an interrupted run.
                    res_oct = new_row if res_oct.empty else pd.concat([res_oct, new_row], ignore_index=True)
                    res_oct.to_csv('./res/oct.csv', index=False)
                    print(instance, 'oct-d{}-a{}-s{}'.format(d,a,s), 
                          'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)

Set parameter Username
Academic license - for non-commercial use only - expires 2024-10-02


  res_oct = res_oct._append(row, ignore_index=True)


Heart failure oct-d2-a0-s37 train acc: 0.972972972972973 val acc: 0.7894736842105263 train_time: 9.555217981338501 gap: 0.0
Heart failure oct-d2-a0.01-s37 train acc: 0.972972972972973 val acc: 0.7368421052631579 train_time: 3.157528877258301 gap: 0.0
Heart failure oct-d2-a0.1-s37 train acc: 0.8918918918918919 val acc: 0.6842105263157895 train_time: 2.6345627307891846 gap: 0.0
Heart failure oct-d2-a0-s42 train acc: 0.918918918918919 val acc: 0.7894736842105263 train_time: 6.175588846206665 gap: 0.0
Heart failure oct-d2-a0.01-s42 train acc: 0.918918918918919 val acc: 0.7894736842105263 train_time: 5.092710971832275 gap: 0.0
Heart failure oct-d2-a0.1-s42 train acc: 0.8108108108108109 val acc: 0.6842105263157895 train_time: 7.619994163513184 gap: 0.0
Heart failure oct-d3-a0-s37 train acc: 1.0 val acc: 0.7894736842105263 train_time: 5.988437175750732 gap: 0.0
Heart failure oct-d3-a0.01-s37 train acc: 1.0 val acc: 0.7368421052631579 train_time: 19.434585094451904 gap: 0.0
Heart failure oct-d

# Full dataset

In [21]:
#CART on the full dataset
cart_rows = []
for data in datasets:
    for d in depth:
        for s in seeds:
            # Split into train / validation / test according to the configured ratios.
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
            x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                            test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
            for a in alpha:
                # Use the configured min_samples_split and fix random_state so
                # that repeated runs build the same tree.
                clf = tree.DecisionTreeClassifier(max_depth=d, min_samples_split=min_samples_split,
                                                  ccp_alpha=a, random_state=s)
                tick = time.perf_counter()
                clf.fit(x_train, y_train)
                train_time = time.perf_counter() - tick
                train_acc = accuracy_score(y_train, clf.predict(x_train))
                val_acc = accuracy_score(y_val, clf.predict(x_val))
                test_acc = accuracy_score(y_test, clf.predict(x_test))
                # Include the seed in the tag; otherwise the lines printed for
                # the different seeds are indistinguishable.
                print(data, 'cart-d{}-a{}-s{}'.format(d,a,s), 'train acc:', train_acc, 'val acc:', val_acc)
                # Record alpha as well; without it the rows for different
                # pruning strengths cannot be told apart.
                cart_rows.append({'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc,
                                  'val_acc':val_acc, 'test_acc':test_acc, 'train_time':train_time})
# Add all new rows in one step (public API, no per-row frame copies) and write
# the table once.
if cart_rows:
    new_rows = pd.DataFrame(cart_rows)
    res_sk = new_rows if res_sk.empty else pd.concat([res_sk, new_rows], ignore_index=True)
    res_sk.to_csv('./res/sk.csv', index=False)

Heart failure cart-d2-a0 train acc: 0.831081081081081 val acc: 0.7837837837837838
Heart failure cart-d2-a0.01 train acc: 0.831081081081081 val acc: 0.7837837837837838
Heart failure cart-d2-a0.1 train acc: 0.7972972972972973 val acc: 0.7432432432432432
Heart failure cart-d2-a0 train acc: 0.8243243243243243 val acc: 0.6621621621621622
Heart failure cart-d2-a0.01 train acc: 0.8243243243243243 val acc: 0.6621621621621622
Heart failure cart-d2-a0.1 train acc: 0.7972972972972973 val acc: 0.6621621621621622
Heart failure cart-d3-a0 train acc: 0.8851351351351351 val acc: 0.8108108108108109
Heart failure cart-d3-a0.01 train acc: 0.8851351351351351 val acc: 0.8108108108108109
Heart failure cart-d3-a0.1 train acc: 0.7972972972972973 val acc: 0.7432432432432432
Heart failure cart-d3-a0 train acc: 0.8716216216216216 val acc: 0.6891891891891891
Heart failure cart-d3-a0.01 train acc: 0.8716216216216216 val acc: 0.7027027027027027
Heart failure cart-d3-a0.1 train acc: 0.7972972972972973 val acc: 0.662

  res_sk = res_sk._append(row, ignore_index=True)


In [22]:
# OCT on the full dataset
for data in datasets:
    for d in depth:
        for s in seeds:
            # Split into train / validation / test according to the configured ratios.
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
            x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                            test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
            for a in alpha:
                # Results are cached in res_oct under (instance, depth, alpha, seed);
                # any stored row with the same key is reused instead of refitting.
                cached = res_oct[(res_oct['instance'] == data) & (res_oct['depth'] == d) & 
                                 (res_oct['alpha'] == a) & (res_oct['seed'] == s)]
                if len(cached):
                    print(data, 'oct-d{}-a{}-s{}'.format(cached['depth'].values[0],cached['alpha'].values[0],
                                                         cached['seed'].values[0]),
                          'train acc:', cached['train_acc'].values[0], 'val acc:', cached['val_acc'].values[0],
                          'gap:', cached['gap'].values[0])
                else:
                    octree = miptree.optimalDecisionTreeClassifier(max_depth=d, min_samples_split=min_samples_split,
                                                                   alpha=a, warmstart=True,
                                                                   timelimit=timelimit, output=False)
                    tick = time.perf_counter()
                    octree.fit(x_train, y_train)
                    train_time = time.perf_counter() - tick
                    train_acc = accuracy_score(y_train, octree.predict(x_train))
                    val_acc = accuracy_score(y_val, octree.predict(x_val))
                    test_acc = accuracy_score(y_test, octree.predict(x_test))
                    new_row = pd.DataFrame([{'instance':data, 'depth':d, 'alpha':a, 'seed':s,
                                             'train_acc':train_acc, 'val_acc':val_acc, 'test_acc':test_acc,
                                             'train_time':train_time, 'gap':octree.optgap}])
                    # Append and save after every fit so that finished results
                    # survive an interrupted run.
                    res_oct = new_row if res_oct.empty else pd.concat([res_oct, new_row], ignore_index=True)
                    res_oct.to_csv('./res/oct.csv', index=False)
                    print(data, 'oct-d{}-a{}-s{}'.format(d,a,s), 
                          'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)

  res_oct = res_oct._append(row, ignore_index=True)


Heart failure oct-d2-a0-s37 train acc: 0.8513513513513513 val acc: 0.7702702702702703 train_time: 455.47877192497253 gap: 0.0
Heart failure oct-d2-a0.01-s37 train acc: 0.8513513513513513 val acc: 0.7702702702702703 train_time: 585.7548499107361 gap: 0.0
Heart failure oct-d2-a0.1-s37 train acc: 0.7972972972972973 val acc: 0.7432432432432432 train_time: 214.47342205047607 gap: 0.0
Heart failure oct-d2-a0-s42 train acc: 0.8513513513513513 val acc: 0.7027027027027027 train_time: 213.9971420764923 gap: 0.0
Heart failure oct-d2-a0.01-s42 train acc: 0.8513513513513513 val acc: 0.6621621621621622 train_time: 436.1121709346771 gap: 0.0
Heart failure oct-d2-a0.1-s42 train acc: 0.7972972972972973 val acc: 0.6621621621621622 train_time: 394.01096510887146 gap: 0.0
Heart failure oct-d3-a0-s37 train acc: 0.9054054054054054 val acc: 0.8243243243243243 train_time: 601.0050699710846 gap: 0.9999999999999999
Heart failure oct-d3-a0.01-s37 train acc: 0.8851351351351351 val acc: 0.8108108108108109 train_ti