# Comparing CART and MIO methods to create decision trees

Import packages needed for CART and MIO methods

In [1]:
import tree as miptree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import tree
import time
from os import path
import pandas as pd
import dataset as ds
import math

In [2]:
timelimit = 600
datasets = ['Breast Cancer', 'Car Evaluation', 'Credit Approval', 'Heart Disease', 'Ionosphere', 'Iris', 'Mushroom',
             'Statlog (German Credit Data)', 'Wine', 'Zoo']
alpha = [0, 0.01, 0.1]
depth = [1, 2, 3, 4]
seeds = [1, 2, 3]
train_ratio = 0.5
val_ratio = 0.25
test_ratio = 0.25

In [3]:
# create or load table
ResCART = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 'train_acc', 'val_acc', 'test_acc', 'train_time'])
if path.isfile('./res/OCTWS.csv'):
    ResOCTWS = pd.read_csv('./res/OCTWS.csv')
else:
    ResOCTWS = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 
                                    'train_acc', 'val_acc', 'test_acc', 'train_time', 'gap'])
if path.isfile('./res/OCTNoWS.csv'):
    ResOCTNoWS = pd.read_csv('./res/OCTNoWS.csv')
else:
    ResOCTNoWS = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 
                                    'train_acc', 'val_acc', 'test_acc', 'train_time', 'gap'])
if path.isfile('./res/OCTWSModified.csv'):
    ResOCTWSModified = pd.read_csv('./res/OCTWSModified.csv')
else:
    ResOCTWSModified = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 
                                    'train_acc', 'val_acc', 'test_acc', 'train_time', 'gap'])

# CART

In [17]:
#CART
for data in datasets:
    # load data
    x, y = ds.loadData(data)
    for s in seeds:
        # split data
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
        x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                        test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
        # Set min_samples_leaf to 1/25 of the training set size
        min_samples_leaf = math.floor(len(x_train)/25)
        for d in depth:
            for a in alpha:
                clf = tree.DecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, ccp_alpha=a)
                tick = time.time()
                clf.fit(x_train, y_train)
                tock = time.time()
                train_time = tock - tick
                train_acc = accuracy_score(y_train, clf.predict(x_train))
                val_acc = accuracy_score(y_val, clf.predict(x_val))
                test_acc = accuracy_score(y_test, clf.predict(x_test))
                print(data, 'CART-d{}-a{}-s{}'.format(d,a,s), 'train acc:', train_acc, 'val acc:', val_acc)
                row = {'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc, 
                    'val_acc':val_acc, 'test_acc':test_acc, 'train_time':train_time}
                ResCART = ResCART._append(row, ignore_index=True)
                ResCART.to_csv('./res/CART.csv', index=False)

Breast Cancer CART-d1-a0-s1 train acc: 0.7391304347826086 val acc: 0.6376811594202898
Breast Cancer CART-d1-a0.01-s1 train acc: 0.7391304347826086 val acc: 0.6376811594202898
Breast Cancer CART-d1-a0.1-s1 train acc: 0.6811594202898551 val acc: 0.6666666666666666
Breast Cancer CART-d2-a0-s1 train acc: 0.7753623188405797 val acc: 0.6231884057971014
Breast Cancer CART-d2-a0.01-s1 train acc: 0.7753623188405797 val acc: 0.6231884057971014
Breast Cancer CART-d2-a0.1-s1 train acc: 0.6811594202898551 val acc: 0.6666666666666666
Breast Cancer CART-d3-a0-s1 train acc: 0.8188405797101449 val acc: 0.6666666666666666
Breast Cancer CART-d3-a0.01-s1 train acc: 0.8188405797101449 val acc: 0.6666666666666666
Breast Cancer CART-d3-a0.1-s1 train acc: 0.6811594202898551 val acc: 0.6666666666666666
Breast Cancer CART-d4-a0-s1 train acc: 0.8260869565217391 val acc: 0.6811594202898551
Breast Cancer CART-d4-a0.01-s1 train acc: 0.8188405797101449 val acc: 0.6666666666666666
Breast Cancer CART-d4-a0.1-s1 train 

  ResCART = ResCART._append(row, ignore_index=True)


Car Evaluation CART-d1-a0-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259
Car Evaluation CART-d1-a0.01-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259
Car Evaluation CART-d1-a0.1-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259
Car Evaluation CART-d2-a0-s1 train acc: 0.7627314814814815 val acc: 0.7893518518518519
Car Evaluation CART-d2-a0.01-s1 train acc: 0.7627314814814815 val acc: 0.7893518518518519
Car Evaluation CART-d2-a0.1-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259
Car Evaluation CART-d3-a0-s1 train acc: 0.7974537037037037 val acc: 0.7615740740740741
Car Evaluation CART-d3-a0.01-s1 train acc: 0.7974537037037037 val acc: 0.7615740740740741
Car Evaluation CART-d3-a0.1-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259
Car Evaluation CART-d4-a0-s1 train acc: 0.8495370370370371 val acc: 0.8287037037037037
Car Evaluation CART-d4-a0.01-s1 train acc: 0.8495370370370371 val acc: 0.8287037037037037
Car Evaluation CART-d4-a0

# OCT

In [None]:
for data in datasets:
    # load data
    x, y = ds.loadData(data)
    for s in seeds:
        # data splition
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
        x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                        test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
        # Set min_samples_leaf to 1/25 of the training set size
        min_samples_leaf = math.floor(len(x_train)/25)
        for d in depth:
            for a in alpha:
                # oct with warm start
                row = ResOCTWS[(ResOCTWS['instance'] == data) & (ResOCTWS['depth'] == d) & 
                              (ResOCTWS['alpha'] == a) & (ResOCTWS['seed'] == s)]
                if len(row):
                    print(data, 'OCTWS-d{}-a{}-s{}'.format(row['depth'].values[0],row['alpha'].values[0], row['seed'].values[0]),
                          'train acc:', row['train_acc'].values[0], 'val acc:', row['val_acc'].values[0],
                          'gap:', row['gap'].values[0])
                else:
                    octree = miptree.optimalDecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, alpha=a, warmstart=True,
                                                                   timelimit=timelimit, output=False)
                    tick = time.time()
                    octree.fit(x_train, y_train)
                    tock = time.time()
                    train_time = tock - tick
                    train_acc = accuracy_score(y_train, octree.predict(x_train))
                    val_acc = accuracy_score(y_val, octree.predict(x_val))
                    test_acc = accuracy_score(y_test, octree.predict(x_test))
                    row = {'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc, 'val_acc':val_acc,
                           'test_acc':test_acc, 'train_time':train_time, 'gap':octree.optgap}
                    ResOCTWS = ResOCTWS._append(row, ignore_index=True)
                    ResOCTWS.to_csv('./res/OCTWS.csv', index=False)
                    print(data, 'OCTWS-d{}-a{}-s{}'.format(d,a,s), 
                          'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)
                    #print(row)
                    
                # oct without warm start
                row = ResOCTNoWS[(ResOCTNoWS['instance'] == data) & (ResOCTNoWS['depth'] == d) & 
                              (ResOCTNoWS['alpha'] == a) & (ResOCTNoWS['seed'] == s)]
                if len(row):
                    print(data, 'OctNoWS-d{}-a{}-s{}'.format(row['depth'].values[0],row['alpha'].values[0], row['seed'].values[0]),
                          'train acc:', row['train_acc'].values[0], 'val acc:', row['val_acc'].values[0],
                          'gap:', row['gap'].values[0])
                else:
                    octree = miptree.optimalDecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, alpha=a, warmstart=False,
                                                                   timelimit=timelimit, output=False)
                    tick = time.time()
                    octree.fit(x_train, y_train)
                    tock = time.time()
                    train_time = tock - tick
                    train_acc = accuracy_score(y_train, octree.predict(x_train))
                    val_acc = accuracy_score(y_val, octree.predict(x_val))
                    test_acc = accuracy_score(y_test, octree.predict(x_test))
                    row = {'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc, 'val_acc':val_acc,
                           'test_acc':test_acc, 'train_time':train_time, 'gap':octree.optgap}
                    ResOCTNoWS = ResOCTNoWS._append(row, ignore_index=True)
                    ResOCTNoWS.to_csv('./res/OCTNoWS.csv', index=False)
                    print(data, 'OCTNoWS-d{}-a{}-s{}'.format(d,a,s), 
                          'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)
                    #print(row)

# OCT modifed version

In [4]:
for data in datasets:
    # load data
    x, y = ds.loadData(data)
    for s in seeds:
        # data splition
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
        x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                        test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
        # Set min_samples_leaf to 1/25 of the training set size
        min_samples_leaf = math.floor(len(x_train)/25)
        for d in depth:
            for a in alpha:
                # oct with warm start
                row = ResOCTWSModified[(ResOCTWSModified['instance'] == data) & (ResOCTWSModified['depth'] == d) & 
                              (ResOCTWSModified['alpha'] == a) & (ResOCTWSModified['seed'] == s)]
                if len(row):
                    print(data, 'OCTWSModified-d{}-a{}-s{}'.format(row['depth'].values[0],row['alpha'].values[0], row['seed'].values[0]),
                          'train acc:', row['train_acc'].values[0], 'val acc:', row['val_acc'].values[0],
                          'gap:', row['gap'].values[0])
                else:
                    octree = tree.optimalDecisionTreeClassifierModified(max_depth=d, min_samples_leaf=min_samples_leaf, alpha=a, warmstart=True,
                                                                    timelimit=timelimit, output=False)
                    tick = time.time()
                    octree.fit(x_train, y_train)
                    tock = time.time()
                    train_time = tock - tick
                    train_acc = accuracy_score(y_train, octree.predict(x_train))
                    val_acc = accuracy_score(y_val, octree.predict(x_val))
                    test_acc = accuracy_score(y_test, octree.predict(x_test))
                    row = {'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc, 'val_acc':val_acc,
                           'test_acc':test_acc, 'train_time':train_time, 'gap':octree.optgap}
                    ResOCTWSModified = ResOCTWSModified._append(row, ignore_index=True)
                    ResOCTWSModified.to_csv('./res/OCTWSModified.csv', index=False)
                    print(data, 'OCTWSModified-d{}-a{}-s{}'.format(d,a,s), 
                            'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)
                    #print(row)

Breast Cancer OCTWSModified-d1-a0.0-s1 train acc: 0.7391304347826086 val acc: 0.6376811594202898 gap: 0.0
Breast Cancer OCTWSModified-d1-a0.01-s1 train acc: 0.7391304347826086 val acc: 0.6376811594202898 gap: 0.0
Breast Cancer OCTWSModified-d1-a0.1-s1 train acc: 0.6811594202898551 val acc: 0.6666666666666666 gap: 0.0
Breast Cancer OCTWSModified-d2-a0.0-s1 train acc: 0.782608695652174 val acc: 0.6231884057971014 gap: 0.0
Breast Cancer OCTWSModified-d2-a0.01-s1 train acc: 0.782608695652174 val acc: 0.6231884057971014 gap: 0.0
Breast Cancer OCTWSModified-d2-a0.1-s1 train acc: 0.6811594202898551 val acc: 0.6666666666666666 gap: 0.0
Breast Cancer OCTWSModified-d3-a0.0-s1 train acc: 0.8260869565217391 val acc: 0.6956521739130435 gap: 1.0
Breast Cancer OCTWSModified-d3-a0.01-s1 train acc: 0.8260869565217391 val acc: 0.6956521739130435 gap: 0.8689895470383275
Breast Cancer OCTWSModified-d3-a0.1-s1 train acc: 0.6811594202898551 val acc: 0.6666666666666666 gap: 0.1227272727272725
Breast Cancer O

# Intances where OCT failed to predict out of sample observation points

In [4]:
datasets = ["Ionosphere", "Iris", "Statlog (German Credit Data)","Wine"]
alpha = [0]
depth = [4]
seeds = [1, 2, 3]
for data in datasets:
    # load data
    x, y = ds.loadData(data)
    for s in seeds:
        # data splition
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
        x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                        test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
        # Set min_samples_leaf to 1/25 of the training set size
        min_samples_leaf = math.floor(len(x_train)/25)
        for d in depth:
            for a in alpha:
                octree = miptree.optimalDecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, alpha=a, warmstart=True,
                                                                timelimit=timelimit, output=False)
                tick = time.time()
                octree.fit(x_train, y_train)
                tock = time.time()
                train_time = tock - tick
                train_acc = accuracy_score(y_train, octree.predict(x_train))
                print(data, 'OCTWS-d{}-a{}-s{}'.format(d,a,s), 
                        'train acc:', train_acc, 'train_time:', train_time, 'gap:', octree.optgap)
                #print(row)

Set parameter Username
Academic license - for non-commercial use only - expires 2024-10-02
Ionosphere OCTWS-d4-a0-s1 train acc: 0.9257142857142857 train_time: 601.3799159526825 gap: 1.0
Ionosphere OCTWS-d4-a0-s2 train acc: 0.9542857142857143 train_time: 601.3272738456726 gap: 1.0
Ionosphere OCTWS-d4-a0-s3 train acc: 0.88 train_time: 601.0696859359741 gap: 1.0
Iris OCTWS-d4-a0-s1 train acc: 1.0 train_time: 424.57993483543396 gap: 0.0
Iris OCTWS-d4-a0-s2 train acc: 1.0 train_time: 26.140833139419556 gap: 0.0
Iris OCTWS-d4-a0-s3 train acc: 1.0 train_time: 10.247819900512695 gap: 0.0
Statlog (German Credit Data) OCTWS-d4-a0-s1 train acc: 0.722 train_time: 602.2233679294586 gap: 1.0
Statlog (German Credit Data) OCTWS-d4-a0-s2 train acc: 0.764 train_time: 602.7167911529541 gap: 1.0
Statlog (German Credit Data) OCTWS-d4-a0-s3 train acc: 0.744 train_time: 602.5008709430695 gap: 1.0
Wine OCTWS-d4-a0-s1 train acc: 1.0 train_time: 15.263780117034912 gap: 0.0
Wine OCTWS-d4-a0-s2 train acc: 1.0 tra