# Comparing CART and MIO methods to create decision trees

Import packages needed for CART and MIO methods

In [24]:
import tree as miptree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import tree
import time
from os import path
import pandas as pd
import dataset as ds
import math
import oct_b
import oct_b_scale
from ucimlrepo import list_available_datasets

In [25]:
timelimit = 600
datasets = ['Breast Cancer', 'Car Evaluation', 'Credit Approval', 'Heart Disease', 'Mushroom', 'Zoo']
# datasets = ['Breast Cancer', 'Car Evaluation', 'Credit Approval', 'Heart Disease', 'Mushroom', 'Zoo']
alpha = [0]#, 0.01, 0.1]
# depth = [1, 2, 3, 4]
depth = [4]
seeds = [1, 2, 3]
train_ratio = 0.5
val_ratio = 0.25
test_ratio = 0.25

In [26]:
# create or load table
ResCART = pd.DataFrame(columns=['instance', 'depth', 'seed', 'train_acc', 'val_acc', 'test_acc', 'train_time'])
if path.isfile('./res/OCTWS.csv'):
    ResOCTWS = pd.read_csv('./res/OCTWS.csv')
else:
    ResOCTWS = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 
                                    'train_acc', 'val_acc', 'test_acc', 'train_time', 'gap'])
if path.isfile('./res/OCTNoWS.csv'):
    ResOCTNoWS = pd.read_csv('./res/OCTNoWS.csv')
else:
    ResOCTNoWS = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 
                                    'train_acc', 'val_acc', 'test_acc', 'train_time', 'gap'])
if path.isfile('./res/OCT_b.csv'):
    ResOCT_b = pd.read_csv('./res/OCT_b.csv')
else:
    ResOCT_b = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 
                                    'train_acc', 'val_acc', 'test_acc', 'train_time', 'gap'])
if path.isfile('./res/OCT_b_scale.csv'):
    ResOCT_b_scale = pd.read_csv('./res/OCT_b_scale.csv')
else:
    ResOCT_b_scale = pd.DataFrame(columns=['instance', 'depth', 'alpha', 'seed', 
                                    'train_acc', 'val_acc', 'test_acc', 'train_time', 'gap'])

# CART

In [23]:
#CART
for data in datasets:
    # load data
    x, y = ds.loadData(data)
    for s in seeds:
        # split data
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
        x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                        test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
        # Set min_samples_leaf to 1/25 of the training set size
        min_samples_leaf = math.floor(len(x_train)/25)
        for d in depth:
            for a in alpha:
                clf = tree.DecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, ccp_alpha=a)
                tick = time.time()
                clf.fit(x_train, y_train)
                tock = time.time()
                train_time = tock - tick
                train_acc = accuracy_score(y_train, clf.predict(x_train))
                val_acc = accuracy_score(y_val, clf.predict(x_val))
                test_acc = accuracy_score(y_test, clf.predict(x_test))
                print(data, 'CART-d{}-a{}-s{}'.format(d,a,s), 'train acc:', train_acc, 'val acc:', val_acc)
                row = {'instance':data, 'depth':d, 'seed':s, 'train_acc':train_acc, 
                    'val_acc':val_acc, 'test_acc':test_acc, 'train_time':train_time}
                ResCART = ResCART._append(row, ignore_index=True)
                ResCART.to_csv('./res/CART.csv', index=False)

Breast Cancer CART-d4-a0-s1 train acc: 0.8260869565217391 val acc: 0.6811594202898551
Breast Cancer CART-d4-a0-s2 train acc: 0.8043478260869565 val acc: 0.7681159420289855
Breast Cancer CART-d4-a0-s3 train acc: 0.8333333333333334 val acc: 0.8115942028985508


  ResCART = ResCART._append(row, ignore_index=True)


KeyboardInterrupt: 

# OCT

In [27]:
for data in datasets:
    # load data
    x, y = ds.loadData(data)
    for s in seeds:
        # data splition
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
        x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                        test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
        # Set min_samples_leaf to 1/25 of the training set size
        min_samples_leaf = math.floor(len(x_train)/25)
        for d in depth:
            for a in alpha:
                # oct with warm start
                row = ResOCTWS[(ResOCTWS['instance'] == data) & (ResOCTWS['depth'] == d) & 
                              (ResOCTWS['alpha'] == a) & (ResOCTWS['seed'] == s)]
                if len(row):
                    print(data, 'OCTWS-d{}-a{}-s{}'.format(row['depth'].values[0],row['alpha'].values[0], row['seed'].values[0]),
                          'train acc:', row['train_acc'].values[0], 'val acc:', row['val_acc'].values[0],
                          'gap:', row['gap'].values[0])
                else:
                    octree = miptree.optimalDecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, alpha=a, warmstart=True,
                                                                   timelimit=timelimit, output=False)
                    tick = time.time()
                    octree.fit(x_train, y_train)
                    tock = time.time()
                    train_time = tock - tick
                    train_acc = accuracy_score(y_train, octree.predict(x_train))
                    val_acc = accuracy_score(y_val, octree.predict(x_val))
                    test_acc = accuracy_score(y_test, octree.predict(x_test))
                    row = {'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc, 'val_acc':val_acc,
                           'test_acc':test_acc, 'train_time':train_time, 'gap':octree.optgap}
                    ResOCTWS = ResOCTWS._append(row, ignore_index=True)
                    ResOCTWS.to_csv('./res/OCTWS.csv', index=False)
                    print(data, 'OCTWS-d{}-a{}-s{}'.format(d,a,s), 
                          'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)
                    #print(row)
                    
                # oct without warm start
                row = ResOCTNoWS[(ResOCTNoWS['instance'] == data) & (ResOCTNoWS['depth'] == d) & 
                              (ResOCTNoWS['alpha'] == a) & (ResOCTNoWS['seed'] == s)]
                if len(row):
                    print(data, 'OctNoWS-d{}-a{}-s{}'.format(row['depth'].values[0],row['alpha'].values[0], row['seed'].values[0]),
                          'train acc:', row['train_acc'].values[0], 'val acc:', row['val_acc'].values[0],
                          'gap:', row['gap'].values[0])
                else:
                    octree = miptree.optimalDecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, alpha=a, warmstart=False,
                                                                   timelimit=timelimit, output=False)
                    tick = time.time()
                    octree.fit(x_train, y_train)
                    tock = time.time()
                    train_time = tock - tick
                    train_acc = accuracy_score(y_train, octree.predict(x_train))
                    val_acc = accuracy_score(y_val, octree.predict(x_val))
                    test_acc = accuracy_score(y_test, octree.predict(x_test))
                    row = {'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc, 'val_acc':val_acc,
                           'test_acc':test_acc, 'train_time':train_time, 'gap':octree.optgap}
                    ResOCTNoWS = ResOCTNoWS._append(row, ignore_index=True)
                    ResOCTNoWS.to_csv('./res/OCTNoWS.csv', index=False)
                    print(data, 'OCTNoWS-d{}-a{}-s{}'.format(d,a,s), 
                          'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)
                    #print(row)

Breast Cancer OCTWS-d4-a0.0-s1 train acc: 0.8115942028985508 val acc: 0.6376811594202898 gap: 1.0
Breast Cancer OctNoWS-d4-a0.0-s1 train acc: 0.8115942028985508 val acc: 0.6376811594202898 gap: 1.0
Breast Cancer OCTWS-d4-a0.0-s2 train acc: 0.8333333333333334 val acc: 0.6521739130434783 gap: 1.0
Breast Cancer OctNoWS-d4-a0.0-s2 train acc: 0.8043478260869565 val acc: 0.6956521739130435 gap: 1.0
Breast Cancer OCTWS-d4-a0.0-s3 train acc: 0.8478260869565217 val acc: 0.7971014492753623 gap: 1.0
Breast Cancer OctNoWS-d4-a0.0-s3 train acc: 0.8623188405797102 val acc: 0.7971014492753623 gap: 1.0
Car Evaluation OCTWS-d4-a0.0-s1 train acc: 0.8090277777777778 val acc: 0.8148148148148148 gap: 0.993939393939394
Car Evaluation OctNoWS-d4-a0.0-s1 train acc: 0.8090277777777778 val acc: 0.8148148148148148 gap: 0.993939393939394
Car Evaluation OCTWS-d4-a0.0-s2 train acc: 0.8067129629629629 val acc: 0.8263888888888888 gap: 1.0
Car Evaluation OctNoWS-d4-a0.0-s2 train acc: 0.8067129629629629 val acc: 0.8263

In [15]:
print(data, d, a, s)

Wine 4 0 1


In [5]:
datasets = ['Car Evaluation', 'Credit Approval', 'Adult', 'Breast Cancer','Heart Disease',
             'Ionosphere','Iris', 'Mushroom', 'Wine', 'Zoo']
for data in datasets:
    # load data
    x, y = ds.loadData(data)
    for s in seeds:
        # data splition
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
        x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                        test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
        # Set min_samples_leaf to 1/25 of the training set size
        min_samples_leaf = math.floor(len(x_train)/25)
        for d in depth:
            for a in alpha:
                # oct with warm start including b
                row = ResOCT_b[(ResOCT_b['instance'] == data) & (ResOCT_b['depth'] == d) & 
                              (ResOCT_b['alpha'] == a) & (ResOCT_b['seed'] == s)]
                if len(row):
                    print(data, 'OCT_b-d{}-a{}-s{}'.format(row['depth'].values[0],row['alpha'].values[0], row['seed'].values[0]),
                          'train acc:', row['train_acc'].values[0], 'val acc:', row['val_acc'].values[0],
                          'gap:', row['gap'].values[0])
                else:
                    octree = oct_b.optimalDecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, alpha=a, warmstart=True,
                                                                   timelimit=timelimit, output=False)
                    tick = time.time()
                    octree.fit(x_train, y_train)
                    tock = time.time()
                    train_time = tock - tick
                    train_acc = accuracy_score(y_train, octree.predict(x_train))
                    val_acc = accuracy_score(y_val, octree.predict(x_val))
                    test_acc = accuracy_score(y_test, octree.predict(x_test))
                    row = {'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc, 'val_acc':val_acc,
                           'test_acc':test_acc, 'train_time':train_time, 'gap':octree.optgap}
                    ResOCT_b = ResOCT_b._append(row, ignore_index=True)
                    ResOCT_b.to_csv('./res/OCT_b.csv', index=False)
                    print(data, 'OCT_b-d{}-a{}-s{}'.format(d,a,s), 
                          'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)
                    #print(row)
                    
                # # oct without warm start
                # row = ResOCT_b_scale[(ResOCT_b_scale['instance'] == data) & (ResOCT_b_scale['depth'] == d) & 
                #               (ResOCT_b_scale['alpha'] == a) & (ResOCT_b_scale['seed'] == s)]
                # if len(row):
                #     print(data, 'OCT_b_scale-d{}-a{}-s{}'.format(row['depth'].values[0],row['alpha'].values[0], row['seed'].values[0]),
                #           'train acc:', row['train_acc'].values[0], 'val acc:', row['val_acc'].values[0],
                #           'gap:', row['gap'].values[0])
                # else:
                #     octree = oct_b_scale.optimalDecisionTreeClassifier(max_depth=d, min_samples_leaf=min_samples_leaf, alpha=a, warmstart=False,
                #                                                    timelimit=timelimit, output=False)
                #     tick = time.time()
                #     octree.fit(x_train, y_train)
                #     tock = time.time()
                #     train_time = tock - tick
                #     train_acc = accuracy_score(y_train, octree.predict(x_train))
                #     val_acc = accuracy_score(y_val, octree.predict(x_val))
                #     test_acc = accuracy_score(y_test, octree.predict(x_test))
                #     row = {'instance':data, 'depth':d, 'alpha':a, 'seed':s, 'train_acc':train_acc, 'val_acc':val_acc,
                #            'test_acc':test_acc, 'train_time':train_time, 'gap':octree.optgap}
                #     ResOCT_b_scale = ResOCT_b_scale._append(row, ignore_index=True)
                #     ResOCT_b_scale.to_csv('./res/OCT_b_scale.csv', index=False)
                #     print(data, 'OCT_b_scale-d{}-a{}-s{}'.format(d,a,s), 
                #           'train acc:', train_acc, 'val acc:', val_acc, 'train_time:', train_time, 'gap:', octree.optgap)
                #     #print(row)

Car Evaluation OCT_b-d1-a0.0-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259 gap: 0.0
Car Evaluation OCT_b-d1-a0.01-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259 gap: 0.0
Car Evaluation OCT_b-d1-a0.1-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259 gap: 0.0
Car Evaluation OCT_b-d2-a0.0-s1 train acc: 0.7638888888888888 val acc: 0.7476851851851852 gap: 0.9362745098039216
Car Evaluation OCT_b-d2-a0.01-s1 train acc: 0.7638888888888888 val acc: 0.7476851851851852 gap: 0.8747655000598932
Car Evaluation OCT_b-d2-a0.1-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259 gap: 0.0
Car Evaluation OCT_b-d3-a0.0-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259 gap: 0.996124031007752
Car Evaluation OCT_b-d3-a0.01-s1 train acc: 0.8009259259259259 val acc: 0.8009259259259259 gap: 0.9260108953020004
Car Evaluation OCT_b-d3-a0.1-s1 train acc: 0.7013888888888888 val acc: 0.6759259259259259 gap: 0.510142118863049
Car Evaluation OCT_b-d1-a0.0-s2 t

KeyError: 6