# Optimal Classification Tree


# Library Settings

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
import random
from OCT_tree import RegularOptimalTreeClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree 

import graphviz

#%load_ext autoreload
#%autoreload 2


# Data Processing


In [2]:
# Load every benchmark dataset and split each one into a feature matrix and a
# label vector (last column = label). The result is `xy_list`, a list of
# [X, y] pairs in the same order as `data_list`.

# Root folder holding the dataset files. A raw string keeps the Windows
# backslashes literal instead of relying on Python leaving unknown escape
# sequences (\P, \I, \d, ...) untouched, which is deprecated behaviour.
file_path = r"C:\PythonProject\InvalidTreeTest\dataset"

# --- Iris: drop the row-id column, it carries no information ---
iris_data_path = os.path.join(file_path, "iris.csv")
iris_data = pd.read_csv(iris_data_path).drop(columns=["Id"])

# --- Abalone: recode the sex column ---
# NOTE(review): the column is addressed as 'M', i.e. read_csv took the first
# data row as the header, so that row is presumably lost — confirm whether
# header=None with explicit names is wanted.
# NOTE(review): the codes are the *strings* '0'/'1'/'2' (kept as in the
# original); the car dataset below uses integers — confirm which is intended.
abalone_data_path = os.path.join(file_path, "abalone.data")
abalone_data = pd.read_csv(abalone_data_path)
abalone_data['M'] = abalone_data['M'].replace({'M': '0', 'F': '1', 'I': '2'})

# --- Wine quality (semicolon-separated files) ---
white_wine_data_path = os.path.join(file_path, "winequality-white1.txt")
white_wine_data = pd.read_csv(white_wine_data_path, sep=";")

red_wine_data_path = os.path.join(file_path, "winequality-red1.txt")
red_wine_data = pd.read_csv(red_wine_data_path, sep=";")

# --- Raisin ---
rasin_data_path = os.path.join(file_path, "rasin.csv")
rasin_data = pd.read_csv(rasin_data_path)

# --- Dry bean ---
drybean_data_path = os.path.join(file_path, "drybean.csv")
drybean_data = pd.read_csv(drybean_data_path)

# --- Breast cancer Wisconsin: drop the id column and rows with '?' ---
# NOTE(review): column names such as '1000025' and '5' look like the first
# data row used as a header — confirm, as for abalone above.
cancer_data_path = os.path.join(file_path, "breast-cancer-wisconsin.data")
breastcancerwisconsin = pd.read_csv(cancer_data_path).drop(columns=['1000025'])
cancer_columns = ['5', '1', '1.1', '1.2', '2', '1.3', '3', '1.4', '1.5', '2.1']
has_missing = pd.Series(False, index=breastcancerwisconsin.index)
for column in cancer_columns:
    # '?' marks a missing value; flag the row if any listed column has it.
    has_missing = has_missing | (breastcancerwisconsin[column] == '?')
index = breastcancerwisconsin[has_missing].index
breastcancerwisconsin = breastcancerwisconsin.drop(index)
# NOTE(review): a column that contained '?' is presumably still string-typed
# after the rows are removed — confirm whether a numeric cast is needed.

# --- Car evaluation: ordinal-encode every categorical column ---
# Each column maps its listed levels to 0, 1, 2, ... in the order given.
car_data_path = os.path.join(file_path, "car.data")
car = pd.read_csv(car_data_path)
car_encodings = {
    'vhigh': ['high', 'low', 'med', 'vhigh'],
    'vhigh.1': ['high', 'low', 'med', 'vhigh'],
    '2': ['2', '3', '4', '5more'],
    '2.1': ['2', '4', 'more'],
    'small': ['small', 'med', 'big'],
    'low': ['low', 'med', 'high'],
    'unacc': ['unacc', 'acc', 'good', 'vgood'],
}
for column, levels in car_encodings.items():
    # Assign the result back instead of `car[column].replace(..., inplace=True)`:
    # the chained in-place form does not update `car` under pandas copy-on-write.
    car[column] = car[column].replace(levels, list(range(len(levels))))

# Split each frame into features (all but the last column) and labels (last column).
data_list = [iris_data, abalone_data, white_wine_data, red_wine_data,
             rasin_data, drybean_data, breastcancerwisconsin, car]
xy_list = []
for data in data_list:
    features = data.iloc[:, :-1].to_numpy()
    labels = data.iloc[:, -1].to_numpy()
    xy_list.append([features, labels])


### M-OCT Test

M-OCT is a modified version of the original OCT formulation (Bertsimas & Dunn, 2017). The M-OCT formulation adds new leaf-branch-interaction constraints to ensure that the learned optimal tree has a valid structure.

In [3]:
# Draw a random sample (with replacement) from one dataset and fit an OCT on it.

# Seed the generator so that Restart & Run All draws the same sample and the
# solver output below can be reproduced.
random.seed(0)

n = 100        # number of rows to draw
data_set = 0   # position in xy_list; 0 is the iris dataset

all_features, all_labels = xy_list[data_set]
# One independent uniform row index per draw, so rows may repeat.
row_picks = [random.randint(0, len(all_features) - 1) for _ in range(n)]

X = [all_features[row] for row in row_picks]
y = [all_labels[row] for row in row_picks]
samples = [X, y]

# OCT parameters
max_depth = 3
min_samples_leaf = 1
alpha = 0.01
time_limit = 5  # minutes (the solver log reports TimeLimit 300)
mip_gap_tol = 0.01  # MIP gap tolerance passed to the solver (logged as MIPGap 0.01)
mip_focus = 'optimal'
mip_polish_time = None
warm_start = False
log_file = None

# Reference only: the lower-level solver entry point, left commented out.
#model, run_time, solution_condition = solve_oct_MILP(X_transformed, y_transformed, L_hat, epsilons,
#                   alpha=0.01, max_depth=2, min_samples_leaf=1, small_number = 0.0000001,
#                   solver="gurobi", small_epsilon=False,epsilon_version=False,verbose=False, log_file=None)

# Construct OCT classifier
solver_options = {
    'mip_cuts': 'auto',
    'mip_gap_tol': mip_gap_tol,
    'mip_focus': mip_focus,
    'mip_polish_time': mip_polish_time,
}
oct_model = RegularOptimalTreeClassifier(
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    alpha=alpha,
    criterion="gini",
    solver="gurobi",
    time_limit=time_limit,
    small_epsilon=False,
    epsilon_version=True,
    verbose=True,
    warm_start=warm_start,
    log_file=log_file,
    solver_options=solver_options,
)

# Solve the MILP; the returned object is inspected in the following cells.
fitted_model = oct_model.fit(X, y)

Set parameter Username
Academic license - for non-commercial use only - expires 2023-09-30
Read LP format model from file C:\Users\huten\AppData\Local\Temp\tmp6dqh24cf.pyomo.lp
Reading time = 0.04 seconds
x1: 3416 rows, 914 columns, 19521 nonzeros
Set parameter TimeLimit to value 300
Set parameter MIPGap to value 0.01
Set parameter MIPFocus to value 2
Gurobi Optimizer version 9.5.2 build v9.5.2rc0 (win64)
Thread count: 4 physical cores, 8 logical processors, using up to 8 threads
Optimize a model with 3416 rows, 914 columns and 19521 nonzeros
Model fingerprint: 0x0a899968
Variable types: 41 continuous, 873 integer (873 binary)
Coefficient statistics:
  Matrix range     [1e-06, 1e+02]
  Objective range  [1e-02, 2e-02]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+02]
Presolve removed 48 rows and 1 columns
Presolve time: 0.07s
Presolved: 3368 rows, 913 columns, 16969 nonzeros
Variable types: 0 continuous, 913 integer (873 binary)
Found heuristic solution: objective 0.82

In [4]:
# Inspect the `leaf_solution` attribute of the fitted model.
# NOTE(review): the recorded output is a dict keyed by node ids 8-15 whose
# values sum to 100 (= n); presumably per-leaf sample counts — confirm in OCT_tree.
fitted_model.leaf_solution


{8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 39.0, 13: 33.0, 14: 28.0, 15: 0.0}

In [5]:
# Inspect the `b_value` attribute of the fitted model.
# NOTE(review): the recorded output is a list of [node id, value] pairs for
# nodes 1-7; presumably the split thresholds of the branch nodes — confirm in OCT_tree.
fitted_model.b_value

[[1, -0.0], [2, -0.0], [3, -0.0], [4, -0.0], [5, -0.0], [6, -0.0], [7, -0.0]]

In [6]:
# Inspect the `d_value` attribute of the fitted model.
# NOTE(review): the recorded output is a list of [node id, value] pairs for
# nodes 1-7; presumably a 0/1 flag for whether each branch node splits — confirm in OCT_tree.
fitted_model.d_value

[[1, 1], [2, -0.0], [3, -0.0], [4, -0.0], [5, -0.0], [6, -0.0], [7, -0.0]]

In [7]:
# Inspect the `state` attribute of the fitted model.
# NOTE(review): the meaning of the value (1 in the recorded output) is not
# visible in this notebook — confirm in OCT_tree.
fitted_model.state

1