In [96]:
import numpy as np # to analyze the structure of the decision tree
import pandas as pd # for data manipulation
import matplotlib.pyplot as plt # to plot the decision tree and create an image
import pickle  # to save and load the decision tree estimator
from sklearn.tree import DecisionTreeClassifier, plot_tree # create the decision tree classifier and explore its structure
from sklearn.model_selection import train_test_split # to manage the training and testing data
from sklearn import metrics # metrics module and is used to measure the accuracy of the training process
import os # for the file path management of the dataset


In [97]:
# @title Importing data <br>
# The original instructions for this cell read: set repository to "github"
# (default) to read the data from GitHub, or to "google" to read it from
# Google {display-mode: "form"}.
# NOTE(review): no `repository` setting is defined in this cell; the data path
# below is a fixed path relative to the working directory.
# `ip` is the CSV path handed to pd.read_csv when the dataset is loaded.
ip = './files/autopilot_data.csv'


In [None]:
# Column layout applied to the CSV: four feature columns, then the class label.
col_names = ["f1", "f2", "f3", "f4", "label"]
# load dataset -- header=None tells pandas that the first row is data, so the
# names above are supplied explicitly.
pima = pd.read_csv(
    ip,
    names=col_names,
    header=None,
)
# Last expression of the cell: shows the first rows as the cell output.
pima.head()

In [None]:
# Separate the model inputs (X) from the target column (y).
feature_col = ["f1", "f2", "f3", "f4"]
X = pima[feature_col]  # Features
y = pima["label"]      # Target
# Plain-text dump of both objects, features first, one after the other.
print(X, y, sep="\n")

In [100]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


In [None]:
# Create decision tree classifier object (default hyperparameters).
# random_state is pinned so that repeated runs build the same tree:
# scikit-learn randomly permutes the features at every split, so ties between
# equally good splits would otherwise be resolved differently from run to run.
estimator = DecisionTreeClassifier(random_state=1)
# Show the configured (not yet fitted) estimator.
print(estimator)

In [158]:
# Train the classifier on the training split. fit() hands back the estimator
# it was called on, so the rebinding keeps the same name pointing at the
# now-fitted model.
estimator = estimator.fit(X_train,y_train)
# Prediction on the test dataset is disabled in this cell:
# print("prediction")
# y_pred = estimator.predict(X_test)
# print(y_pred)

In [None]:
# Model accuracy
# Compute the test-set predictions here: `y_pred` is not assigned by any
# earlier cell, so referencing it directly raised NameError on a fresh kernel.
y_pred = estimator.predict(X_test)
# Fraction of test rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [160]:
# save model
# The context manager closes the file handle as soon as the dump finishes;
# a bare open() passed straight to pickle.dump is never explicitly closed.
with open("./files/dt.sav", "wb") as model_file:
    pickle.dump(estimator, model_file)

In [None]:
# `figure` is imported by name here, but the call below goes through `plt`.
from matplotlib.pyplot import figure
# Open a new 10x10 figure at 400 dpi with a red edge colour.
# NOTE(review): under the Jupyter inline backend a figure is rendered and
# closed when its cell finishes, so drawing or saving done in later cells may
# not land on this figure -- confirm, and consider keeping figure creation,
# plotting and saving in a single cell.
plt.figure(dpi=400, edgecolor="r", figsize=(10, 10))

In [162]:
# Feature names shown in the plotted tree nodes, one per feature column and in
# the same order (f1..f4). A missing comma used to fuse "f2" and "f3" into the
# single name "f2f3", and the target column "label" was listed even though it
# is not a feature.
F = ["f1", "f2", "f3", "f4"]
# Class names passed to plot_tree.
# NOTE(review): plot_tree expects these in ascending order of the class
# values -- confirm that "Right" corresponds to the lower class value.
C = ["Right", "Left"]



In [None]:
# Draw the fitted tree with matplotlib, labelling nodes with the names in F
# and classes with the names in C. Left as a bare expression so the cell
# output stays whatever plot_tree returns.
plot_tree(
    estimator,
    max_depth=2,       # limit how many levels of the tree are drawn
    feature_names=F,
    class_names=C,
    filled=True,       # colour the node boxes
    rounded=True,      # rounded box corners
    proportion=True,   # show proportions rather than raw counts
    precision=2,       # digits shown for floating-point values
    fontsize=3,
)

In [None]:
# Write the current pyplot figure to a PNG file.
# NOTE(review): if the tree figure was already rendered and closed by an
# earlier cell (Jupyter inline backend), the "current figure" here is a new
# empty one and the saved image may be blank -- confirm.
plt.savefig('./files/dt.png')

In [165]:
# Display any figures that are still open.
plt.show()

In [None]:
# The fitted estimator exposes a `tree_` object that stores the structure of
# the decision tree as parallel arrays (one entry per node).
# Bare expression: the object's repr becomes the cell output.
estimator.tree_

In [None]:
# Total number of nodes in the fitted tree (test nodes and leaves together).
n_nodes = estimator.tree_.node_count
# Show the count as the cell output.
n_nodes

In [None]:
# children_left[i] is the id of the left child of node i.
children_left = estimator.tree_.children_left
# Show the array as the cell output.
children_left

In [None]:
# children_right[i] is the id of the right child of node i.
children_right = estimator.tree_.children_right
# Show the array as the cell output.
children_right

In [None]:
# feature[i] is the index of the feature used to split node i into its left
# and right child nodes.
feature = estimator.tree_.feature
# Show the array as the cell output.
feature

In [None]:
# threshold[i] is the value the split feature is compared against at node i.
threshold = estimator.tree_.threshold
# Show the array as the cell output.
threshold

In [172]:
# Walk the tree from the root with an explicit stack, recording for every node
# its depth (root = 0) and whether it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, 0)]  # entries are (node id, depth of that node); start at the root
while stack:
    node_id, depth = stack.pop()
    node_depth[node_id] = depth
    left, right = children_left[node_id], children_right[node_id]
    # A node whose two child ids are equal has no split below it: it is a leaf.
    if left == right:
        is_leaves[node_id] = True
        continue
    # Test node: queue both children one level deeper.
    stack.append((left, depth + 1))
    stack.append((right, depth + 1))

In [None]:
# Print a header and then one line per node, indented with one tab per depth
# level: leaves are only named, test nodes also show their split rule and the
# ids of the two nodes it leads to.
print("The binary tree structure has {!s} nodes and has the following tree structure:".format(n_nodes))
for i in range(n_nodes):
    indent = "\t" * int(node_depth[i])
    if not is_leaves[i]:
        print(
            "{pad}node={node!s} test node: go to node {left!s} "
            "if X[:, {feat!s}] <= {thr!s} else to node {right!s}.".format(
                pad=indent,
                node=i,
                left=children_left[i],
                feat=feature[i],
                thr=threshold[i],
                right=children_right[i],
            )
        )
    else:
        print("{pad}node={node!s} leaf node.".format(pad=indent, node=i))
                 

In [None]:
# Rebind `estimator` to a new, not-yet-fitted classifier constrained to depth
# 2, at most 3 leaf nodes and at least 100 samples per leaf. After this cell
# the name no longer refers to the tree fitted earlier.
estimator = DecisionTreeClassifier(
    max_depth=2,
    max_leaf_nodes=3,
    min_samples_leaf=100,
)
# Show the configured estimator as the cell output.
estimator