# Decision Tree Demonstration
This notebook demonstrate how decision tree can be used for a classification problem while keeping the discovered rules readable for a human being with python scikit-learn.

The chosen dataset is the Iris in which a plant species is classified based on numeric information about physical characteristics.

In [11]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from IPython.display import Image  
from sklearn.tree import export_graphviz
from subprocess import call
import pydotplus

## Step 1 - Loading data

In [12]:
raw_data = read_csv('iris.csv')
raw_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
raw_data.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [14]:
class_labels = raw_data.Species.unique()
class_labels

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [15]:
feature_columns = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']
label_columns   = ['Species']

## Step 2 - Split data in train and test

In [16]:
x = raw_data.filter(feature_columns).as_matrix()
y = raw_data.filter(label_columns).as_matrix()

'x:' + str(x.shape) + ' y:' + str(y.shape)

'x:(150, 4) y:(150, 1)'

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.70, random_state = 100)
'x_train:' + str(x_train.shape) + ' y_train:' + str(y_train.shape) + ' x_test: ' + str(x_test.shape)  + ' y_test: ' + str(y_test.shape)

'x_train:(105, 4) y_train:(105, 1) x_test: (45, 4) y_test: (45, 1)'

## Step 3 - Train decision tree 

In [18]:
iris_tree = DecisionTreeClassifier().fit(x_train, y_train)

## Step 4 - Test decision tree

In [19]:
y_pred = iris_tree.predict(x_test)
'Decision tree accuracy is ' + str(accuracy_score(y_test, y_pred))

'Decision tree accuracy is 0.955555555556'

## Step 5 - Plot decision tree

In [20]:
dot_data = export_graphviz(iris_tree, out_file=None, 
                    feature_names=feature_columns,  
                    class_names=class_labels,  
                    filled=True, rounded=True,  
                    special_characters=True)  

graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

InvocationException: GraphViz's executables not found