In [1]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
#Reading data from CSV file
my_data = pd.read_csv("Iris.csv", delimiter=",")
my_data[0:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
#Defining data and label
X = my_data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'petal_width']].values
X[0:5]

array([[5.1, 3.5, 1.4, 0.2, 0.2],
       [4.9, 3. , 1.4, 0.2, 0.2],
       [4.7, 3.2, 1.3, 0.2, 0.2],
       [4.6, 3.1, 1.5, 0.2, 0.2],
       [5. , 3.6, 1.4, 0.2, 0.2]])

In [4]:
#Defining data and label
y = my_data["species"]
y[0:5]

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: species, dtype: object

### Setting up the Decision Tree 

In [52]:
from sklearn.model_selection import train_test_split

In [80]:
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=2)

In [81]:
print('There are {} samples in the training set and {} samples in the test set'.format(X_trainset.shape[0], X_testset.shape[0]))

There are 105 samples in the training set and 45 samples in the test set


### Modelling 

In [82]:
speciesTree = DecisionTreeClassifier(criterion="gini", max_depth = 4)
speciesTree # it shows the default parameters

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [83]:
speciesTree.fit(X_trainset,y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

### Prediction 

In [84]:
predTree = speciesTree.predict(X_testset)

In [85]:
print (predTree [0:5])
print (y_testset [0:5])

['Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-setosa']
6         Iris-setosa
3         Iris-setosa
113    Iris-virginica
12        Iris-setosa
24        Iris-setosa
Name: species, dtype: object


### Evaluation 

In [86]:
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

DecisionTrees's Accuracy:  0.9555555555555556


In [77]:
#Print performance
print('The accuracy of the Decision Tree classifier on training data is {:.2f}'.format(speciesTree.score(X_trainset, y_trainset)))
print('The accuracy of the Decision Tree classifier on test data is {:.2f}'.format(speciesTree.score(X_testset, y_testset)))

The accuracy of the Decision Tree classifier on training data is 1.00
The accuracy of the Decision Tree classifier on test data is 0.96


### Visualization 

In [16]:
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline 

In [17]:
dot_data = StringIO()
filename = "speciestree.png"
featureNames = my_data.columns[0:5]
targetNames = my_data["species"].unique().tolist()
out=tree.export_graphviz(speciesTree,feature_names=featureNames, out_file=dot_data, class_names= np.unique(y_trainset), filled=True,  special_characters=True,rotate=False)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

InvocationException: GraphViz's executables not found