In [1]:
# first we load the dataset we are going to use
# in this case the Iris dataset
from sklearn.datasets import load_iris

# and load the decision tree class from scikit-learn
from sklearn.tree import DecisionTreeClassifier

# import pandas
import pandas as pd

# import graphic library
from sklearn.tree import export_graphviz

In [7]:
# fetch the iris dataset; the returned object exposes the measurements
# as `iris.data` and the class labels as `iris.target`
iris = load_iris()

# human-readable headers for the four measurement columns
column_labels = [
    'Sepal length (cm)',
    'Sepal width (cm)',
    'Petal length (cm)',
    'Petal width (cm)',
]

# wrap the raw measurements in a dataframe purely for nicer display
iris_df = pd.DataFrame(iris.data, columns=column_labels)

# preview the first 5 rows
iris_df.head()

Unnamed: 0,Sepal length (cm),Sepal width (cm),Petal length (cm),Petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
# the model is trained on the petal measurements only, i.e. every
# column from index 2 onwards (petal length and petal width)
petal_columns = slice(2, None)
X, y = iris.data[:, petal_columns], iris.target

# show the same two columns from the dataframe
iris_df.iloc[:, petal_columns]

Unnamed: 0,Petal length (cm),Petal width (cm)
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


In [9]:
# now we train a decision tree classifier, limited to two levels of splits
# random_state is pinned because scikit-learn shuffles the candidate features
# at every split; without a fixed seed, equally good splits may be chosen
# differently from run to run, so the fitted tree would not be reproducible
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

In [10]:
# classify one arbitrary, hand-picked flower to sanity-check tree_clf
# meaning of the returned class label:
#   0 -> iris setosa
#   1 -> iris versicolor
#   2 -> iris virginica
petal_sample = [[2.85, 1.2]]  # petal length 2.85 cm, petal width 1.2 cm
tree_clf.predict(petal_sample)

array([1])

In [11]:
# the classifier can also report a probability for each class
# (returned as fractions, one per class, in label order 0, 1, 2):
#   ~0%    for setosa
#   ~90.7% for versicolor
#   ~9.3%  for virginica
petal_sample = [[2.85, 1.2]]  # petal length 2.85 cm, petal width 1.2 cm
tree_clf.predict_proba(petal_sample)

array([[0.        , 0.90740741, 0.09259259]])

In [5]:
# dump the trained tree to a Graphviz .dot file so it can be visualized
petal_feature_names = iris.feature_names[2:]  # names of the two features the tree was trained on
export_graphviz(
    tree_clf,
    out_file='iris_tree.dot',           # where the .dot description is written
    feature_names=petal_feature_names,  # labels for the split conditions
    class_names=iris.target_names,      # labels for the predicted classes
    filled=True,
    rounded=True,
)