In [2]:
import pandas as pd  # pandas: tabular data library (the name derives from "panel data")
import numpy as np   # NumPy: short for "Numerical Python"
import matplotlib.pyplot as plt  
from sklearn import datasets   # sklearn is the import name of scikit-learn
from sklearn.tree import export_graphviz # decision-tree export helper from sklearn's tree module

In [3]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Feature table: one column per entry in iris.feature_names.
x = pd.DataFrame(data=iris.data, columns=iris.feature_names)
print(x)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]


In [4]:
# Class labels as a single-column pandas DataFrame named 'target'.

y = pd.DataFrame({'target': iris.target})
print(f"target_names :{iris.target_names}")

target_names :['setosa' 'versicolor' 'virginica']


In [5]:
# Join the feature table and the 'target' column side by side.
combined = pd.concat([x, y], axis=1)

# Keep two features plus the label, and only the first two classes
# (setosa = 0, versicolor = 1) so the problem is binary for now.
selected_columns = ['sepal length (cm)', 'petal length (cm)', 'target']
iris_data = combined.loc[combined['target'].isin([0, 1]), selected_columns]
iris_data

Unnamed: 0,sepal length (cm),petal length (cm),target
0,5.1,1.4,0
1,4.9,1.4,0
2,4.7,1.3,0
3,4.6,1.5,0
4,5.0,1.4,0
...,...,...,...
95,5.7,4.2,1
96,5.7,4.2,1
97,6.2,4.3,1
98,5.1,3.0,1


In [36]:
from sklearn.model_selection import train_test_split       # 資料切割器
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree  

In [9]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state is pinned to 0.
features = iris_data[['sepal length (cm)', 'petal length (cm)']]
labels = iris_data[['target']]

x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0)

In [11]:
# Decision tree classifier using the entropy criterion introduced earlier,
# limited to a depth of 3.
tree_params = {'criterion': 'entropy', 'max_depth': 3, 'random_state': 0}
tree = DecisionTreeClassifier(**tree_params)
tree.fit(x_train, y_train)

In [12]:
# With training done, check the predictions on the 30% of rows the model
# has not seen.

# Classes predicted from 'sepal length (cm)' and 'petal length (cm)'.
predictions = tree.predict(x_test)
predictions

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1])

In [15]:
# Ground-truth classes of the test rows, as a NumPy array.

y_test['target'].to_numpy()

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1])

In [16]:
# Count misclassified test rows.
# i is the position within the test set, v the predicted class at that position.

predicted = tree.predict(x_test)
actual = y_test['target'].values

error = 0
for i, (v, truth) in enumerate(zip(predicted, actual)):
    if v != truth:
        # Report the position and the wrong prediction.
        print(i, v)
        error += 1

print(error)

0


In [26]:
# Score the fitted tree on the held-out rows and display the value.
test_score = tree.score(X=x_test, y=y_test['target'])
test_score

1.0

In [24]:
# Write the fitted tree to 'tree.dot' for rendering with Graphviz.
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=['sepal length (cm)', 'petal length (cm)'],
    class_names=iris.target_names,
    filled=True,        # colour the nodes according to the target
    rounded=True,       # rounded node boxes
    proportion=False,   # show sample counts rather than proportions
    precision=2,        # two digits after the decimal point
)


In [34]:
# Print the prediction accuracy.
# test_score is a fraction (1.0 means every test row was classified correctly),
# so the '%' format type is used: it multiplies by 100 and appends the percent
# sign, giving "100.0%" instead of the misleading "1.0%".
print(f'Accuracy: {test_score:.1%}')

# Print the text version of the decision tree.
# The feature names are the two column labels the tree was trained on.
feature_names = ['sepal length (cm)', 'petal length (cm)']
# NOTE(review): class_names is not used in this cell; kept in case later cells reference it.
class_names = iris_data[['target']]

print(export_text(tree, feature_names=feature_names))

Accuracy: 1.0%
|--- petal length (cm) <= 2.45
|   |--- class: 0
|--- petal length (cm) >  2.45
|   |--- class: 1

