## [作業重點]
目前你應該已經要很清楚資料集中，資料的型態是什麼樣子囉！包含特徵 (features) 與標籤 (labels)。因此要記得未來不管什麼專案，必須要把資料清理成相同的格式，才能送進模型訓練。
今天的作業開始踏入決策樹這個非常重要的模型，請務必確保你理解模型中每個超參數的意思，並試著調整看看，對最終預測結果的影響為何

## 作業

1. 試著調整 DecisionTreeClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型的結果進行比較

In [1]:
from sklearn import datasets, metrics
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
data = datasets.load_iris()
dir(data)

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [3]:
print(data.feature_names)
print(data.data.shape)
print(np.unique(data.target))

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
(150, 4)
[0 1 2]


In [4]:
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=20)

clf = DecisionTreeClassifier(criterion = 'entropy')

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)

print("Acuuracy: ", acc)

print("Feature importance: ", clf.feature_importances_)

print('leaves number:', clf.get_n_leaves())

print('depth:', clf.get_depth())

Acuuracy:  0.9210526315789473
Feature importance:  [0.         0.01552364 0.6424512  0.34202515]
leaves number: 6
depth: 4


In [5]:
clf = DecisionTreeClassifier(splitter = 'random')

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)

print("Acuuracy: ", acc)

print("Feature importance: ", clf.feature_importances_)

print('leaves number:', clf.get_n_leaves())

print('depth:', clf.get_depth())

Acuuracy:  0.9473684210526315
Feature importance:  [0.         0.03827751 0.64739703 0.31432546]
leaves number: 9
depth: 6


In [6]:
clf = DecisionTreeClassifier(max_depth = 2)

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)

print("Acuuracy: ", acc)

print("Feature importance: ", clf.feature_importances_)

print('leaves number:', clf.get_n_leaves())

print('depth:', clf.get_depth())

Acuuracy:  0.8947368421052632
Feature importance:  [0. 0. 0. 1.]
leaves number: 3
depth: 2


In [7]:
clf = DecisionTreeClassifier(min_samples_split = 0.1)

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)

print("Acuuracy: ", acc)

print("Feature importance: ", clf.feature_importances_)

print('leaves number:', clf.get_n_leaves())

print('depth:', clf.get_depth())

Acuuracy:  0.9210526315789473
Feature importance:  [0.         0.         0.03489969 0.96510031]
leaves number: 5
depth: 3


In [8]:
clf = DecisionTreeClassifier(min_samples_leaf = 5)

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)

print("Acuuracy: ", acc)

print("Feature importance: ", clf.feature_importances_)

print('leaves number:', clf.get_n_leaves())

print('depth:', clf.get_depth())

Acuuracy:  0.8947368421052632
Feature importance:  [0.00486998 0.         0.00322674 0.99190328]
leaves number: 5
depth: 3


In [9]:
clf = DecisionTreeClassifier(min_weight_fraction_leaf = .01)

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)

print("Acuuracy: ", acc)

print("Feature importance: ", clf.feature_importances_)

print('leaves number:', clf.get_n_leaves())

print('depth:', clf.get_depth())

Acuuracy:  0.8947368421052632
Feature importance:  [0.         0.         0.54250618 0.45749382]
leaves number: 5
depth: 3


In [10]:
clf = DecisionTreeClassifier(min_impurity_decrease = .01)

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)

print("Acuuracy: ", acc)

print("Feature importance: ", clf.feature_importances_)

print('leaves number:', clf.get_n_leaves())

print('depth:', clf.get_depth())

Acuuracy:  0.9210526315789473
Feature importance:  [0.         0.         0.54493517 0.45506483]
leaves number: 4
depth: 3


In [11]:
clf = DecisionTreeClassifier(ccp_alpha = .01)

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)

print("Acuuracy: ", acc)

print("Feature importance: ", clf.feature_importances_)

print('leaves number:', clf.get_n_leaves())

print('depth:', clf.get_depth())

Acuuracy:  0.9210526315789473
Feature importance:  [0.        0.        0.0267478 0.9732522]
leaves number: 4
depth: 3


#### boston 

In [12]:
data = datasets.load_boston()
print(data.feature_names)
print(data.data.shape)
print(np.unique(data.target))

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
(506, 13)
[ 5.   5.6  6.3  7.   7.2  7.4  7.5  8.1  8.3  8.4  8.5  8.7  8.8  9.5
  9.6  9.7 10.2 10.4 10.5 10.8 10.9 11.  11.3 11.5 11.7 11.8 11.9 12.
 12.1 12.3 12.5 12.6 12.7 12.8 13.  13.1 13.2 13.3 13.4 13.5 13.6 13.8
 13.9 14.  14.1 14.2 14.3 14.4 14.5 14.6 14.8 14.9 15.  15.1 15.2 15.3
 15.4 15.6 15.7 16.  16.1 16.2 16.3 16.4 16.5 16.6 16.7 16.8 17.  17.1
 17.2 17.3 17.4 17.5 17.6 17.7 17.8 17.9 18.  18.1 18.2 18.3 18.4 18.5
 18.6 18.7 18.8 18.9 19.  19.1 19.2 19.3 19.4 19.5 19.6 19.7 19.8 19.9
 20.  20.1 20.2 20.3 20.4 20.5 20.6 20.7 20.8 20.9 21.  21.1 21.2 21.4
 21.5 21.6 21.7 21.8 21.9 22.  22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8
 22.9 23.  23.1 23.2 23.3 23.4 23.5 23.6 23.7 23.8 23.9 24.  24.1 24.2
 24.3 24.4 24.5 24.6 24.7 24.8 25.  25.1 25.2 25.3 26.2 26.4 26.5 26.6
 26.7 27.  27.1 27.5 27.9 28.  28.1 28.2 28.4 28.5 28.6 28.7 29.  29.1
 29.4 29.6 29.8 29.9 30.1 30.3 30.5 30.7 30.8 31.  3

In [13]:
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=20)

rg = DecisionTreeRegressor()

rg.fit(x_train, y_train)

y_pred = rg.predict(x_test)

mse = metrics.mean_squared_error(y_test, y_pred)

print("MSE: ", mse)

print("Feature importance: ", rg.feature_importances_)

print('leaves number:', rg.get_n_leaves())

print('depth:', rg.get_depth())

MSE:  40.92346456692914
Feature importance:  [4.93714141e-02 8.13564246e-04 3.09783599e-03 1.37246492e-04
 9.41801239e-03 6.07024714e-01 1.28471745e-02 7.44716310e-02
 1.72502970e-03 9.55463238e-03 1.84081144e-02 5.05956246e-03
 2.08071068e-01]
leaves number: 354
depth: 17


In [14]:
rg = DecisionTreeRegressor(criterion = 'mae')

rg.fit(x_train, y_train)

y_pred = rg.predict(x_test)

mse = metrics.mean_squared_error(y_test, y_pred)

print("MSE: ", mse)

print("Feature importance: ", rg.feature_importances_)

print('leaves number:', rg.get_n_leaves())

print('depth:', rg.get_depth())

MSE:  16.707874015748033
Feature importance:  [0.11164065 0.00730768 0.01660114 0.00154891 0.03316256 0.40247826
 0.03435403 0.07446682 0.00742682 0.0180309  0.0327654  0.02466341
 0.23555344]
leaves number: 355
depth: 23


In [15]:
rg = DecisionTreeRegressor(min_impurity_decrease = 1)

rg.fit(x_train, y_train)

y_pred = rg.predict(x_test)

mse = metrics.mean_squared_error(y_test, y_pred)

print("MSE: ", mse)

print("Feature importance: ", rg.feature_importances_)

print('leaves number:', rg.get_n_leaves())

print('depth:', rg.get_depth())

MSE:  21.69077210711671
Feature importance:  [0.04810468 0.         0.         0.         0.         0.68227455
 0.         0.06528276 0.         0.         0.         0.
 0.204338  ]
leaves number: 8
depth: 4
