## 결정 트리 학습과 시각화

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier # 트리 모듈 내의 결정 트리 분류 이용 

iris = load_iris()
X = iris.data[:, 2:] # 꽃잎 길이와 너비만 가져옴 
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y) # 학습


DecisionTreeClassifier(max_depth=2, random_state=42)

export_graphiz() 함수 -> 그래프를 .dot 파일로 출력하여 시각화 가능

In [2]:
''' 
import os
from graphviz import Source
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file = os.path.join("./", "iris_tree.dot"),
    feature_names = iris.feature_names[2:],
    class_names = iris.target_names,
    rounded = True,
    filled = True
)

Source.from_file(os.path.join("./", "iris_tree.dot"))
'''

' \nimport os\nfrom graphviz import Source\nfrom sklearn.tree import export_graphviz\n\nexport_graphviz(\n    tree_clf,\n    out_file = os.path.join("./", "iris_tree.dot"),\n    feature_names = iris.feature_names[2:],\n    class_names = iris.target_names,\n    rounded = True,\n    filled = True\n)\n\nSource.from_file(os.path.join("./", "iris_tree.dot"))\n'

## 앙상블

### hard voting 분류기

In [17]:
# 데이터 홀드아웃
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=True)


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)

vot_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting = 'hard' # hard or soft ? 
)

vot_clf.fit(X_train, y_train)


# 학습 결과 확인
from sklearn.metrics import accuracy_score

for clf in ( log_clf, rnd_clf, svm_clf, vot_clf ) :
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
    # 보통은... 앙상블이 확률 더 높음 ^^ ... 

LogisticRegression 0.9666666666666667
RandomForestClassifier 0.9666666666666667
SVC 0.9666666666666667
VotingClassifier 0.9666666666666667


### bagging & pasting

In [21]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 500,
    max_samples = 50,
    bootstrap = True, # 중복 허용
    oob_score = True, # oob score 자동 평가
    random_state = 42
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(bag_clf.__class__.__name__, accuracy_score(y_test, y_pred))

BaggingClassifier 0.9666666666666667


### 랜덤 포레스트 RandomForest

In [23]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(
    n_estimators = 500,
    max_leaf_nodes = 16,
    random_state = 42
)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

### Extra Trees

In [24]:
from sklearn.ensemble import ExtraTreesClassifier

rnd_clf = ExtraTreesClassifier(
    n_estimators = 500,
    max_leaf_nodes = 16,
    random_state = 42
)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

### 특성 중요도

In [25]:
# iris 데이터 셋에 랜덤포레스트 훈련 후 각 특성의 중요도를 출력하는 코드
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(
    n_estimators = 500,
    random_state = 42
)
rnd_clf.fit( iris["data"], iris["target"] )
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_) :
    print(name, score) # 가장 중요한 특성 = 꽃잎의 길이와 너비

sepal length (cm) 0.11249225099876375
sepal width (cm) 0.02311928828251033
petal length (cm) 0.4410304643639577
petal width (cm) 0.4233579963547682


### 에이다 부스트

In [26]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators = 200, # 200개의 아주 얕은 결정 트리를 기반으로 ... 
    algorithm = "SAMME.R",
    learning_rate = 0.5,
    random_state = 42
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200, random_state=42)

### 그레디언트 부스팅

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor( 
    max_depth = 2,
    n_estimators = 3,
    learning_rate = 1.0,
    random_state = 42
)
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3,
                          random_state=42)

### GBRT 최적 트리 수 찾기 : staged_predict()

In [29]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

gbrt = GradientBoostingRegressor( 
    max_depth = 2,
    n_estimators = 120,
    random_state = 42
)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_test, y_pred) 
          for y_pred in gbrt.staged_predict(X_test)]

bst_n_estimators = np.argmin(errors) + 1
gbrt_best = GradientBoostingRegressor(
    max_depth = 2,
    n_estimators = bst_n_estimators,
    random_state = 42
)
gbrt.fit(X_train, y_train)


GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)

### GBRT 최적 트리 수 찾기 : warm_start = True로 실제 훈련 중지 

In [None]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)

min_val_error = float("inf")
error_going_up = 0

for n_estimators in range(1, 120) :
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)
    
    if val_error < min_val_error : 
        min_val_error = val_error 
        error_going_up = 0
    else : 
        error_going_up += 1
        if error_going_up == 5 :
            break # early end 