# 7장. 앙상블 학습과 랜덤 포레스트

## 7.1 투표 기반 분류기

#### Moon dataset

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Generate the two-moons toy dataset: 500 samples with noise 0.30, fixed seed.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)
# Split the data into train and test sets (default split sizes, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

#### Logistic Regression, Random Forest, SVM 직접투표 분류기

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different base learners, all seeded so repeated runs agree.

# solver: the algorithm used to solve the optimization problem;
# "lbfgs" is the default and handles multinomial loss.
log_clf = LogisticRegression(
    solver="lbfgs",
    random_state=42,
)
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# gamma: the kernel coefficient, which determines how far the influence of a
# single training sample reaches; "scale" (the default) uses
# 1 / (n_features * X.var()).
svm_clf = SVC(
    gamma="scale",
    random_state=42,
)

# Hard voting: the ensemble predicts the class chosen by most estimators.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='hard',
)

In [3]:
# Train the voting ensemble on the training split.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

#### 각 분류기 성능 확인

In [4]:
from sklearn.metrics import accuracy_score

# Fit each base classifier and the ensemble in turn, then print its class
# name next to its accuracy on the test split.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912


#### Logistic Regression, Random Forest, SVM 간접투표 분류기

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Same three base learners as the hard-voting example, rebuilt from scratch.
log_clf = LogisticRegression(
    solver="lbfgs",
    random_state=42,
)
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# probability=True enables class-probability estimates on the SVC, which
# soft voting needs in order to average probabilities.
svm_clf = SVC(
    gamma="scale",
    random_state=42,
    probability=True,
)

# Soft voting: the ensemble predicts from the averaged class probabilities.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='soft',
)

In [6]:
# Train the voting ensemble on the training split.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(probability=True, random_state=42))],
                 voting='soft')

#### 각 분류기 성능 확인

In [7]:
from sklearn.metrics import accuracy_score

# Fit each base classifier and the ensemble in turn, then print its class
# name next to its accuracy on the test split.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


## 7.2 배깅과 페이스팅

#### 배깅

In [8]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging: 500 decision trees, each trained on 100 training instances drawn
# with replacement (bootstrap=True).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    random_state=42,
)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

In [9]:
from sklearn.metrics import accuracy_score
# Accuracy of the most recent predictions (y_pred) on the test split.
print(accuracy_score(y_test, y_pred))

0.904


In [10]:
# Baseline for comparison: one decision tree trained on the whole training split.
tree_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))

0.856


#### 페이스팅

In [11]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Pasting: same setup as bagging, but each tree's 100 training instances are
# drawn without replacement (bootstrap=False).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    bootstrap=False,
    random_state=42,
)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score
# Accuracy of the most recent predictions (y_pred) on the test split.
print(accuracy_score(y_test, y_pred))

0.92


#### oob 평가

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging with out-of-bag (OOB) evaluation.
# oob_score=True scores the ensemble on the training instances that each
# estimator did not sample, so no separate validation set is needed.
# n_jobs sets how many jobs run in parallel; more cores generally speed up
# training.
# random_state is fixed, like the other ensembles in this notebook, so the OOB
# score and decision function are reproducible.
# NOTE: outputs recorded before the seed was added may differ from a fresh run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=1,
    oob_score=True,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.92

In [14]:
from sklearn.metrics import accuracy_score
# Test-split accuracy of the fitted bagging model, for comparison with its
# out-of-bag score.
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.904


In [15]:
# Out-of-bag decision function: one row per training instance, one column per
# class, computed from the estimators that did not train on that instance.
bag_clf.oob_decision_function_

array([[0.37563452, 0.62436548],
       [0.37903226, 0.62096774],
       [1.        , 0.        ],
       [0.008     , 0.992     ],
       [0.01574803, 0.98425197],
       [0.11842105, 0.88157895],
       [0.3814433 , 0.6185567 ],
       [0.06806283, 0.93193717],
       [0.94344473, 0.05655527],
       [0.83838384, 0.16161616],
       [0.49869452, 0.50130548],
       [0.04615385, 0.95384615],
       [0.74338624, 0.25661376],
       [0.845953  , 0.154047  ],
       [0.93646409, 0.06353591],
       [0.06297229, 0.93702771],
       [0.02368421, 0.97631579],
       [0.92030848, 0.07969152],
       [0.67292225, 0.32707775],
       [0.93861893, 0.06138107],
       [0.0475    , 0.9525    ],
       [0.23796791, 0.76203209],
       [0.85789474, 0.14210526],
       [0.99487179, 0.00512821],
       [0.96286472, 0.03713528],
       [0.        , 1.        ],
       [0.94750656, 0.05249344],
       [0.99738903, 0.00261097],
       [0.02864583, 0.97135417],
       [0.76081425, 0.23918575],
       ...])

## 7.4 랜덤 포레스트

#### 랜덤 포레스트

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to 16 leaf nodes.
# n_jobs=-1 uses all available CPU cores.
# random_state is fixed, like the other seeded models in this notebook, so the
# forest and its predictions are reproducible.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

#### 특성 중요도

In [17]:
from sklearn.datasets import load_iris
# Train a random forest on the iris dataset and print each feature's
# importance score.
# random_state is fixed so the printed importances are reproducible.
# NOTE: outputs recorded before the seed was added may differ from a fresh run.
iris = load_iris()
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09315427892870644
sepal width (cm) 0.023255050143990968
petal length (cm) 0.41271895472588577
petal width (cm) 0.4708717162014168


## 7.5 부스팅

#### 에이다 부스트

In [18]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 200 decision stumps (max_depth=1) with learning rate 0.5.
# algorithm: 'SAMME.R' is the real boosting algorithm and 'SAMME' the discrete
# one; 'SAMME.R' generally converges faster than 'SAMME'.
# NOTE(review): newer scikit-learn releases deprecate the "SAMME.R" option;
# confirm against the installed version.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5,
    random_state=42,
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200, random_state=42)