In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## RandomForest classifier

In [14]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and split it into train/test parts.
cancer = load_breast_cancer()
X_all, y_all = cancer.data, cancer.target

# stratify=y_all asks for the same class ratio in both splits;
# random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, stratify=y_all, random_state=0)

Reference — scikit-learn `RandomForestClassifier` documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [56]:
from sklearn.ensemble import RandomForestClassifier

# n_estimators: the number of trees in the forest.
# The more trees, the more stable the model; roughly 100-200 is enough.
# Only 5 trees are built here; random_state=0 makes the fit repeatable.
forest = RandomForestClassifier(n_estimators = 5, random_state = 0)
# Left as the last expression so the fitted estimator's repr is displayed.
forest.fit(X_train,y_train)

RandomForestClassifier(n_estimators=5, random_state=0)

In [58]:
# Forest predictions for the held-out test rows.
# NOTE(review): despite the name `y_train_hat`, these are predictions on
# X_test, not on the training set; consider renaming (e.g. y_test_hat_forest)
# after confirming no other cell reads this variable.
y_train_hat = forest.predict(X_test)

In [59]:
# Predict with only the first fitted tree of the forest.
# The displayed array is float-valued (1., 0., ...).
forest.estimators_[0].predict(X_test)

array([1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1.,
       1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 0., 1., 0., 1., 0.])

In [60]:
# Show the individual fitted trees that make up the forest
# (one DecisionTreeClassifier per estimator, each with its own random_state).
forest.estimators_

[DecisionTreeClassifier(max_features='auto', random_state=209652396),
 DecisionTreeClassifier(max_features='auto', random_state=398764591),
 DecisionTreeClassifier(max_features='auto', random_state=924231285),
 DecisionTreeClassifier(max_features='auto', random_state=1478610112),
 DecisionTreeClassifier(max_features='auto', random_state=441365315)]

In [61]:
# Show the fitted forest's feature importances (one value per input feature).
forest.feature_importances_

array([1.70331879e-02, 2.90774085e-02, 4.73340018e-02, 4.75465100e-03,
       1.91909725e-03, 2.11078205e-05, 1.91131044e-01, 1.52276765e-01,
       0.00000000e+00, 4.95938490e-03, 2.80632490e-03, 1.06793039e-02,
       2.88125252e-03, 1.03078322e-01, 9.02256142e-03, 3.77738940e-03,
       8.03167421e-03, 2.57435600e-03, 1.01780260e-02, 3.85579773e-03,
       2.40653064e-02, 9.28120486e-03, 1.65005548e-01, 0.00000000e+00,
       5.19323647e-03, 4.29767190e-04, 1.23300192e-02, 1.62722820e-01,
       9.96945894e-03, 5.61098338e-03])

## Build random forest with Decision tree classifier

#### training phase

In [71]:
from sklearn.tree import DecisionTreeClassifier

n_estimators = 100

# Seeded generator: without it both the bootstrap draws and the per-tree
# feature sub-sampling change on every run, so none of the accuracies
# below could be reproduced after Restart Kernel -> Run All.
bootstrap_rng = np.random.RandomState(0)
max_tree_seed = np.iinfo(np.int32).max

# Will hold the fitted randomized trees, one per bootstrap sample.
random_trees = []
n_train = X_train.shape[0]
for i in range(n_estimators):
    # Bootstrap sampling: draw n_train row indices *with replacement*,
    # so each tree sees a resampled version of the training set.
    idx = bootstrap_rng.choice(n_train, n_train, replace=True)
    X_train_base = X_train[idx, :]
    y_train_base = y_train[idx]

    # Train one randomized tree. max_features='sqrt' is the hyperparameter
    # to tune here: each split considers only a random subset of features.
    # The tree gets its own seed drawn from the shared generator so the
    # whole ensemble is deterministic while the trees still differ.
    rt = DecisionTreeClassifier(
        max_features='sqrt',
        random_state=bootstrap_rng.randint(max_tree_seed),
    )
    rt.fit(X_train_base, y_train_base)

    # Store the fitted model in the list.
    random_trees.append(rt)

#### test phase

In [74]:
# Test phase: ask every random tree for its class prediction on X_test.
# Stacking gives one row per tree; the transpose turns that into one row
# per test sample and one column per tree.
y_test_hats = np.stack(
    [random_trees[tree_idx].predict(X_test) for tree_idx in range(n_estimators)]
).T

In [75]:
# Sanity check: expected (n_test_samples, n_estimators) after the transpose.
y_test_hats.shape

(143, 100)

In [76]:
# Hard (majority) voting over the per-tree predictions.
from scipy import stats

# stats.mode returns the most frequent value; axis=1 takes it along each
# row, i.e. across the trees for one test sample. The squeeze drops any
# leftover singleton axis so the result is a flat vector of labels.
y_test_hat_voted = np.squeeze(stats.mode(y_test_hats, axis=1).mode)

In [79]:
# Display the majority-vote label chosen for each test sample.
y_test_hat_voted

array([1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0])

In [80]:
# Test accuracy of the hard-voted (majority) ensemble prediction.
accuracy_score(y_test, y_test_hat_voted)

0.951048951048951

In [82]:
# Score each tree's own column of test predictions so the individual
# trees can be compared against the voted ensemble.
for tree_idx in range(n_estimators):
    single_tree_acc = accuracy_score(y_test, y_test_hats[:, tree_idx])
    print(single_tree_acc)

# Conclusion: higher accuracy (with voting).

0.8811188811188811
0.9090909090909091
0.8881118881118881
0.9090909090909091
0.9370629370629371
0.916083916083916
0.9440559440559441
0.9090909090909091
0.916083916083916
0.9230769230769231
0.916083916083916
0.8951048951048951
0.9090909090909091
0.9020979020979021
0.8951048951048951
0.916083916083916
0.8951048951048951
0.916083916083916
0.916083916083916
0.9370629370629371
0.9090909090909091
0.8951048951048951
0.9300699300699301
0.9440559440559441
0.8741258741258742
0.9090909090909091
0.9090909090909091
0.8951048951048951
0.9090909090909091
0.9230769230769231
0.9230769230769231
0.9230769230769231
0.9230769230769231
0.9020979020979021
0.9370629370629371
0.8881118881118881
0.9230769230769231
0.9230769230769231
0.9090909090909091
0.9440559440559441
0.8951048951048951
0.9090909090909091
0.8881118881118881
0.9020979020979021
0.9230769230769231
0.9090909090909091
0.9230769230769231
0.9090909090909091
0.9020979020979021
0.916083916083916
0.951048951048951
0.8881118881118881
0.8531468531468531
0

### **Hard voting** (Majority voting) vs **Soft voting** (Averaging probabilities)

### Two types of voting

**Hard voting** : Predict class label as the class that represents the majority (mode) of the class labels predicted by each individual classifier.    
- 우리가 이미 사용한 것(위의 코드)


**Soft voting** : Predict class label using the **averaging probabilities** provided by each individual classifier.
- 확률 사용

In [86]:
# Soft voting, step 1: collect class probabilities instead of hard labels,
# i.e. call .predict_proba where the hard-voting cell called .predict.
# Each tree returns a 2-D (sample, class) array; np.stack adds a leading
# per-tree axis in front of it.
y_test_probs = np.stack(
    [random_trees[tree_idx].predict_proba(X_test) for tree_idx in range(n_estimators)]
)

In [85]:
# Inspect the stacked probabilities: axis 0 is the tree, then sample, then class.
y_test_probs

array([[[0., 1.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]],

       [[0., 1.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]],

       [[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]],

       ...,

       [[0., 1.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]],

       [[0., 1.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]],

       [[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]]])

In [88]:
# Soft voting, step 2.
# Average the probabilities over the tree axis (axis 0), leaving one
# averaged probability row per test sample.
y_test_probs_mean = np.mean(y_test_probs, axis=0)

# Pick, for every sample, the column with the largest averaged probability.
# The column index is used directly as the predicted label.
y_test_hat = np.argmax(y_test_probs_mean, axis=1)

In [89]:
# Test accuracy of the soft-voted (averaged-probability) prediction.
accuracy_score(y_test, y_test_hat)

0.951048951048951