Случайный лес: ансамбли деревьев = бэггинг фичей и объектов + блендинг

In [None]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

![ensemble_idea](forrest_idea.png)

In [None]:
# Load the scikit-learn digits dataset with all 10 classes; return_X_y=True
# makes the loader return the (features, labels) pair instead of a Bunch.
X, y = load_digits(return_X_y=True, n_class=10)

In [None]:
# Display the dimensions of the feature matrix.
X.shape

## 1. Случайные подмножества признаков

In [None]:
# Hold out 33% of the data for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Class-probability predictions of every tree, collected for later averaging.
predict_proba_models = []

for state in range(5):
    # Shallow tree that looks at 4 randomly chosen features per split.
    # The seed changes on every iteration, so the trees differ only in
    # which features they get to consider (all see the same training rows).
    model = DecisionTreeClassifier(max_depth=2, max_features=4, random_state=state)
    model.fit(X_train, y_train)

    predict_proba_models.append(model.predict_proba(X_test))

    # Report how this single tree performs on its own and which features it
    # actually split on (features with non-zero importance).
    y_pred = model.predict(X_test)
    print(f'Точность классификатора: {accuracy_score(y_test, y_pred):.3f}')
    print(f'Признаки по которым проходило разделение: {np.nonzero(model.feature_importances_)}')
    print('\n-------\n')

In [None]:
# Stack the per-tree probability matrices into a single array whose first
# axis indexes the trees.
predict_proba_models = np.array(predict_proba_models)

print(predict_proba_models.shape)

# Blend the trees by averaging their predicted probabilities. mean() divides
# by the actual number of stacked models, so the result stays correct if the
# number of trees in the ensemble is changed.
mean_predict_proba = predict_proba_models.mean(axis=0)
# The winning column index is used directly as the predicted label
# (assumes class labels are 0..k-1 in column order -- TODO confirm).
mean_predict = np.argmax(mean_predict_proba, axis=1)

print(accuracy_score(y_test, mean_predict))

## 2. Случайные подмножества объектов

In [None]:
# Same 67/33 split and seed as in the previous section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Start a fresh collection of per-tree probability predictions.
predict_proba_models = []

def bootstrap_indices(random_state, n_samples):
    """Draw a bootstrap sample of row indices.

    Parameters
    ----------
    random_state : seed accepted by ``np.random.RandomState``
        Makes the draw reproducible: the same seed yields the same indices.
    n_samples : int
        Both the number of indices to draw and the exclusive upper bound
        of the index range.

    Returns
    -------
    numpy.ndarray
        ``n_samples`` integers drawn uniformly, with replacement, from
        ``[0, n_samples)``.
    """
    rng = np.random.RandomState(random_state)
    return rng.randint(0, n_samples, n_samples)

for state in range(5):
    # Resample the training rows with replacement; only the resampling seed
    # changes from one iteration to the next.
    ind = bootstrap_indices(state, X_train.shape[0])
    X_train_ = X_train[ind]
    y_train_ = y_train[ind]

    # The tree's own seed is held constant, so differences between the trees
    # come from the resampled training data.
    model = DecisionTreeClassifier(max_depth=2, max_features=4, random_state=2)
    model.fit(X_train_, y_train_)

    predict_proba_models.append(model.predict_proba(X_test))

    # Per-tree report: stand-alone accuracy and the features it split on.
    y_pred = model.predict(X_test)
    print(f'Точность классификатора: {accuracy_score(y_test, y_pred):.3f}')
    print(f'Признаки по которым проходило разделение: {np.nonzero(model.feature_importances_)}')
    print('\n-------\n')

In [None]:
# Stack the per-tree probability matrices into a single array whose first
# axis indexes the trees.
predict_proba_models = np.array(predict_proba_models)

print(predict_proba_models.shape)

# Blend the trees by averaging their predicted probabilities. mean() divides
# by the actual number of stacked models, so the result stays correct if the
# number of bootstrap rounds is changed.
mean_predict_proba = predict_proba_models.mean(axis=0)
# The winning column index is used directly as the predicted label
# (assumes class labels are 0..k-1 in column order -- TODO confirm).
mean_predict = np.argmax(mean_predict_proba, axis=1)

print(accuracy_score(y_test, mean_predict))

## 3. Случайные признаки + Сэмплинг данных

In [None]:
# Hold out 33% of the data for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=1)

#-------------------------------------------------------------------------------------------
# TRAINING

models = []
rs = list(range(5))

for state in rs:

    # Each tree is fitted on its own bootstrap resample of the training rows ...
    ind = bootstrap_indices(state, X_train.shape[0])
    X_train_, y_train_ = X_train[ind], y_train[ind]

    # ... and gets its own seed, so the random feature choices differ as well.
    model = DecisionTreeClassifier(max_features=32,
                                   splitter='best',
                                   max_depth=4,
                                   random_state=state + 10)

    model.fit(X_train_, y_train_)
    models.append(model)


#-------------------------------------------------------------------------------------------
# PREDICTION

predict_proba_models = []


for model in models:

    # Keep the probabilities for blending, and report the tree's own accuracy.
    y_pred = model.predict_proba(X_test)
    predict_proba_models.append(y_pred)
    y_pred = model.predict(X_test)

    print('Точность классификатора: {:.3f}'.format(accuracy_score(y_test, y_pred)))


# Stack into one array whose first axis indexes the trained trees.
predict_proba_models = np.array(predict_proba_models)

print(predict_proba_models.shape)

# Average over the model axis. mean() divides by the actual number of trained
# trees instead of a hard-coded 5, so changing `rs` cannot skew the average.
mean_predict_proba = predict_proba_models.mean(axis=0)
# The winning column index is used directly as the predicted label
# (assumes class labels are 0..k-1 in column order -- TODO confirm).
mean_predict = np.argmax(mean_predict_proba, axis=1)

print(accuracy_score(y_test, mean_predict))

## 4. Случайный лес 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Same 67/33 split and seed as in the sections above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Library random forest with the same tree count, depth and max_features as
# the hand-built ensemble of section 3.
model = RandomForestClassifier(
    n_estimators=5,
    max_depth=4,
    max_features=32,
    random_state=1,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
# Gather the random_state attribute of every tree inside the fitted forest.
rs = [tree.random_state for tree in model.estimators_]

In [None]:
# Display the per-tree seeds collected from the forest.
rs

In [None]:
# Display the list of fitted trees that make up the forest.
model.estimators_

In [None]:
# from sklearn.tree import export_graphviz

# for est,i in zip(model.estimators_,range(5)):
#     export_graphviz(est, out_file='tree{}.dot'.format(i), filled=True)

In [None]:
# !dot -Tpng 'tree0.dot' -o 'tree0.png'

### 4.1. Параметры Случайного леса: число деревьев

In [None]:
# Display the dimensions of the training feature matrix.
X_train.shape

In [None]:
# Display the dimensions of the test feature matrix.
X_test.shape

In [None]:
# Accuracy on the train / test split for every forest size tried below.
acc_train, acc_test = [], []

# Odd forest sizes 1, 3, ..., 99; depth and max_features stay fixed so that
# only the number of trees varies.
for n in tqdm_notebook(range(1, 100, 2)):
    model = RandomForestClassifier(
        n_estimators=n,
        max_depth=2,
        max_features=4,
        random_state=1,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc_train.append(accuracy_score(y_train, model.predict(X_train)))
    acc_test.append(accuracy_score(y_test, y_pred))

In [None]:
# Set the default matplotlib font size to 20.
plt.rc('font', size=20)

# x values: the same forest sizes the sweep iterated over.
tree_counts = list(range(1, 100, 2))

# Error rate is the complement of accuracy.
plt.figure(figsize=(10, 5))
plt.plot(tree_counts, 1 - np.array(acc_train), c='r', label='Train error')
plt.plot(tree_counts, 1 - np.array(acc_test), c='b', label='Test error')
plt.xlabel('Число деревьев')
plt.ylabel('Ошибка предсказания')
plt.legend();

### 4.2. Параметры Случайного леса: глубина дерева

In [None]:
# Accuracy on the train / test split for every tree depth tried below.
acc_train, acc_test = [], []

# Depths 1..19; forest size and max_features stay fixed so that only the
# depth of the trees varies.
for n in tqdm_notebook(range(1, 20, 1)):
    model = RandomForestClassifier(
        n_estimators=5,
        max_depth=n,
        max_features=4,
        random_state=1,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc_train.append(accuracy_score(y_train, model.predict(X_train)))
    acc_test.append(accuracy_score(y_test, y_pred))

In [None]:
# x values: the same depths the sweep iterated over.
depths = list(range(1, 20, 1))

# Error rate is the complement of accuracy.
plt.figure(figsize=(10, 5))
plt.plot(depths, 1 - np.array(acc_train), c='r', label='Train error')
plt.plot(depths, 1 - np.array(acc_test), c='b', label='Test error')
plt.xlabel('Глубина дерева (Tree depth)')
plt.ylabel('Ошибка предсказания')
plt.legend();

### 4.3. Параметры Случайного леса: количество признаков

In [None]:
# Accuracy on the train / test split for every max_features value tried below.
acc_train, acc_test = [], []

# max_features 1..63; forest size and depth stay fixed.
# NOTE(review): if X has 64 columns (TODO confirm via X.shape), the setting
# that uses every feature is never evaluated; the plotting cell hard-codes the
# same range, so both would have to change together.
for n in tqdm_notebook(range(1, 64)):
    model = RandomForestClassifier(
        n_estimators=15,
        max_depth=2,
        max_features=n,
        random_state=1,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc_train.append(accuracy_score(y_train, model.predict(X_train)))
    acc_test.append(accuracy_score(y_test, y_pred))

In [None]:
# Train / test error (1 - accuracy) against the max_features value used in
# the sweep; the x range mirrors the one the sweep iterated over.
plt.figure(figsize=(10,5))
plt.plot(list(range(1,64)), 1-np.array(acc_train), c='r', label='Train error')
plt.plot(list(range(1,64)), 1-np.array(acc_test), c='b', label='Test error')
# Axis labels were missing on this plot only; added to match the other
# hyper-parameter plots in this notebook.
plt.xlabel('Число признаков')
plt.ylabel('Ошибка предсказания')

plt.legend();

## Деревья на случайных признаках

Зачем выбирать признак в узле случайно (с некоторой вероятностью)?

Ответ: это помогает сильно скоррелированным признакам
получить примерно одинаковый вес (если всегда брать argmax, то какой-то из таких признаков может сильно просесть).

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Same 67/33 split and seed as in the sections above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Extremely randomized trees; bootstrap=True additionally trains every tree
# on a bootstrap sample rather than on the full training set.
model = ExtraTreesClassifier(
    n_estimators=5,
    max_depth=2,
    max_features=4,
    bootstrap=True,
    random_state=1,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))