# 앙상블 학습(Ensemble Learning)
- 여러 개의 분류기를 생성하고 예측을 결합하여 보다 정확한 최종 예측을 도출하는 기법
    - 보팅(Voting)
    - 배깅(Bagging)
    - 부스팅(Boosting)

## 부스팅(Boosting)
- 여러 개의 약한 학습기를 순차적으로 학습/예측하면서 잘못 예측한 데이터에 가중치 부여를 통해 오류를 개선해 나가면서 학습하는 방식

# GBM(Gradient Boosting Machine)
- 경사 하강법(Gradient Descent)을 이용해 손실 함수를 최소화하는 방향으로 가중치를 업데이트하며, 각 약한 학습기는 이전 단계까지의 오류(잔차)를 줄이도록 순차적으로 학습된다

In [4]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score


def get_new_feature_name_df(old_feature_name_df):
    """Return a copy of the feature-name table with duplicate names made unique.

    The first occurrence of each name in ``column_name`` is kept unchanged;
    every later occurrence gets a ``_<k>`` suffix, where ``k`` is the number of
    earlier rows carrying the same name (``a, a, a`` -> ``a, a_1, a_2``).

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table with a string ``column_name`` column. Any other columns are
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The input columns plus an integer ``dup_cnt`` column, in the original
        row order and with a fresh 0..n-1 index. The input frame is not
        modified.
    """
    # cumcount() numbers the rows inside each group of equal names: 0, 1, 2, ...
    dup_cnt = old_feature_name_df.groupby('column_name').cumcount()

    # reset_index(drop=True) yields a new frame, so the caller's frame is left
    # untouched and the result does not depend on how the input was indexed.
    new_feature_name_df = old_feature_name_df.reset_index(drop=True)
    # Assign positionally (to_numpy) so the counts line up row by row even if
    # the input index was not a default RangeIndex.
    new_feature_name_df['dup_cnt'] = dup_cnt.to_numpy()

    names = new_feature_name_df['column_name']
    suffixed = names + '_' + new_feature_name_df['dup_cnt'].astype(str)
    # Label-based, vectorised replacement for the row-wise lambda that indexed
    # the row Series by position (x[0], x[1]); only repeated names are renamed.
    new_feature_name_df['column_name'] = names.mask(
        new_feature_name_df['dup_cnt'] > 0, suffixed
    )
    return new_feature_name_df

def get_human_dataset(data_dir=r'C:\Users\arceu\Desktop\Machine-Learning\data\uci'):
    """Load the train/test feature matrices and label columns from ``data_dir``.

    NOTE(review): the file layout (``features.txt``, ``train/``, ``test/``)
    looks like the UCI Human Activity Recognition dataset -- confirm.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``features.txt`` plus the ``train`` and ``test``
        sub-directories. Defaults to the location the notebook was written
        against, so existing zero-argument calls keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The feature frames use the
        de-duplicated names from ``features.txt`` as column labels; the label
        frames have a single ``action`` column.
    """
    # Function-scope import keeps this block self-contained.
    from pathlib import Path

    root = Path(data_dir)
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    whitespace = r'\s+'

    feature_name_df = pd.read_csv(
        root / 'features.txt',
        sep=whitespace,
        header=None,
        names=['column_index', 'column_name'],
    )
    # features.txt may contain repeated names; they must be unique before they
    # can be used as DataFrame column labels.
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    feature_name = new_feature_name_df['column_name'].tolist()

    X_train = pd.read_csv(
        root / 'train' / 'X_train.txt', sep=whitespace, header=None, names=feature_name
    )
    # header=None is stated explicitly here too, matching the other reads.
    X_test = pd.read_csv(
        root / 'test' / 'X_test.txt', sep=whitespace, header=None, names=feature_name
    )
    y_train = pd.read_csv(
        root / 'train' / 'y_train.txt', sep=whitespace, header=None, names=['action']
    )
    y_test = pd.read_csv(
        root / 'test' / 'y_test.txt', sep=whitespace, header=None, names=['action']
    )
    return X_train, X_test, y_train, y_test

In [7]:
# Load the train/test feature matrices and their single-column label frames.
X_train, X_test, y_train, y_test = get_human_dataset()

# Baseline: gradient boosting with scikit-learn's default hyperparameters.
clf = GradientBoostingClassifier()
# The labels are (n, 1) DataFrames; flatten them to 1-D so scikit-learn does
# not warn about receiving a column-vector y.
clf.fit(X_train, y_train.values.ravel())
pred = clf.predict(X_test)
accuracy = accuracy_score(y_test.values.ravel(), pred)
print('Score of GBM: ', accuracy)

Score of GBM:  0.9382422802850356


### Random Forest보다도 결과가 좋은 것을 확인할 수 있다. 하이퍼 파라미터 튜닝을 통해 정확도를 조금 더 올려 보자.

In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 2 x 2 = 4 combinations.
params = {
    'n_estimators': [100, 500],
    'learning_rate': [0.05, 0.1],
}

# cv=2 keeps the search affordable; n_jobs=-1 runs the independent
# (combination, fold) fits in parallel without changing the results.
grid_cv = GridSearchCV(clf, param_grid=params, cv=2, n_jobs=-1)
# Flatten the (n, 1) label frame to 1-D so scikit-learn does not warn about
# receiving a column-vector y.
grid_cv.fit(X_train, y_train.values.ravel())
print(grid_cv.best_params_)
print(grid_cv.best_score_)

{'learning_rate': 0.05, 'n_estimators': 500}
0.9009793253536452


In [10]:
# Score the best estimator found by the grid search on the held-out test set.
best_model = grid_cv.best_estimator_
pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('Score of GBM: ', accuracy)

Score of GBM:  0.9385816084153377
