#### sklearn.ensemble.GradientBoostingClassifier

* class sklearn.ensemble.GradientBoostingClassifier(*, loss='log_loss', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

In [13]:
from sklearn.metrics import accuracy_score
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
warnings.filterwarnings('ignore')

# features.txt stores a feature index and a feature name on each line, separated by
# whitespace; load it into a DataFrame. The separator is a raw string so that the
# regex `\s+` is not parsed as an (invalid) string escape sequence.
feature_name_df = pd.read_csv('./datasets/human_activity/features.txt', sep=r'\s+', header=None,
                              names=['column_index', 'column_name'])
# Keep only the feature names (dropping the index column) as a plain list,
# then print the first 10 as a sample.
feature_name = feature_name_df['column_name'].tolist()
print('전체 피처명에서 10개만 추출:', feature_name[:10])

전체 피처명에서 10개만 추출: ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X']


In [14]:
def get_new_feature_name_df(old_feature_name_df):
    """Return a copy of the feature-name table with duplicate names made unique.

    The first occurrence of a name is kept unchanged; every later occurrence
    gets a ``-<k>`` suffix, where ``k`` is its 1-based repeat count
    (``'a', 'a', 'a'`` becomes ``'a', 'a-1', 'a-2'``).

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table with a ``column_name`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        The original columns plus ``dup_cnt`` (0 for the first occurrence of a
        name, 1 for the second, ...), with ``column_name`` de-duplicated.
    """
    # Running occurrence count of each name: 0, 1, 2, ... within each group.
    dup_cnt = old_feature_name_df.groupby('column_name').cumcount()
    feature_dup_df = pd.DataFrame(data=dup_cnt, columns=['dup_cnt']).reset_index()

    # Both frames expose the original row label as 'index'; it is the only shared
    # column, so the merge attaches each row's dup_cnt to that row.
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')

    # Label-based, vectorized rename. Positional access such as row[0] on a
    # string-labelled Series is deprecated in pandas, so columns are addressed by name.
    names = new_feature_name_df['column_name']
    counts = new_feature_name_df['dup_cnt']
    suffixed = names + '-' + counts.astype(str)
    # where() keeps `names` where the condition holds and takes `suffixed` elsewhere,
    # i.e. only rows with dup_cnt > 0 receive a suffix.
    new_feature_name_df['column_name'] = names.where(counts <= 0, suffixed)

    # The helper 'index' column was only needed as the merge key.
    return new_feature_name_df.drop(['index'], axis=1)

In [15]:
def get_human_dataset(data_dir='./datasets/human_activity'):
    """Load the human-activity train/test features and labels as DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default './datasets/human_activity'
        Directory containing ``features.txt``, ``X_train.txt``, ``X_test.txt``,
        ``y_train.txt`` and ``y_test.txt``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The feature frames use the
        de-duplicated feature names as columns; the label frames have a single
        ``action`` column.
    """
    from pathlib import Path

    data_dir = Path(data_dir)

    def _read_table(file_name, names):
        # Every data file is whitespace-delimited and has no header row.
        # The separator is a raw string so `\s+` is not treated as a string escape.
        return pd.read_csv(data_dir / file_name, sep=r'\s+', header=None, names=names)

    feature_name_df = _read_table('features.txt', ['column_index', 'column_name'])

    # features.txt may contain repeated feature names; make them unique so they
    # can be used as DataFrame column labels.
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    feature_name = new_feature_name_df['column_name'].tolist()

    # Feature matrices, with the de-duplicated feature names as column labels.
    X_train = _read_table('X_train.txt', feature_name)
    X_test = _read_table('X_test.txt', feature_name)

    # Label files, loaded as single-column DataFrames named 'action'.
    y_train = _read_table('y_train.txt', ['action'])
    y_test = _read_table('y_test.txt', ['action'])

    return X_train, X_test, y_train, y_test

# Load the train/test feature and label DataFrames into notebook-level variables.
X_train, X_test, y_train, y_test = get_human_dataset()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import time
import warnings


warnings.filterwarnings('ignore')
X_train, X_test, y_train, y_test = get_human_dataset()

# Time training, prediction and scoring with a monotonic clock; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
gb_clf = GradientBoostingClassifier(random_state=0)
# y_train is a single-column DataFrame; flatten it to the 1-D label array that
# fit() expects instead of relying on an implicit conversion (and its warning).
gb_clf.fit(X_train, y_train.values.ravel())
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print(gb_accuracy)
print(time.perf_counter() - start_time)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the activity labels as integers 0..n_classes-1.
# NOTE(review): presumably needed because the XGBoost classifier used afterwards
# expects class labels starting at 0 — confirm.
le = LabelEncoder()

# Learn the label -> integer mapping from the training labels only. np.ravel
# flattens the single-column label DataFrame into the 1-D array the encoder expects.
y_train = le.fit_transform(np.ravel(y_train))
pd.Series(y_train)

# Reuse the mapping learned on the training labels. Re-fitting on the test labels
# could assign different integers to the same class if the label sets differ.
y_test = le.transform(np.ravel(y_test))
pd.Series(y_test)



In [None]:
from xgboost import XGBClassifier

# Example model declaration.
model = XGBClassifier(n_estimators=500, learning_rate=0.2, max_depth=4, random_state=32)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)  # predicted class labels for the test set

# Accuracy between the true and the predicted labels. accuracy_score is declared as
# (y_true, y_pred), so the true labels go first, matching the GradientBoosting cell.
# NOTE(review): the original comments described 0/1 predictions and recorded the
# value 0.7847533632286996; both look like leftovers from a different (binary)
# example — confirm against an actual run on this dataset.
accuracy_score(y_test, y_pred)

In [None]:
# 코딩으로 경사하강법 구현

import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
# Fix the seed so the synthetic data set is reproducible.
np.random.seed(0)

# Synthetic data around the line y = 4X + 6: 100 points with X drawn uniformly
# from [0, 2) and standard-normal noise added to y.
X = np.random.rand(100, 1) * 2
y = 4 * X + 6 + np.random.randn(100, 1)

print(X.shape[0])
X


In [None]:
# Print the number of target values, then show the target array itself.
print(len(y))
y


In [None]:
# Scatter plot of the data: X on the horizontal axis, y on the vertical axis.
plt.scatter(X, y)

In [None]:
def get_weight_updates(w1, w0, X, y, learning_rate=0.01):
    """Compute one gradient-descent update step for a linear model ``X·w1ᵀ + w0``.

    The updates are the gradient of the mean squared error with respect to
    ``w1`` and ``w0``, scaled by ``learning_rate``:

        w1_update = -(2/N) * learning_rate * Xᵀ · (y - y_pred)
        w0_update = -(2/N) * learning_rate * 1ᵀ · (y - y_pred)

    Parameters
    ----------
    w1 : numpy.ndarray
        Current slope weight(s). Assumed shape (1, n_features) — TODO confirm
        against the caller.
    w0 : numpy.ndarray or float
        Current intercept.
    X : numpy.ndarray
        Input samples, one row per sample.
    y : numpy.ndarray
        Target values; ``len(y)`` is used as the sample count N.
        Assumed shape (N, 1) — TODO confirm.
    learning_rate : float, default 0.01
        Step-size factor applied to both updates.

    Returns
    -------
    tuple of numpy.ndarray
        ``(w1_update, w0_update)``. Presumably the caller subtracts these from
        the current weights — verify against the caller.
    """
    N = len(y)

    # Residuals between the targets and the current predictions.
    y_pred = np.dot(X, w1.T) + w0
    diff = y - y_pred

    # A column of ones lets the intercept update use the same dot-product form
    # as the slope update (it simply sums the residuals).
    w0_factors = np.ones((N, 1))

    w1_update = -(2 / N) * learning_rate * np.dot(X.T, diff)
    w0_update = -(2 / N) * learning_rate * np.dot(w0_factors.T, diff)

    return w1_update, w0_update
    