# XGBoost
- 부스팅
    - 앙상블 기법 중 하나로, 여러 개의 모델을 순차적으로 학습시키는 방식. 각 모델의 결과를 단순히 하나씩 취합하는 것이 아니라, 이전 모델이 학습 후 잘못 예측한 부분에 가중치를 부여하여 다음 모델이 그 부분을 다시 학습하도록 한다.
- 트리 부스팅
    - 여러개의 의사결정 트리를 사용하지만 단순히 결과를 평균 내는 것이 아니라 결과를 보고 오답에 대해 가중치를 부여함
    - 가중치가 적용된 오답에 집중하여 정답이 될 수 있도록 결과를 만들고, 그 결과에서 다시 오답을 찾아 같은 작업을 반복적으로 수행함
- XGBoost
    - 트리 부스팅 방식에 경사 하강법을 적용하여 최적화하는 방법.
    - 의사결정 트리를 구성할때 병렬처리를 통해 빠른 시간에 학습이 가능함

In [1]:
import pandas as pd
import numpy as np
import os

import json

from sklearn.model_selection  import train_test_split

### 설정된 값들 & 전처리된 데이터 로드

In [3]:
# Input directory holding the preprocessed Quora arrays, and the directory for results.
DATA_IN_PATH = "../dataset/quora/"
DATA_OUT_PATH = "./data_out/"

# File names of the saved training arrays: question 1, question 2, and labels.
TRAIN_Q1_DATA_FILE = "train_q1.npy"
TRAIN_Q2_DATA_FILE = "train_q2.npy"
TRAIN_LABEL_DATA_FILE = "train_label.npy"

In [4]:
# Load the training arrays (question 1, question 2, labels).
# The path is handed straight to np.load so NumPy opens and closes the file itself,
# rather than leaving a handle from a bare open() call that is never closed.
train_q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
train_q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
train_labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)


In [5]:
# Pair the two question arrays along a new axis 1 so each sample holds both questions.
train_input = np.stack([train_q1_data, train_q2_data], axis=1)

In [8]:
# Show the first sample: one row per question (question 1 first, then question 2).
train_input[0]

array([[   3,   44,    7,   70,    8,  608,   48,    1, 1732, 2174, 8711,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [  16,   21, 2174,  608,  130,  550,   45,    5,  744,  113,    1,
        2174,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [9]:
# Check the dimensions of the stacked input array.
train_input.shape

(298526, 2, 31)

### 훈련 셋과 평가 셋 나누기

In [10]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
# Note that train_input is rebound here to the training portion only.
train_input, eval_input, train_label, eval_label = train_test_split(
    train_input,
    train_labels,
    test_size=0.2,
    random_state=4242,
)



In [13]:
import xgboost as xgb

### 모델 구성

In [18]:
# DMatrix 형태로 만들기
# Wrap features and labels in DMatrix objects for XGBoost.
# The two question rows of each sample are added element-wise (sum over axis 1),
# giving a single feature vector per sample.
train_data = xgb.DMatrix(data=train_input.sum(axis=1), label=train_label)  # training set
eval_data = xgb.DMatrix(data=eval_input.sum(axis=1), label=eval_label)  # evaluation set


In [17]:
# Check the feature shape after summing the question pair over axis 1.
train_input.sum(axis=1).shape

(238820, 31)

In [20]:
# Evaluation sets as (DMatrix, name) pairs; the names label the metrics logged in training.
data_list = [
    (train_data, 'train'),
    (eval_data, 'valid'),
]

In [21]:
# Display the (DMatrix, name) pairs.
data_list

[(<xgboost.core.DMatrix at 0x1a2b9919b0>, 'train'),
 (<xgboost.core.DMatrix at 0x1a2b991978>, 'valid')]

In [22]:
# Parameters handed to the XGBoost trainer.
params = {
    'objective': 'binary:logistic',  # binary classification with a logistic objective
    'eval_metric': 'rmse',  # evaluate with root mean square error
}

# Train for at most 1000 rounds, logging the metric on every set in data_list and
# stopping once the validation metric has not improved for 10 consecutive rounds.
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,
    evals=data_list,
    early_stopping_rounds=10,
)


[0]	train-rmse:0.483681	valid-rmse:0.484118
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[1]	train-rmse:0.473433	valid-rmse:0.474263
[2]	train-rmse:0.466983	valid-rmse:0.468293
[3]	train-rmse:0.462514	valid-rmse:0.4639
[4]	train-rmse:0.458652	valid-rmse:0.460363
[5]	train-rmse:0.455964	valid-rmse:0.45786
[6]	train-rmse:0.453817	valid-rmse:0.455901
[7]	train-rmse:0.451922	valid-rmse:0.454147
[8]	train-rmse:0.449551	valid-rmse:0.451959
[9]	train-rmse:0.448466	valid-rmse:0.451044
[10]	train-rmse:0.446874	valid-rmse:0.449566
[11]	train-rmse:0.445932	valid-rmse:0.44878
[12]	train-rmse:0.445111	valid-rmse:0.448133
[13]	train-rmse:0.44365	valid-rmse:0.446897
[14]	train-rmse:0.442939	valid-rmse:0.446397
[15]	train-rmse:0.44124	valid-rmse:0.444814
[16]	train-rmse:0.440675	valid-rmse:0.444313
[17]	train-rmse:0.440265	valid-rmse:0.443979
[18]	train-rmse:0.439645	valid-rmse:0.443501
[19]	train-rmse:

[179]	train-rmse:0.40309	valid-rmse:0.424339
[180]	train-rmse:0.402943	valid-rmse:0.424279
[181]	train-rmse:0.40261	valid-rmse:0.424074
[182]	train-rmse:0.402488	valid-rmse:0.424032
[183]	train-rmse:0.402311	valid-rmse:0.423975
[184]	train-rmse:0.402164	valid-rmse:0.423949
[185]	train-rmse:0.401972	valid-rmse:0.423931
[186]	train-rmse:0.401858	valid-rmse:0.42392
[187]	train-rmse:0.401723	valid-rmse:0.423884
[188]	train-rmse:0.40149	valid-rmse:0.423831
[189]	train-rmse:0.40143	valid-rmse:0.423824
[190]	train-rmse:0.401387	valid-rmse:0.423816
[191]	train-rmse:0.401266	valid-rmse:0.4238
[192]	train-rmse:0.401242	valid-rmse:0.423795
[193]	train-rmse:0.401235	valid-rmse:0.423792
[194]	train-rmse:0.401184	valid-rmse:0.423766
[195]	train-rmse:0.40091	valid-rmse:0.42367
[196]	train-rmse:0.40089	valid-rmse:0.423656
[197]	train-rmse:0.400672	valid-rmse:0.423614
[198]	train-rmse:0.400538	valid-rmse:0.423619
[199]	train-rmse:0.400341	valid-rmse:0.423542
[200]	train-rmse:0.400104	valid-rmse:0.42349

[359]	train-rmse:0.379994	valid-rmse:0.419497
[360]	train-rmse:0.379793	valid-rmse:0.419506
[361]	train-rmse:0.379573	valid-rmse:0.419462
[362]	train-rmse:0.379429	valid-rmse:0.419466
[363]	train-rmse:0.379332	valid-rmse:0.419465
[364]	train-rmse:0.379169	valid-rmse:0.419488
[365]	train-rmse:0.379076	valid-rmse:0.419483
[366]	train-rmse:0.378843	valid-rmse:0.419414
[367]	train-rmse:0.378785	valid-rmse:0.419407
[368]	train-rmse:0.378697	valid-rmse:0.419404
[369]	train-rmse:0.378692	valid-rmse:0.419399
[370]	train-rmse:0.378683	valid-rmse:0.419397
[371]	train-rmse:0.378662	valid-rmse:0.419386
[372]	train-rmse:0.37865	valid-rmse:0.419372
[373]	train-rmse:0.378639	valid-rmse:0.419366
[374]	train-rmse:0.378504	valid-rmse:0.419368
[375]	train-rmse:0.378432	valid-rmse:0.419381
[376]	train-rmse:0.378429	valid-rmse:0.41938
[377]	train-rmse:0.378222	valid-rmse:0.419375
[378]	train-rmse:0.378051	valid-rmse:0.419333
[379]	train-rmse:0.37787	valid-rmse:0.419284
[380]	train-rmse:0.377684	valid-rmse:

[538]	train-rmse:0.360193	valid-rmse:0.416666
[539]	train-rmse:0.360118	valid-rmse:0.416678
[540]	train-rmse:0.359977	valid-rmse:0.416706
[541]	train-rmse:0.35982	valid-rmse:0.416683
[542]	train-rmse:0.35971	valid-rmse:0.416695
[543]	train-rmse:0.359643	valid-rmse:0.416703
[544]	train-rmse:0.35957	valid-rmse:0.416692
Stopping. Best iteration:
[534]	train-rmse:0.360431	valid-rmse:0.41664



### 테스트 데이터 가져오기

In [35]:
# File names of the saved test arrays: question 1, question 2, and ids.
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

# Load the test question arrays. The path is handed directly to np.load so NumPy
# opens and closes the file itself (no unclosed handle from a bare open() call).
test_q1_data = np.load(DATA_IN_PATH + TEST_Q1_DATA_FILE)
test_q2_data = np.load(DATA_IN_PATH + TEST_Q2_DATA_FILE)

# The saved id array (TEST_ID_DATA_FILE) is not loaded; sequential ids are generated
# instead. They are sized from the loaded data rather than a hard-coded row count,
# so the ids always line up one-to-one with the test samples.
test_id_data = np.arange(len(test_q1_data))

### 예측하기

In [30]:
# Build the test features the same way as the training features: stack the question
# pair on axis 1, then sum over that axis before wrapping the result in a DMatrix.
test_input = np.stack([test_q1_data, test_q2_data], axis=1)
test_data = xgb.DMatrix(data=test_input.sum(axis=1))

# NOTE(review): depending on the installed xgboost version, predict() may use every
# boosted round rather than the best early-stopped iteration - confirm.
test_predict = bst.predict(test_data)

In [36]:
# Create the output directory if it is missing. exist_ok=True replaces the separate
# existence check, which could fail if the directory appeared between check and create.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test sample: its id and the model's predicted duplicate score.
output = pd.DataFrame({'test_id': test_id_data, 'is_duplicate': test_predict})
output.to_csv(DATA_OUT_PATH + 'simple_xgb.csv', index=False)

In [37]:
# Check the dimensions of the stacked test input.
test_input.shape

(2345796, 2, 31)

In [38]:
# Check that the output table has one row per test sample and two columns.
output.shape

(2345796, 2)