# XGBoost

In [1]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split

### 설정된 값들

In [2]:
# Directory the input arrays are read from, and directory results are written to.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

# File names of the training arrays inside DATA_IN_PATH.
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'

# Load the training data.
# The path is handed to np.load directly (instead of an open() handle that is
# never closed) so that np.load opens and closes the file itself.
train_q1_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_Q1_DATA_FILE))
train_q2_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_Q2_DATA_FILE))
train_labels = np.load(os.path.join(DATA_IN_PATH, TRAIN_LABEL_DATA_FILE))

In [3]:
# Pair the two question arrays along a new axis 1: one (q1, q2) pair per sample.
train_input = np.stack(
    [train_q1_data, train_q2_data],
    axis=1,
)

In [19]:
# Show the shape of the first-question array, then its first row.
print(train_q1_data.shape, train_q1_data[0], sep='\n')

(298526, 31)
[   3  638 1003    6   27    7  110   65  192 7808 2540 1461   12 6382
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0]


In [17]:
# Show the shape of the stacked question pairs, then the first pair.
print(train_input.shape, train_input[0], sep='\n')

(298526, 2, 31)
[[   3  638 1003    6   27    7  110   65  192 7808 2540 1461   12 6382
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [  58   30    5  438   14    7  110   65  192 7808 2540 1461   12 6382
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]]


### 훈련 셋과 평가 셋 나누기

In [20]:
# Hold out 20% of the samples for evaluation; random_state fixes the shuffle.
# NOTE: this rebinds train_input, so re-running the cell without reloading
# would split the already reduced training set again.
train_input, eval_input, train_label, eval_label = train_test_split(
    train_input,
    train_labels,
    test_size=0.2,
    random_state=4242,
)

In [24]:
import xgboost as xgb

### 모델 구성

In [30]:
# Each sample's two question rows are summed along axis 1 to form its feature
# row; features and labels are then wrapped in DMatrix objects.
train_data = xgb.DMatrix(
    data=train_input.sum(axis=1),
    label=train_label,
)  # training set
eval_data = xgb.DMatrix(
    data=eval_input.sum(axis=1),
    label=eval_label,
)  # evaluation set

# (DMatrix, display name) pairs for the sets to be evaluated during training.
data_list = [
    (train_data, 'train'),
    (eval_data, 'valid'),
]

In [33]:
# Settings handed to xgb.train.
params = {
    'objective': 'binary:logistic',  # logistic objective for the binary labels
    'eval_metric': 'rmse',  # root mean squared error, reported for each set in evals
}

# Train for at most 1000 boosting rounds, with early stopping set to 10 rounds.
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,
    evals=data_list,
    early_stopping_rounds=10,
)

[0]	train-rmse:0.48423	valid-rmse:0.48449
[1]	train-rmse:0.47389	valid-rmse:0.47457
[2]	train-rmse:0.46707	valid-rmse:0.46802
[3]	train-rmse:0.46255	valid-rmse:0.46381
[4]	train-rmse:0.45928	valid-rmse:0.46076
[5]	train-rmse:0.45673	valid-rmse:0.45834
[6]	train-rmse:0.45379	valid-rmse:0.45556
[7]	train-rmse:0.45220	valid-rmse:0.45423
[8]	train-rmse:0.45096	valid-rmse:0.45317
[9]	train-rmse:0.44952	valid-rmse:0.45197
[10]	train-rmse:0.44870	valid-rmse:0.45132
[11]	train-rmse:0.44792	valid-rmse:0.45068
[12]	train-rmse:0.44620	valid-rmse:0.44909
[13]	train-rmse:0.44501	valid-rmse:0.44800
[14]	train-rmse:0.44361	valid-rmse:0.44676
[15]	train-rmse:0.44324	valid-rmse:0.44654
[16]	train-rmse:0.44287	valid-rmse:0.44629
[17]	train-rmse:0.44248	valid-rmse:0.44607
[18]	train-rmse:0.44223	valid-rmse:0.44595
[19]	train-rmse:0.44176	valid-rmse:0.44565
[20]	train-rmse:0.44141	valid-rmse:0.44539
[21]	train-rmse:0.44036	valid-rmse:0.44459
[22]	train-rmse:0.43945	valid-rmse:0.44396
[23]	train-rmse:0.439

### 테스트 데이터 가져오기

In [37]:
# File names of the test arrays inside DATA_IN_PATH.
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

# The path is handed to np.load directly (instead of an open() handle that is
# never closed) so that np.load opens and closes the file itself.
# NOTE(review): allow_pickle=True permits unpickling object arrays, which can
# execute arbitrary code -- only load these files from a trusted source.
test_q1_data = np.load(os.path.join(DATA_IN_PATH, TEST_Q1_DATA_FILE), allow_pickle=True)
test_q2_data = np.load(os.path.join(DATA_IN_PATH, TEST_Q2_DATA_FILE), allow_pickle=True)
test_id_data = np.load(os.path.join(DATA_IN_PATH, TEST_ID_DATA_FILE), allow_pickle=True)

### 예측하기

In [38]:
# Pair the two test question arrays along a new axis 1, sum each pair along
# that axis to get the feature rows, and score them with the trained booster.
test_input = np.stack(
    [test_q1_data, test_q2_data],
    axis=1,
)
test_data = xgb.DMatrix(data=test_input.sum(axis=1))
test_predict = bst.predict(test_data)

In [40]:
# Create the output directory if it is missing. exist_ok=True replaces the
# separate exists() check, which could race with another process creating it.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test id with its predicted value, written without the index column.
output = pd.DataFrame({'test_id': test_id_data, 'is_duplicate': test_predict})
output.to_csv(os.path.join(DATA_OUT_PATH, 'simple_xgb.csv'), index=False)