# XGBoost

In [1]:
import pandas as pd
import numpy as np
import os

import json

from sklearn.model_selection  import train_test_split

### 설정된 값들

In [2]:
# Directories for the preprocessed input arrays and for the generated output files.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

# File names of the preprocessed training arrays inside DATA_IN_PATH.
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'


# Load the training data.
# The path is handed to np.load directly (instead of an open() handle that is
# never explicitly closed), so NumPy opens and closes each file itself.
train_q1_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_Q1_DATA_FILE))
train_q2_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_Q2_DATA_FILE))
train_labels = np.load(os.path.join(DATA_IN_PATH, TRAIN_LABEL_DATA_FILE))

In [3]:
# Pair up the two questions: stacking along a new axis 1 gives each sample a (q1, q2) pair.
train_input = np.stack([train_q1_data, train_q2_data], axis=1)

In [4]:
# Sanity check: the second dimension should be 2 (one slot per question of the pair).
print(np.shape(train_input))

(298526, 2, 31)


### 훈련 셋과 평가 셋 나누기

In [5]:
# Hold out 20% of the question pairs for evaluation; the fixed seed keeps the split reproducible.
# NOTE: this rebinds train_input, so re-running this cell on its own splits the
# already-reduced training set again. Re-run from the stacking cell instead.
train_input, eval_input, train_label, eval_label = train_test_split(
    train_input,
    train_labels,
    test_size=0.2,
    random_state=4242,
)

In [6]:
# xg부스트를 불러온다
import xgboost as xgb

### 모델 구성

In [7]:
# Wrap both splits in DMatrix, the data container used by the xgb library.
# Each question pair is collapsed into a single feature row by summing the two
# questions element-wise along axis 1.
train_data = xgb.DMatrix(np.sum(train_input, axis=1), label=train_label)  # training split
eval_data = xgb.DMatrix(np.sum(eval_input, axis=1), label=eval_label)  # evaluation split

# Watch list for training: (DMatrix, display name) tuples; the names prefix the logged metrics.
data_list = [
    (train_data, 'train'),
    (eval_data, 'valid'),
]

In [8]:
# Build and train the model

# Parameters handed to the XGBoost trainer.
params = {
    'objective': 'binary:logistic',  # logistic objective for the binary duplicate / not-duplicate label
    'eval_metric': 'rmse',  # root mean square error is reported for every watch-list entry
}

# num_boost_round: maximum number of boosting rounds to run.
# early_stopping_rounds: stop early once the validation metric has not improved for 10 rounds.
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,
    evals=data_list,
    early_stopping_rounds=10,
)

[0]	train-rmse:0.48380	valid-rmse:0.48426
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[1]	train-rmse:0.47353	valid-rmse:0.47437
[2]	train-rmse:0.46706	valid-rmse:0.46841
[3]	train-rmse:0.46185	valid-rmse:0.46353
[4]	train-rmse:0.45859	valid-rmse:0.46054
[5]	train-rmse:0.45584	valid-rmse:0.45806
[6]	train-rmse:0.45367	valid-rmse:0.45605
[7]	train-rmse:0.45105	valid-rmse:0.45359
[8]	train-rmse:0.44967	valid-rmse:0.45240
[9]	train-rmse:0.44834	valid-rmse:0.45125
[10]	train-rmse:0.44739	valid-rmse:0.45047
[11]	train-rmse:0.44521	valid-rmse:0.44841
[12]	train-rmse:0.44443	valid-rmse:0.44774
[13]	train-rmse:0.44366	valid-rmse:0.44716
[14]	train-rmse:0.44260	valid-rmse:0.44616
[15]	train-rmse:0.44173	valid-rmse:0.44548
[16]	train-rmse:0.44135	valid-rmse:0.44520
[17]	train-rmse:0.44099	valid-rmse:0.44493
[18]	train-rmse:0.44062	valid-rmse:0.44468
[19]	train-rmse:0.43883	valid-rmse:0.44307
[20]	

[186]	train-rmse:0.40085	valid-rmse:0.42457
[187]	train-rmse:0.40066	valid-rmse:0.42454
[188]	train-rmse:0.40062	valid-rmse:0.42453
[189]	train-rmse:0.40058	valid-rmse:0.42452
[190]	train-rmse:0.40049	valid-rmse:0.42451
[191]	train-rmse:0.40040	valid-rmse:0.42448
[192]	train-rmse:0.40025	valid-rmse:0.42439
[193]	train-rmse:0.40006	valid-rmse:0.42436
[194]	train-rmse:0.39988	valid-rmse:0.42431
[195]	train-rmse:0.39967	valid-rmse:0.42426
[196]	train-rmse:0.39958	valid-rmse:0.42425
[197]	train-rmse:0.39957	valid-rmse:0.42424
[198]	train-rmse:0.39956	valid-rmse:0.42424
[199]	train-rmse:0.39939	valid-rmse:0.42421
[200]	train-rmse:0.39929	valid-rmse:0.42420
[201]	train-rmse:0.39925	valid-rmse:0.42419
[202]	train-rmse:0.39925	valid-rmse:0.42420
[203]	train-rmse:0.39919	valid-rmse:0.42418
[204]	train-rmse:0.39916	valid-rmse:0.42417
[205]	train-rmse:0.39893	valid-rmse:0.42410
[206]	train-rmse:0.39874	valid-rmse:0.42405
[207]	train-rmse:0.39859	valid-rmse:0.42404
[208]	train-rmse:0.39818	valid-r

[373]	train-rmse:0.37714	valid-rmse:0.42012
[374]	train-rmse:0.37696	valid-rmse:0.42007
[375]	train-rmse:0.37680	valid-rmse:0.42005
[376]	train-rmse:0.37661	valid-rmse:0.41998
[377]	train-rmse:0.37640	valid-rmse:0.41997
[378]	train-rmse:0.37624	valid-rmse:0.41995
[379]	train-rmse:0.37617	valid-rmse:0.41993
[380]	train-rmse:0.37613	valid-rmse:0.41993
[381]	train-rmse:0.37604	valid-rmse:0.41994
[382]	train-rmse:0.37602	valid-rmse:0.41993
[383]	train-rmse:0.37598	valid-rmse:0.41993
[384]	train-rmse:0.37589	valid-rmse:0.41990
[385]	train-rmse:0.37582	valid-rmse:0.41989
[386]	train-rmse:0.37581	valid-rmse:0.41989
[387]	train-rmse:0.37573	valid-rmse:0.41988
[388]	train-rmse:0.37569	valid-rmse:0.41987
[389]	train-rmse:0.37565	valid-rmse:0.41988
[390]	train-rmse:0.37546	valid-rmse:0.41982
[391]	train-rmse:0.37541	valid-rmse:0.41982
[392]	train-rmse:0.37520	valid-rmse:0.41977
[393]	train-rmse:0.37517	valid-rmse:0.41977
[394]	train-rmse:0.37495	valid-rmse:0.41976
[395]	train-rmse:0.37493	valid-r

### 테스트 데이터 가져오기

In [10]:
# File names of the preprocessed test arrays inside DATA_IN_PATH.
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

# Load the test data.
# The path is handed to np.load directly (instead of an open() handle that is
# never explicitly closed), so NumPy opens and closes each file itself.
# SECURITY NOTE: allow_pickle=True unpickles object arrays and can execute arbitrary
# code from a malicious file. Only load .npy files you produced yourself, and drop
# the flag if these arrays turn out to be plain numeric (TODO confirm their dtypes).
test_q1_data = np.load(os.path.join(DATA_IN_PATH, TEST_Q1_DATA_FILE), allow_pickle=True)
test_q2_data = np.load(os.path.join(DATA_IN_PATH, TEST_Q2_DATA_FILE), allow_pickle=True)
test_id_data = np.load(os.path.join(DATA_IN_PATH, TEST_ID_DATA_FILE), allow_pickle=True)

### 예측하기

In [11]:
# Build the test features the same way as the training features: stack the two
# questions into pairs, then sum each pair element-wise along axis 1.
test_input = np.stack([test_q1_data, test_q2_data], axis=1)
test_data = xgb.DMatrix(np.sum(test_input, axis=1))

# Score the test pairs with the trained booster.
test_predict = bst.predict(test_data)

In [12]:
# Create the output directory if needed. exist_ok=True avoids the check-then-create
# race of a separate os.path.exists() test and makes the cell safe to re-run.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Write one row per test pair: its id and the model's prediction for is_duplicate.
output = pd.DataFrame({'test_id': test_id_data, 'is_duplicate': test_predict})
output.to_csv(os.path.join(DATA_OUT_PATH, 'simple_xgb.csv'), index=False)