# Quora Question Pairs 유사도 - XGBoost
- 데이터 전처리 : quora_preprocessing.ipynb

## 구글 드라이브 마운트

In [18]:
# Attach Google Drive to the Colab runtime; the dataset path used later in this
# notebook lives under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 라이브러리 임포트

In [19]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection  import train_test_split
import xgboost as xgb

## 학습 데이터 로드

In [20]:
# Directory on the mounted Drive that holds the Quora .npy arrays
# (presumably produced by quora_preprocessing.ipynb — confirm).
DATA_PATH = '/content/drive/MyDrive/nlpdata/quora/'

# Give np.load the path instead of an open() handle: NumPy then opens and
# closes the file itself, so no file descriptors are left dangling.
train_q1_data = np.load(DATA_PATH + 'train_q1.npy')
train_q2_data = np.load(DATA_PATH + 'train_q2.npy')

# Pair the two question arrays along a new axis 1, so X[:, 0] is question 1
# and X[:, 1] is question 2 for every sample.
X = np.stack((train_q1_data, train_q2_data), axis=1)
y = np.load(DATA_PATH + 'train_label.npy')

## 훈련 셋과 평가 셋 나누기

In [21]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 모델 구성

In [22]:
# Build one DMatrix per split. Summing over axis 1 collapses the stacked
# (question 1, question 2) pair into a single feature row per sample.
train_data, test_data = (
    xgb.DMatrix(pairs.sum(axis=1), label=labels)
    for pairs, labels in ((X_train, y_train), (X_test, y_test))
)

# (DMatrix, name) pairs reported during training, in this order.
data_list = [
    (train_data, 'train'),
    (test_data, 'test'),
]

In [23]:
# Booster settings: logistic objective for the binary duplicate label, with
# RMSE reported on every evaluation set.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
}

# Train for at most 100 rounds; training stops early once the 'test' metric
# has not improved for 10 consecutive rounds (see the training log below).
bst = xgb.train(
    params,
    train_data,
    num_boost_round=100,
    evals=data_list,
    early_stopping_rounds=10,
)

[0]	train-rmse:0.478236	test-rmse:0.478762
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 10 rounds.
[1]	train-rmse:0.465608	test-rmse:0.466228
[2]	train-rmse:0.457886	test-rmse:0.458722
[3]	train-rmse:0.452842	test-rmse:0.453982
[4]	train-rmse:0.449562	test-rmse:0.450801
[5]	train-rmse:0.447257	test-rmse:0.448653
[6]	train-rmse:0.444446	test-rmse:0.44597
[7]	train-rmse:0.442999	test-rmse:0.444649
[8]	train-rmse:0.4413	test-rmse:0.443134
[9]	train-rmse:0.440273	test-rmse:0.442274
[10]	train-rmse:0.438797	test-rmse:0.440924
[11]	train-rmse:0.438153	test-rmse:0.440409
[12]	train-rmse:0.437116	test-rmse:0.43954
[13]	train-rmse:0.436382	test-rmse:0.438935
[14]	train-rmse:0.435964	test-rmse:0.438636
[15]	train-rmse:0.435485	test-rmse:0.438263
[16]	train-rmse:0.433116	test-rmse:0.436234
[17]	train-rmse:0.432833	test-rmse:0.436064
[18]	train-rmse:0.432391	test-rmse:0.435759
[19]	train-rmse:0.431282	test-rmse:

## 예측하기

In [25]:
# Make np.load default to allow_pickle=True so object-dtype .npy files can be read.
#
# SECURITY NOTE: unpickling can execute arbitrary code. Only load .npy files
# that you produced yourself or otherwise trust.
#
# The guard makes this cell safe to re-run: without it, a second run would
# rebind np_load_old to the wrapper itself, and every later np.load call would
# fail with "got multiple values for keyword argument 'allow_pickle'".
if not getattr(np.load, '_allow_pickle_patch', False):
    # Keep a handle on the original loader so it can be restored later
    # with `np.load = np_load_old`.
    np_load_old = np.load

    def _load_allowing_pickle(*args, _original=np_load_old, **kwargs):
        """Call the original np.load, defaulting allow_pickle to True."""
        # setdefault (rather than a hard-coded keyword) lets callers still
        # pass allow_pickle explicitly without a duplicate-keyword error.
        kwargs.setdefault('allow_pickle', True)
        return _original(*args, **kwargs)

    # Marker attribute used by the guard above to detect an existing patch.
    _load_allowing_pickle._allow_pickle_patch = True
    np.load = _load_allowing_pickle

In [26]:
# Load the test-set arrays. Passing the path (not an open() handle) lets NumPy
# open and close each file itself, so no file descriptors are leaked.
# allow_pickle is intentionally not passed here: np.load is wrapped earlier in
# this notebook to supply it, and passing it twice could raise a TypeError.
test_q1_data = np.load(DATA_PATH + 'test_q1.npy')
test_q2_data = np.load(DATA_PATH + 'test_q2.npy')
test_id_data = np.load(DATA_PATH + 'test_id.npy')

# Mirror the training features: stack the question pair on axis 1, then sum
# over that axis to get one feature row per pair.
test_input = np.stack((test_q1_data, test_q2_data), axis=1)
# NOTE: this rebinds test_data, replacing the labelled evaluation DMatrix
# created before training.
test_data = xgb.DMatrix(test_input.sum(axis=1))

# With the 'binary:logistic' objective the booster outputs probabilities.
pred = bst.predict(test_data)

In [27]:
# Assemble the submission table (one row per test pair: id and predicted
# score) and write it next to the input data, without the index column.
output = pd.DataFrame(
    data={
        'test_id': test_id_data,
        'is_duplicate': pred,
    },
)
output.to_csv(path_or_buf=DATA_PATH + 'quora_xgb.csv', index=False)

## Kaggle에 결과 제출 및 스코어 확인
- https://www.kaggle.com/c/quora-question-pairs