In [2]:
import numpy as np
import pandas as pd
import os
import json
from sklearn.model_selection import train_test_split

In [3]:
# Directories holding the preprocessed inputs and the generated outputs.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

# File names of the saved training arrays (question 1, question 2, labels).
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'

# Hand the path straight to np.load so NumPy opens and closes the file itself;
# wrapping the path in a bare open() left every file handle unclosed.
train_q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
train_q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
train_labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)


In [4]:
# Pair the two question arrays along a new axis 1, so each sample holds
# its question-1 row followed by its question-2 row.
train_input = np.stack([train_q1_data, train_q2_data], axis=1)

In [5]:
# Sanity check: show the shapes of the two question arrays that were stacked.
print(train_q1_data.shape, train_q2_data.shape)

(298526, 31) (298526, 31)


In [6]:
# Show the stacked shape; axis 1 is the question-1 / question-2 pair axis.
print(train_input.shape)

(298526, 2, 31)


In [7]:
# Inspect the first stacked sample: row 0 comes from train_q1_data, row 1 from train_q2_data.
print(train_input[0])

[[   2   11    1 3956   10 9444 3340 4020    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   2   11    1 3956   10 9444 3340 4020    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]]


In [9]:
# Hold out 20% of the question pairs for validation; the fixed seed keeps the
# split reproducible.
# The stacked input is rebuilt from the untouched question arrays instead of
# being read from `train_input`, because this cell rebinds `train_input` to the
# training portion only: re-running it would otherwise feed the already-reduced
# array back in next to the full `train_labels`.
train_input, eval_input, train_label, eval_label = train_test_split(
    np.stack((train_q1_data, train_q2_data), axis=1),
    train_labels,
    test_size=0.2,
    random_state=4242,
)

Install XGBoost before running the next cell: `conda install -c anaconda py-xgboost`

In [10]:
import xgboost as xgb

In [12]:
# Summing over axis 1 adds the question-1 and question-2 rows element-wise,
# giving one flat feature row per sample for XGBoost.
train_data = xgb.DMatrix(data=train_input.sum(axis=1), label=train_label)
eval_data = xgb.DMatrix(data=eval_input.sum(axis=1), label=eval_label)

# (DMatrix, display name) pairs reported during training.
data_list = list(zip((train_data, eval_data), ('train', 'valid')))

In [13]:
# Peek at the first (DMatrix, name) pair in the evaluation list.
print(data_list[0])

(<xgboost.core.DMatrix object at 0x000001E60EDA8D00>, 'train')


In [15]:
# Binary logistic objective; progress is reported as RMSE (root mean squared error).
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
}

# Train for at most 1000 boosting rounds, stopping early after 10 rounds
# without improvement. Per the XGBoost docs, the last entry of `evals`
# (here 'valid') drives early stopping.
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,
    evals=data_list,
    early_stopping_rounds=10,
)

[0]	train-rmse:0.48758	valid-rmse:0.48773
[1]	train-rmse:0.47938	valid-rmse:0.47973
[2]	train-rmse:0.47332	valid-rmse:0.47365
[3]	train-rmse:0.46835	valid-rmse:0.46898
[4]	train-rmse:0.46485	valid-rmse:0.46566
[5]	train-rmse:0.46223	valid-rmse:0.46311
[6]	train-rmse:0.45989	valid-rmse:0.46064
[7]	train-rmse:0.45780	valid-rmse:0.45873
[8]	train-rmse:0.45637	valid-rmse:0.45742
[9]	train-rmse:0.45502	valid-rmse:0.45612
[10]	train-rmse:0.45367	valid-rmse:0.45484
[11]	train-rmse:0.45252	valid-rmse:0.45369
[12]	train-rmse:0.45165	valid-rmse:0.45282
[13]	train-rmse:0.45089	valid-rmse:0.45212
[14]	train-rmse:0.45006	valid-rmse:0.45133
[15]	train-rmse:0.44950	valid-rmse:0.45076
[16]	train-rmse:0.44812	valid-rmse:0.44950
[17]	train-rmse:0.44771	valid-rmse:0.44904
[18]	train-rmse:0.44729	valid-rmse:0.44865
[19]	train-rmse:0.44706	valid-rmse:0.44839
[20]	train-rmse:0.44656	valid-rmse:0.44789
[21]	train-rmse:0.44633	valid-rmse:0.44767
[22]	train-rmse:0.44606	valid-rmse:0.44744
[23]	train-rmse:0.445

In [16]:
# File names of the saved test arrays (question 1, question 2, row ids).
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

# Hand the path straight to np.load so NumPy opens and closes the file itself;
# wrapping the path in a bare open() left every file handle unclosed.
test_q1_data = np.load(DATA_IN_PATH + TEST_Q1_DATA_FILE)
test_q2_data = np.load(DATA_IN_PATH + TEST_Q2_DATA_FILE)
# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code. Only load an id file you produced or trust.
test_id_data = np.load(DATA_IN_PATH + TEST_ID_DATA_FILE, allow_pickle=True)

In [17]:
# Build the test features the same way as the training features:
# stack the two question arrays on axis 1, then sum across that axis.
test_input = np.stack([test_q1_data, test_q2_data], axis=1)
test_data = xgb.DMatrix(data=test_input.sum(axis=1))

# Score every test pair with the trained booster.
test_predict = bst.predict(test_data)

In [18]:
# Create the output directory if needed. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test id with its predicted value, written without the index column.
output = pd.DataFrame({'test_id': test_id_data, 'is_duplicate': test_predict})
output.to_csv(DATA_OUT_PATH + 'simple_xgb.csv', index=False)