In [15]:
import sys
import tensorflow.compat.v1 as tf
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split

import json

# Initial global var

In [16]:
## 미리 Global 변수를 지정하자. 파일 명, 파일 위치, 디렉토리 등이 있다.

DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
DATA_CONFIGS = 'data_configs.json'

## 학습에 필요한 파라메터들에 대해서 지정하는 부분이다.

BATCH_SIZE = 16
EPOCH = 2
HIDDEN = 64
BUFFER_SIZE = 10000

NUM_LAYERS = 3
DROPOUT_RATIO = 0.3

TEST_SPLIT = 0.1
RNG_SEED = 13371447
EMBEDDING_DIM = 128
MAX_SEQ_LEN = 31

# Load Dataset

In [17]:
## 데이터를 불러오는 부분이다. 효과적인 데이터 불러오기를 위해, 미리 넘파이 형태로 저장시킨 데이터를 로드한다.

q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = None

with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as f:
    prepro_configs = json.load(f)

In [18]:
VOCAB_SIZE = prepro_configs['vocab_size']

# Split train and test dataset

In [19]:
# 각의 길이를 측정한다.
q1_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q1_data])
q2_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q2_data])

In [20]:
## 데이터를 나누어 저장하자. sklearn의 train_test_split을 사용하면 유용하다. 하지만, 쿼라 데이터의 경우는
## 입력이 1개가 아니라 2개이다. 따라서, np.stack을 사용하여 두개를 하나로 쌓은다음 활용하여 분류한다.

# 학습데이터와 평가데이터를 나누기 위해 합친다.
X = np.stack((q1_data, q2_data), axis=1)
y = labels
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

# 나눴으므로 다시 나눈다.
train_Q1 = train_X[:,0]
train_Q2 = train_X[:,1]
test_Q1 = test_X[:,0]
test_Q2 = test_X[:,1]

In [21]:
# 매핑 함수
def rearrange(base, hypothesis, labels):
    features = {"base": base, "hypothesis": hypothesis}
    return features, labels

# 학습 데이터 입력 함수
def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
    dataset = dataset.shuffle(buffer_size=len(train_Q1))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    dataset = dataset.repeat(EPOCH)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

# 평가 데이터 입력 함수
def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_Q1, test_Q2, test_y))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

# Model setup

In [22]:
# 모델 함수

def Malstm(features, labels, mode):
        
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
           
    # 입력값을 임베딩 벡터로 바꾼다.    
    embedding = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    
    base_embedded_matrix = embedding(features['base'])
    hypothesis_embedded_matrix = embedding(features['hypothesis'])
    
    # 2개의 LSTM층 정의
    
    # Question 1에 대한 LSTM모델
    
    # 정방향 LSTM층 정의
    q_lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = 64, activation = tf.nn.tanh)
    # 역방향 LSTM층 정의
    q_lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = 64, activation = tf.nn.tanh)
    
    # 두개의 데이터를 적용한다.
    # bidirectional_dynamic_rnn 함수를 쓰면 양방향 LSTM을 구현할 수 있다.
    _, q_output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw = q_lstm_fw_cell,
                                                       cell_bw = q_lstm_bw_cell,
                                                       inputs = base_embedded_matrix,
                                                       dtype = tf.float32,
                                                      scope='query')
    # 마지막 state 값을 뽑아 추출한다
    q_final_state = tf.concat([q_output_states[0].h, q_output_states[1].h], axis=1)

    # Question 2 (유사한 쿼리)에 대한 LSTM모델
    # 첫번째와 동일한 방법으로 생성
    s_lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = 64, activation = tf.nn.tanh)
    s_lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = 64, activation = tf.nn.tanh)
    _, s_output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw = s_lstm_fw_cell,
                                                       cell_bw = s_lstm_bw_cell,
                                                       inputs = hypothesis_embedded_matrix,
                                                        dtype = tf.float32,
                                                      scope='sim_query')
    #LSTM의 마지막 state 값을 추출한다.
    sim_final_state = tf.concat([s_output_states[0].h, s_output_states[1].h], axis=1)
        
#     merged_matrix = tf.concat([base_sementic_matrix, hypothesis_sementic_matrix], -1)
#     logit_layer = tf.keras.layers.dot([base_sementic_matrix, hypothesis_sementic_matrix], axes=1, normalize=True)    

    with tf.variable_scope('output_layer'):
#     logit_layer = K.exp(-K.sum(K.abs(base_sementic_matrix - hypothesis_sementic_matrix), axis=1, keepdims=True))

        # 두 벡터 사이의 맨하탄 거리를 계산해서 두 문장 사이의 유사도 측정
        # 두 벡터를 뺀 후 절댓값을 이용
        # 0~1사이 값을 만들기 위해 reduce_sum함수를 사용
        logit_layer = tf.exp(-tf.reduce_sum(tf.abs(q_final_state - sim_final_state), axis=1, keepdims=True))
        logit_layer = tf.squeeze(logit_layer, axis=-1)
                
    if PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'is_duplicate':logit_layer
                  })
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.to_float(labels)
    
#     loss = tf.reduce_mean(tf.keras.metrics.binary_crossentropy(y_true=labels, y_pred=logit_layer))
    loss = tf.losses.mean_squared_error(labels=labels, predictions=logit_layer)
#     loss = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(labels, logit_layer))
    
    # 이전의 합성곱 신경망 모델과 동일
    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(logit_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops= eval_metric_ops,
                  loss=loss)

    elif TRAIN:

        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

# Training & Eval

In [23]:
# os.environ["CUDA_VISIBLE_DEVICES"]="0" #For TEST  

model_dir = os.path.join(os.getcwd(), DATA_OUT_PATH + "/checkpoint/rnn/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig(save_checkpoints_steps=500,
                                save_checkpoints_secs=None,
                                  keep_checkpoint_max=2,
                                  log_step_count_steps=200)
# 에스티 메이터 객체 생성
lstm_est = tf.estimator.Estimator(Malstm, model_dir=model_dir, config=config_tf)

INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\jch\\5.TEXT_SIM\\./data_out//checkpoint/rnn/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 200, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [24]:
# 학습
lstm_est.train(train_input_fn)

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:

INFO:tensorflow:global_step/sec: 8.04958
INFO:tensorflow:loss = 0.22710523, step = 5000 (24.845 sec)
INFO:tensorflow:global_step/sec: 8.95928
INFO:tensorflow:loss = 0.15898022, step = 5200 (22.324 sec)
INFO:tensorflow:global_step/sec: 9.26481
INFO:tensorflow:loss = 0.15789142, step = 5400 (21.587 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 5500...
INFO:tensorflow:Saving checkpoints for 5500 into C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 5500...
INFO:tensorflow:global_step/sec: 8.30343
INFO:tensorflow:loss = 0.16087905, step = 5600 (24.092 sec)
INFO:tensorflow:global_step/sec: 8.33903
INFO:tensorflow:loss = 0.22352502, step = 5800 (23.978 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 6000...
INFO:tensorflow:Saving checkpoints for 6000 into C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt.
INFO:tensorflow:Calling checkpoint lis

INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 13000...
INFO:tensorflow:global_step/sec: 8.07182
INFO:tensorflow:loss = 0.10337884, step = 13000 (24.777 sec)
INFO:tensorflow:global_step/sec: 9.22583
INFO:tensorflow:loss = 0.19080359, step = 13200 (21.678 sec)
INFO:tensorflow:global_step/sec: 9.46117
INFO:tensorflow:loss = 0.19325057, step = 13400 (21.139 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 13500...
INFO:tensorflow:Saving checkpoints for 13500 into C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 13500...
INFO:tensorflow:global_step/sec: 7.97218
INFO:tensorflow:loss = 0.17835695, step = 13600 (25.086 sec)
INFO:tensorflow:global_step/sec: 9.26674
INFO:tensorflow:loss = 0.09037018, step = 13800 (21.583 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 14000...
INFO:tensorflow:Saving checkpoints for 14000 into C:\Users\jch\

INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 21000...
INFO:tensorflow:Saving checkpoints for 21000 into C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 21000...
INFO:tensorflow:global_step/sec: 8.35247
INFO:tensorflow:loss = 0.15872784, step = 21000 (23.945 sec)
INFO:tensorflow:global_step/sec: 9.0978
INFO:tensorflow:loss = 0.10133091, step = 21200 (21.983 sec)
INFO:tensorflow:global_step/sec: 9.15005
INFO:tensorflow:loss = 0.20792097, step = 21400 (21.859 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 21500...
INFO:tensorflow:Saving checkpoints for 21500 into C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 21500...
INFO:tensorflow:global_step/sec: 7.31176
INFO:tensorflow:loss = 0.10088998, step = 21600 (27.353 sec)
INFO:tensorflow:global_step/sec: 9.06055
INFO:tensorf

INFO:tensorflow:loss = 0.17553115, step = 28600 (22.682 sec)
INFO:tensorflow:global_step/sec: 9.69532
INFO:tensorflow:loss = 0.094350144, step = 28800 (20.630 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 29000...
INFO:tensorflow:Saving checkpoints for 29000 into C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 29000...
INFO:tensorflow:global_step/sec: 8.33387
INFO:tensorflow:loss = 0.19382627, step = 29000 (23.998 sec)
INFO:tensorflow:global_step/sec: 8.86058
INFO:tensorflow:loss = 0.11388004, step = 29200 (22.572 sec)
INFO:tensorflow:global_step/sec: 8.40671
INFO:tensorflow:loss = 0.18449283, step = 29400 (23.791 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 29500...
INFO:tensorflow:Saving checkpoints for 29500 into C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x253cda79f40>

In [25]:
# 평가
lstm_est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-08-04T18:21:16Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt-33586
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 5.89366s
INFO:tensorflow:Finished evaluation at 2020-08-04-18:21:22
INFO:tensorflow:Saving dict for global step 33586: acc = 0.79017186, global_step = 33586, loss = 0.1469722
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 33586: C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt-33586


{'acc': 0.79017186, 'loss': 0.1469722, 'global_step': 33586}

# Load test dataset & create submit dataset to kaggle

In [28]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'),allow_pickle=True)
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'),allow_pickle=True)
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'),allow_pickle=True)

In [29]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"base":test_q1_data, 
                                                         "hypothesis":test_q2_data}, 
                                                      shuffle=False)
predictions = np.array([p['is_duplicate'] for p in lstm_est.predict(input_fn=predict_input_fn)])

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\jch\5.TEXT_SIM\./data_out//checkpoint/rnn/model.ckpt-33586
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.


In [30]:
print(len(predictions)) #2345796

output = pd.DataFrame( data={"test_id":test_id_data, "is_duplicate": list(predictions)} )
output.to_csv( "rnn_predict.csv", index=False, quoting=3 )

2345796
