### 1-D CNN Model

In [22]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
import json

### 1. Data load

In [23]:
if not os.path.exists('C:/python/NLP/Chap_5/CNN_output'):
    os.mkdir('C:/python/NLP/Chap_5/CNN_output')

DATA_IN_PATH = 'C:/python/NLP/Chap_5/data_for_modeling/'
DATA_OUT_PATH = 'C:/python/NLP/Chap_5/CNN_output/'

TRAIN_Q1_DATA_FILE = 'q1_train.npy'
TRAIN_Q2_DATA_FILE = 'q2_train.npy'
TRAIN_LABEL_DATA_FILE = 'label_train.npy'
DATA_CONFIGS = 'data_configs.json'


In [24]:
q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = None

with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as f:
    prepro_configs = json.load(f)

In [25]:
q1_data.shape

(298526, 31)

In [26]:
TEST_RATIO = 0.1
RANDOM_SEED = 42

X = np.stack((q1_data, q2_data), axis=1)
y = labels
train_X, dev_X, train_y, dev_y = train_test_split(X, y, test_size=TEST_RATIO, random_state=RANDOM_SEED)

In [27]:
train_Q1 = train_X[:, 0]
train_Q2 = train_X[:, 1]
dev_Q1 = dev_X[:, 0]
dev_Q2 = dev_X[:, 1]

### 2. Data preparation

In [28]:
# def init(seed=30):
#     tf.global_variables_initializer()
#     tf.reset_default_graph()
#     tf.set_random_seed(seed)
#     np.random.seed(seed)
    
# # init()

In [29]:
def rearrange(base, hypothesis, labels):
    features = {"x1": base, "x2": hypothesis}
    return features, labels

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(16)
    dataset = dataset.map(rearrange)
    dataset = dataset.repeat(EPOCH)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((dev_Q1, dev_Q2, dev_y))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(16)
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

### 3. Network

In [30]:
def basic_conv_semantic_network(inputs, name):
    
    conv_layer = tf.keras.layers.Conv1D(CONV_FEATURE_DIM,
                                        CONV_WINDOW_SIZE,
                                        activation=tf.nn.relu,
                                        name=name + 'conv_1d',
                                        padding='same')(inputs)
    
    max_pool_layer = tf.keras.layers.MaxPool1D(MAX_POOL_WINDOW_SIZE, 1)(conv_layer)
    
    output_layer = tf.keras.layers.Dense(CONV_OUTPUT_FEATURE_SIZE,
                                         activation=tf.nn.relu,
                                         name=name+ 'dense')(max_pool_layer)
    
    return output_layer

# Detail of each layers is not defined yet

### 4. Modeling

In [31]:
VOCAB_SIZE = prepro_configs['vocab_size']
WORD_EMBEDDING_DIM = 128
CONV_FEATURE_DIM = 64  
CONV_WINDOW_SIZE = 3    # n-gram model
MAX_POOL_WINDOW_SIZE = 2
CONV_OUTPUT_FEATURE_SIZE = 64
SIMILARITY_DENSE_FEATURE_DIM = 64
EPOCH = 1


def model_fn(features, labels, mode):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    embedding = tf.keras.layers.Embedding(VOCAB_SIZE, WORD_EMBEDDING_DIM)
    base_embedded_matrix = embedding(features['x1'])
    hypothesis_embedded_matrix = embedding(features['x2'])
    
    base_embedded_matrix = tf.keras.layers.Dropout(0.2)(base_embedded_matrix)
    hypothesis_embedded_matrix = tf.keras.layers.Dropout(0.2)(base_embedded_matrix)
    
    
    base_semantic_matrix = basic_conv_semantic_network(base_embedded_matrix, 'base')
    hypothesis_semantic_matrix = basic_conv_semantic_network(hypothesis_embedded_matrix, 'hypothesis')
   
    merged_matrix = tf.concat([base_semantic_matrix, hypothesis_semantic_matrix], -1)
    
    similarity_dense_layer = tf.keras.layers.Flatten()(merged_matrix)
    similarity_dense_layer = tf.keras.layers.Dense(SIMILARITY_DENSE_FEATURE_DIM, activation=tf.nn.relu)(similarity_dense_layer)  # Faltten
    similarity_dense_layer = tf.keras.layers.Dropout(0.2)(similarity_dense_layer)
    
    logit_layer = tf.keras.layers.Dense(1)(similarity_dense_layer)
    
    logit_layer = tf.squeeze(logit_layer, 1)
    similarity = tf.nn.sigmoid(logit_layer)
    
    
    
    if PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions={'is_duplicate': similarity})
    
    loss = tf.losses.sigmoid_cross_entropy(labels, similarity) 
    
    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(similarity))
        return tf.estimator.EstimatorSpec(mode=mode,
                                          eval_metric_ops={'acc': accuracy},
                                          loss=loss)
    
    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          train_op=train_op,
                                          loss=loss)

### TRAIN

In [32]:
est = tf.estimator.Estimator(model_fn, model_dir=DATA_OUT_PATH + 'checkpoint')

In [33]:
est.train(train_input_fn)

(?, 3840)


KeyboardInterrupt: 

In [34]:
est.evaluate(eval_input_fn)

(?, 3840)


W1004 13:56:44.752437  2996 deprecation.py:323] From C:\Anaconda_\lib\site-packages\tensorflow\python\training\saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


{'acc': 0.66063714, 'loss': 0.64759374, 'global_step': 3317}

### TEST

In [35]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'))

In [36]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={'x1': test_q1_data, 'x2': test_q2_data}, shuffle=False)

predictions = np.array([p['is_duplicate'] for p in est.predict(input_fn=predict_input_fn)])

W1004 13:57:12.979921  2996 deprecation.py:323] From C:\Anaconda_\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
W1004 13:57:12.979921  2996 deprecation.py:323] From C:\Anaconda_\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.


(?, 3840)


W1004 13:57:13.559716  2996 deprecation.py:323] From C:\Anaconda_\lib\site-packages\tensorflow\python\training\monitored_session.py:875: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.


In [37]:
output = pd.DataFrame(data={"test_id": test_id_data, "is_duplicate": list(predictions)})
output.to_csv(DATA_OUT_PATH + "cnn_predict.csv", index=False, quoting=3)

In [38]:
predictions.shape

(2345796,)

In [37]:
# output[(output['is_duplicate'] != 1)]

In [39]:
output.head(50)

Unnamed: 0,test_id,is_duplicate
0,0,0.9940332
1,1,0.0
2,2,0.9998206
3,3,0.01562777
4,4,0.9535001
5,5,0.0
6,6,0.9999331
7,7,1.0
8,8,0.9999844
9,9,0.0
