# Sentiment Analysis with IMDB dataset
## 4. Modeling
### 4.3 RNN
### Data load

In [9]:
import numpy as np
import json

# Directory holding the preprocessed inputs, and the directory for outputs.
DATA_IN_PATH = 'C:/python/NLP/Chap_4/data_for_modeling/'
DATA_OUT_PATH = 'C:/python/NLP/Chap_4/submit/'

INPUT_TRAIN_DATA_FILE = 'train_input.npy'
LABEL_TRAIN_DATA_FILE = 'train_label.npy'
DATA_CONFIGS_FILE = 'data_configs.json'

# Pass the path straight to np.load so NumPy opens and closes the file itself;
# wrapping the path in a bare open(...) left the file handle unclosed.
train_input = np.load(DATA_IN_PATH + INPUT_TRAIN_DATA_FILE)
train_label = np.load(DATA_IN_PATH + LABEL_TRAIN_DATA_FILE)

# Preprocessing metadata; its 'vocab_size' entry is read when the model is built.
with open(DATA_IN_PATH + DATA_CONFIGS_FILE, 'r') as f:
    prepro_configs = json.load(f)

### Data split

In [10]:
from sklearn.model_selection import train_test_split

# Hold out a TEST_SIZE fraction of the training data as a dev set;
# the fixed seed keeps the split reproducible.
TEST_SIZE = 0.1
RANDOM_SEED = 42

X_train, X_dev, Y_train, Y_dev = train_test_split(
    train_input,
    train_label,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

In [11]:
# Display the shape of the training split's input array.
X_train.shape

(22500, 174)

# data(tf.data) -> model(keras.layers) -> training(tf.estimator)
### Define data function - tf.data

In [12]:
import tensorflow as tf

BATCH_SIZE = 16  # examples per batch in the train and eval input pipelines
NUM_EPOCHS = 3  # passed to dataset.repeat(count=...) in train_input_fn

def mapping_fn(X, Y):
    """Wrap the features in a dict under the key 'x' and pass the labels through unchanged."""
    return {'x': X}, Y

def train_input_fn():
    """Training input function for the Estimator.

    Slices (X_train, Y_train), shuffles with a 1000-element buffer, batches by
    BATCH_SIZE, wraps the features via mapping_fn and repeats NUM_EPOCHS times.

    Returns:
        The `get_next()` result of a one-shot iterator over that dataset,
        i.e. one ({'x': features}, labels) batch.
    """
    pipeline = (
        tf.data.Dataset.from_tensor_slices((X_train, Y_train))
        .shuffle(buffer_size=1000)
        .batch(BATCH_SIZE)
        .map(mapping_fn)
        .repeat(count=NUM_EPOCHS)
    )
    return pipeline.make_one_shot_iterator().get_next()

def eval_input_fn():
    """Evaluation input function for the Estimator.

    Slices (X_dev, Y_dev), wraps the features via mapping_fn and batches by
    BATCH_SIZE; no shuffling and a single pass over the data.

    Returns:
        The `get_next()` result of a one-shot iterator over that dataset,
        i.e. one ({'x': features}, labels) batch.
    """
    pipeline = (
        tf.data.Dataset.from_tensor_slices((X_dev, Y_dev))
        .map(mapping_fn)
        .batch(BATCH_SIZE)
    )
    return pipeline.make_one_shot_iterator().get_next()

### Modeling - tf.keras.layers

In [13]:
# Display the 'vocab_size' entry of the preprocessing configs.
prepro_configs['vocab_size']

74066

In [14]:
VOCAB_SIZE = prepro_configs['vocab_size']  # input dimension of the Embedding layer

WORD_EMBEDDING_DIM = 100  # output dimension of the Embedding layer
HIDDEN_STATE_DIM = 150  # units in each of the two stacked LSTM cells
DENSE_FEATURE_DIM = 150  # units in the tanh Dense layer

learning_rate = 0.001  # passed to tf.train.AdamOptimizer in model_fn

def model_fn(features, labels, mode):
    """Estimator model function: Embedding -> 2 stacked LSTM cells -> Dense(tanh) -> Dense(1).

    Args:
        features: dict whose 'x' entry is fed to the Embedding layer.
        labels: targets for the sigmoid cross-entropy loss and the accuracy
            metric (not used in PREDICT mode).
        mode: a tf.estimator.ModeKeys value (TRAIN, EVAL or PREDICT).

    Returns:
        A tf.estimator.EstimatorSpec for the given mode. PREDICT exposes the
        sigmoid output under the 'sentiment' key, EVAL reports the loss and an
        'acc' metric, TRAIN returns the loss and an Adam training op.
    """
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    # Every Dropout call gets an explicit `training` flag. Without it the Keras
    # layer falls back to the Keras learning phase, which the Estimator never
    # switches on, so dropout would silently be skipped during training.
    embedding_layer = tf.keras.layers.Embedding(VOCAB_SIZE, WORD_EMBEDDING_DIM)(features['x'])
    embedding_layer = tf.keras.layers.Dropout(0.2)(embedding_layer, training=TRAIN)

    rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [HIDDEN_STATE_DIM, HIDDEN_STATE_DIM]]
    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)

    # dynamic_rnn unrolls the stacked cell over the time axis, so no explicit
    # Python loop over time steps is needed.
    outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                       inputs=embedding_layer,
                                       dtype=tf.float32)
    outputs = tf.keras.layers.Dropout(0.2)(outputs, training=TRAIN)

    # Only the output at the last time step feeds the classifier head.
    # NOTE(review): no sequence_length is passed to dynamic_rnn; if the inputs
    # are right-padded, this last step is computed over padding -- confirm the
    # padding scheme used during preprocessing.
    hidden_layer = tf.keras.layers.Dense(DENSE_FEATURE_DIM, activation=tf.nn.tanh)(outputs[:, -1, :])
    hidden_layer = tf.keras.layers.Dropout(0.2)(hidden_layer, training=TRAIN)

    # Single logit per example; squeeze drops the trailing size-1 axis.
    logits = tf.keras.layers.Dense(1)(hidden_layer)
    logits = tf.squeeze(logits, axis=-1)

    sigmoid_logits = tf.nn.sigmoid(logits)

    if PREDICT:
        predictions = {'sentiment': sigmoid_logits}

        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # The loss takes the raw logits, not the sigmoid output.
    loss = tf.losses.sigmoid_cross_entropy(labels, logits)

    if EVAL:
        # Rounding the sigmoid output gives a 0/1 prediction to compare with the labels.
        accuracy = tf.metrics.accuracy(labels, tf.round(sigmoid_logits))
        eval_metric_ops = {'acc': accuracy}

        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

### Training, Evaluating and Predicting - tf.estimator

In [15]:
import os

# makedirs(..., exist_ok=True) creates any missing parent directories and does
# not fail when the directory already exists, unlike an exists-check followed
# by os.mkdir.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# The Estimator uses DATA_OUT_PATH + 'checkpoint' as its model directory.
est = tf.estimator.Estimator(model_fn, model_dir=DATA_OUT_PATH + 'checkpoint')

##### Train

In [16]:
# Train the estimator on the batches produced by train_input_fn.
est.train(train_input_fn)

W0719 16:27:16.595093 10124 deprecation.py:323] From C:\Anaconda_\lib\site-packages\tensorflow\python\training\saver.py:960: remove_checkpoint (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to delete files with this prefix.


<tensorflow_estimator.python.estimator.estimator.Estimator at 0x21b01c0d048>

##### Eval

In [17]:
# Evaluate on the dev split supplied by eval_input_fn; displays the metrics dict.
est.evaluate(eval_input_fn)

{'acc': 0.8656, 'loss': 0.3385089, 'global_step': 8442}

##### Test

In [18]:
# Load by path so NumPy opens and closes the file itself; the previous
# np.load(open(...)) form left the file handle unclosed.
test_input_data = np.load(DATA_IN_PATH + 'test_input.npy')

### Prepare the test input data

In [19]:
TEST_INPUT_DATA = 'test_input.npy'

# Load by path so NumPy opens and closes the file itself; the previous
# np.load(open(...)) form left the file handle unclosed.
test_input_data = np.load(DATA_IN_PATH + TEST_INPUT_DATA)

# np.load already returns an ndarray, so it is passed as-is (no extra copy).
# shuffle=False keeps the predictions in the same order as the rows of the input.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": test_input_data}, shuffle=False)

In [20]:
# Spot-check a single element (row 115, column 133) of the test input array.
test_input_data[115,133]

3129

In [21]:
# Run the estimator over the test input and collect the 'sentiment' value of
# every prediction into one array.
predictions = np.array([
    pred['sentiment']
    for pred in est.predict(input_fn=predict_input_fn)
])

W0719 17:35:42.082572 10124 deprecation.py:323] From C:\Anaconda_\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
W0719 17:35:42.092564 10124 deprecation.py:323] From C:\Anaconda_\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
W0719 17:35:42.677980 10124 deprecation.py:323] From C:\Anaconda_\lib\site-packages\tensorflow\python\training\monitored_session.py:875: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future v

In [22]:
import pandas as pd

TEST_ID_DATA = 'test_id.npy'
# Load by path so NumPy opens and closes the file itself; the previous
# np.load(open(...)) form left the file handle unclosed.
test_id = np.load(DATA_IN_PATH + TEST_ID_DATA)

# NOTE: the variable name `ouput` (sic) is kept unchanged in case other cells refer to it.
ouput = pd.DataFrame(data = {"id": test_id, "sentiment": list(predictions)})
# quoting=3 is csv.QUOTE_NONE: fields are written without surrounding quotes.
ouput.to_csv(DATA_OUT_PATH + "rnn_predict.csv", index=False, quoting=3)