# Sequence to Sequence Classification by RNN

- Creating the **data pipeline** with `tf.data`
- Preprocessing word sequences (variable input sequence length) by padding them with a user-defined function (`pad_seq`)
- Using `tf.nn.embedding_lookup` to get the vector of each token (e.g. word, character)
- Training **many-to-many classification** with `tf.contrib.seq2seq.sequence_loss`
- Masking invalid (padding) tokens with `tf.sequence_mask`
- Creating the model as a **Class**

In [None]:
import os
import sys
import time
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

# Short aliases for the tf.contrib modules used below.
slim = tf.contrib.slim
rnn = tf.contrib.rnn

# Session configuration with the GPU option `allow_growth` enabled.
sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

## Prepare example data 

In [None]:
# Toy corpus: every sentence is a list of word tokens.
sentences = [['I', 'feel', 'hungry'],
             ['You', 'are', 'a', 'genius'],
             ['tensorflow', 'is', 'very', 'difficult'],
             ['tensorflow', 'is', 'a', 'framework', 'for', 'deep', 'learning'],
             ['tensorflow', 'is', 'very', 'fast', 'changing']]
# Part-of-speech labels: pos[i][j] is the tag of sentences[i][j].
# The article 'a' is tagged 'determiner' in every sentence it appears in.
pos = [['pronoun', 'verb', 'adjective'],
       ['pronoun', 'verb', 'determiner', 'noun'],
       ['noun', 'verb', 'adverb', 'adjective'],
       ['noun', 'verb', 'determiner', 'noun', 'preposition', 'adjective', 'noun'],
       ['noun', 'verb', 'adverb', 'adjective', 'verb']]

In [None]:
# word dictionary: unique tokens in sorted order, with '<pad>' placed at index 0
bag_of_words = ['<pad>'] + sorted({word for sentence in sentences for word in sentence})

word2idx = {token: position for position, token in enumerate(bag_of_words)} # word to index
idx2word = list(bag_of_words) # index to word

In [None]:
# Show the word -> index mapping as the cell output.
word2idx

In [None]:
# Show the index -> word list as the cell output.
idx2word

In [None]:
# pos dictionary: unique tags in sorted order, with '<pad>' placed at index 0
bag_of_pos = ['<pad>'] + sorted({tag for tags in pos for tag in tags})
print("bag_of_pos: {}".format(bag_of_pos))

pos2idx = {tag: position for position, tag in enumerate(bag_of_pos)} # pos to index
idx2pos = list(bag_of_pos) # index to pos

In [None]:
# Show the tag -> index mapping as the cell output.
pos2idx

In [None]:
# Show the index -> tag list as the cell output.
idx2pos

### Create pad_seq function

In [None]:
def pad_seq(sequences, max_length, dic):
  """Convert token sequences to fixed-width rows of indices.

  Every sequence is mapped through `dic` and right-padded with the index of
  the special token '<pad>' up to `max_length`. Sequences longer than
  `max_length` are truncated so that all rows have the same width.

  Args:
    sequences (list of list of str): tokenized input data
    max_length (int): width of every row of the returned index array
    dic (dictionary): token to index; needs a '<pad>' entry whenever a
      sequence is shorter than `max_length`

  Returns:
    seq_indices (2-rank np.array): padded indices, one row per sequence
    seq_length (1-rank np.array): number of real (non-pad) tokens kept in
      each row, never larger than `max_length`

  Raises:
    KeyError: if a token (or a needed '<pad>') is missing from `dic`
  """
  seq_length, seq_indices = [], []
  for sequence in sequences:
    # Truncate first so that lengths and rows always agree with max_length.
    tokens = list(sequence)[:max_length]
    unknown = [token for token in tokens if token not in dic]
    if unknown:
      # Fail loudly instead of silently putting None into the index array.
      raise KeyError("tokens missing from dictionary: {}".format(unknown))
    seq_idx = [dic[token] for token in tokens]
    pad_count = max_length - len(seq_idx)
    if pad_count > 0:
      seq_idx += pad_count * [dic['<pad>']]
    seq_indices.append(seq_idx)
    seq_length.append(len(tokens))
  return np.array(seq_indices), np.array(seq_length)

### Pre-process data

In [None]:
# Pad every sentence to a common width of max_length word indices.
max_length = 10
X_indices, X_length = pad_seq(sequences=sentences, max_length=max_length, dic=word2idx)

In [None]:
# Inspect the padded index matrix and the per-sentence lengths returned by pad_seq.
print("X_indices")
print(X_indices)
print("X_length")
print(X_length)

In [None]:
# Labels in string form: every tag sequence padded with '<pad>' up to max_length.
padded_tags = [tags + (max_length - len(tags)) * ['<pad>'] for tags in pos]
y_string = np.array(padded_tags)
print(y_string)

In [None]:
# Labels in index form: map every padded tag string through pos2idx.
y = np.array([[pos2idx.get(tag) for tag in row] for row in y_string])
print(y)

### Define PosRNN

In [None]:
class PosRNN:
  """Bidirectional LSTM tagger that predicts one class per input token.

  The whole graph is built in the constructor: a fixed one-hot embedding
  lookup, a bidirectional LSTM, a linear projection of every time step to
  `num_classes` logits, a masked sequence loss and an argmax prediction op.

  Attributes:
    seq2seq_loss: loss tensor returned by `tf.contrib.seq2seq.sequence_loss`
  """

  def __init__(self, seq_indices, seq_length, labels, num_classes, hidden_dim, max_length, word2idx):
    """Build the graph on top of the given input tensors.

    Args:
      seq_indices: tensor of padded token indices; expected to be
        (batch, max_length) so that it matches the mask built below
      seq_length: tensor with the number of valid tokens of each sequence
      labels: tensor of target class indices, same layout as `seq_indices`
      num_classes (int): number of output classes
      hidden_dim (int): number of units of each LSTM direction
      max_length (int): padded sequence length, used as the mask width
      word2idx (dictionary): word to index; only its size is used, as the
        dimension of the one-hot embedding
    """
    # Data pipeline
    with tf.variable_scope('input_layer'):
      self._seq_indices = seq_indices
      self._seq_length = seq_length
      self._labels = labels

      one_hot = tf.eye(len(word2idx), dtype=tf.float32)
      self._one_hot = tf.get_variable(name='one_hot_embedding',
                                      initializer=one_hot,
                                      trainable=False) # the embedding vectors are not trained
      self._seq_embeddings = tf.nn.embedding_lookup(params=self._one_hot,
                                                    ids=self._seq_indices)

    # bidirectional LSTM cell (many to many)
    with tf.variable_scope('rnn_cell'):
      cell_fw = rnn.BasicLSTMCell(num_units=hidden_dim, state_is_tuple=True)
      cell_bw = rnn.BasicLSTMCell(num_units=hidden_dim, state_is_tuple=True)
      outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw,
                                                   self._seq_embeddings,
                                                   sequence_length=self._seq_length,
                                                   dtype=tf.float32)
      # Concatenate forward and backward outputs along the feature axis.
      concat_outputs = tf.concat([outputs[0], outputs[1]], axis=2)

      weights = tf.get_variable(name='weights', shape=[2 * hidden_dim, num_classes],
                                initializer=slim.xavier_initializer())
      # Apply the same projection to every time step by contracting the
      # feature axis once, instead of looping over examples with tf.map_fn.
      self._logits = tf.tensordot(concat_outputs, weights, axes=[[2], [0]])

    with tf.variable_scope('seq2seq_loss'):
      # 1.0 for real tokens, 0.0 for padding, so padding does not contribute to the loss.
      masks = tf.sequence_mask(lengths=self._seq_length, maxlen=max_length, dtype=tf.float32)
      self.seq2seq_loss = tf.contrib.seq2seq.sequence_loss(logits=self._logits,
                                                           targets=self._labels,
                                                           weights=masks)

    with tf.variable_scope('prediction'):
      self._prediction = tf.argmax(input=self._logits,
                                   axis=2, output_type=tf.int32)

  def predict(self, sess, seq_indices, seq_length):
    """Return the predicted class index of every time step.

    Args:
      sess: session in which the prediction op is run
      seq_indices: padded token indices, fed in place of the input tensor
      seq_length: sequence lengths, fed in place of the length tensor

    Returns:
      the value of the argmax prediction op for the fed inputs
    """
    feed_dict = {self._seq_indices : seq_indices, self._seq_length : seq_length}
    return sess.run(self._prediction, feed_dict=feed_dict)

### Create a model of PosRNN

In [None]:
# hyper-parameter
num_classes = len(idx2pos)  # one class per entry of idx2pos, including '<pad>'
learning_rate = .003
batch_size = 2
max_epochs = 100

### Set up dataset with `tf.data`

#### create input pipeline with `tf.data.Dataset`

In [None]:
## create data pipeline with tf.data: slice per example, shuffle, then batch
train_dataset = (tf.data.Dataset
                 .from_tensor_slices((X_indices, X_length, y))
                 .shuffle(buffer_size=100)
                 .batch(batch_size=batch_size))
print(train_dataset)

#### Define Iterator

In [None]:
# The iterator has to be initialized (train_iterator.initializer) before get_next() yields data.
train_iterator = train_dataset.make_initializable_iterator()
# Tensors for one batch, in the order the dataset was built: indices, lengths, labels.
seq_indices, seq_length, labels = train_iterator.get_next()

In [None]:
# Build the model graph directly on the iterator's batch tensors.
pos_rnn = PosRNN(seq_indices=seq_indices, seq_length=seq_length,
                 labels=labels, num_classes=num_classes,
                 hidden_dim=16, max_length=max_length,
                 word2idx=word2idx)

### Create training op and train model

In [None]:
## create training op: Adam minimizing the model's masked sequence loss
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.minimize(pos_rnn.seq2seq_loss)

### `tf.Session()` and train

In [None]:
## train in a session that actually uses the GPU options defined in sess_config
sess = tf.Session(config=sess_config)
sess.run(tf.global_variables_initializer())

num_examples = len(X_indices)  # examples processed in one pass over the dataset
loss_history = []  # average loss of every epoch
step = 0           # total number of training steps taken so far
for epoch in range(max_epochs):
  start_time = time.time()
  # Re-initialize the iterator so the dataset is consumed again from the start.
  sess.run(train_iterator.initializer)

  batch_losses = []
  while True:
    try:
      _, loss_ = sess.run([train_op, pos_rnn.seq2seq_loss])
    except tf.errors.OutOfRangeError:
      # The iterator is exhausted: this epoch is finished.
      break
    batch_losses.append(loss_)
    step += 1

  avg_loss_ = np.mean(batch_losses)
  loss_history.append(avg_loss_)

  # Throughput is measured over the whole epoch, so use the epoch's example
  # count and divide the epoch duration by the number of batches.
  duration = max(time.time() - start_time, 1e-12)
  examples_per_sec = num_examples / float(duration)
  sec_per_batch = duration / max(len(batch_losses), 1)
  print("epochs: {}, step: {}, loss: {:g}, ({:.2f} examples/sec; {:.3f} sec/batch)".format(epoch+1, step, avg_loss_, examples_per_sec, sec_per_batch))

In [None]:
# Plot the average training loss per epoch; the legend makes the 'train' label visible.
plt.plot(loss_history, label='train')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
# Predict tag indices for the full training set by feeding the padded arrays directly.
y_pred = pos_rnn.predict(sess=sess, seq_indices=X_indices, seq_length=X_length)
print(y_pred)

In [None]:
# Turn predicted indices back into tag strings, then print them next to the labels.
result_str = [[idx2pos[idx] for idx in example] for example in y_pred]

for label_tokens, predicted_tokens in zip(y_string, result_str):
  print("        Label: ", ' '.join(label_tokens))
  print("Prediction: ", ' '.join(predicted_tokens))