# Sequence classification by RNN

- Creating the **data pipeline** with `tf.data`
- Preprocessing word sequences of variable length by padding them with a user-defined function (`pad_seq`)
- Using `tf.nn.embedding_lookup` to get the vector for each token (e.g. word, character)
- Creating the model as **Class**
- Reference
    - https://github.com/golbin/TensorFlow-Tutorials/blob/master/10%20-%20RNN/02%20-%20Autocomplete.py
    - https://github.com/aisolab/TF_code_examples_for_Deep_learning/blob/master/Tutorial%20of%20implementing%20Sequence%20classification%20with%20RNN%20series.ipynb


In [None]:
import os
import sys
import time
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

# Short aliases for the contrib sub-modules used when building the model.
rnn = tf.contrib.rnn
slim = tf.contrib.slim

# Session options: enable the allow_growth flag of the GPU options.
gpu_options = tf.GPUOptions(allow_growth=True)
sess_config = tf.ConfigProto(gpu_options=gpu_options)

## Prepare example data

In [None]:
# Toy corpus: each phrase is tagged with a class id that selects one row of
# the 2x2 identity matrix ([1., 0.] for id 0, [0., 1.] for id 1).
words = [
  'good', 'bad', 'amazing', 'so good', 'bull shit',
  'awesome', 'how dare', 'very much', 'nice',
]
class_ids = [0, 1, 0, 0, 1, 0, 1, 0, 0]
y = np.eye(2)[class_ids]

In [None]:
# Character vocabulary: the lowercase ASCII letters, then the space character,
# then '*' which serves as the padding token.
char_space = string.ascii_lowercase + ' *'
print("char_space: {}".format(char_space))

In [None]:
# Index -> character lookup: position i holds the i-th character of char_space.
idx2char = list(char_space)
print("idx2char: {}".format(idx2char))

In [None]:
# Character -> index lookup, the inverse of idx2char.
char2idx = dict(zip(char_space, range(len(char_space))))
print("char2idx: {}".format(char2idx))

### Create pad_seq function

In [None]:
def pad_seq(sequences, max_length, dic):
  """Convert strings to fixed-length rows of token indices.

  Each sequence is mapped character by character through `dic`, truncated to
  `max_length` if it is longer, and right-padded with the index of the
  padding token '*' if it is shorter.

  Args:
    sequences (list of str): input data, one string per example
    max_length (int): number of columns of the returned index array
    dic (dictionary): char to index; must contain the padding token '*'

  Returns:
    seq_indices (2-rank np.array): shape (len(sequences), max_length),
      token indices padded with the index of '*'
    seq_length (1-rank np.array): number of non-padding tokens kept for each
      sequence (never larger than max_length)

  Raises:
    KeyError: if '*' or any character of a sequence is missing from `dic`
  """
  # Direct indexing (not dic.get) so a missing token fails loudly here instead
  # of silently placing None into the index array.
  pad_idx = dic['*']
  seq_length, seq_indices = [], []
  for sequence in sequences:
    # Truncate first so every row has exactly max_length entries and the
    # reported length never exceeds the padded width.
    seq_idx = [dic[char] for char in sequence[:max_length]]
    seq_length.append(len(seq_idx))
    seq_idx += (max_length - len(seq_idx)) * [pad_idx]
    seq_indices.append(seq_idx)
  # The explicit reshape keeps the result 2-D even when `sequences` is empty.
  indices = np.array(seq_indices, dtype=np.int64).reshape(len(seq_indices), max_length)
  return indices, np.array(seq_length, dtype=np.int64)

### Apply pad_seq function to data

In [None]:
# Pad every phrase to a common width so the index rows form one 2-D array.
max_length = 10
X_indices, X_length = pad_seq(words, max_length, char2idx)

In [None]:
# Show the padded index matrix, then the unpadded length of each phrase.
for title, value in (("X_indices", X_indices), ("X_length", X_length)):
  print(title)
  print(value)

## Define CharRNN class

In [None]:
class CharRNN:
  """Character-level sequence classifier built from stacked LSTM layers.

  The graph one-hot encodes the character indices, runs them through a
  multi-layer LSTM with `tf.nn.dynamic_rnn`, and applies a linear layer to the
  final hidden state of the last LSTM layer to obtain class logits.

  Attributes:
    loss: softmax cross-entropy between `labels` and the logits.
  """

  def __init__(self, seq_indices, seq_length, labels, num_classes, hidden_dims, dic):
    """Build the model graph.

    Args:
      seq_indices: tensor of padded character indices
      seq_length: tensor of sequence lengths, passed to `dynamic_rnn` as
        `sequence_length`
      labels: tensor of one-hot labels used by the loss
      num_classes (int): number of output logits
      hidden_dims (list of int): number of units of each stacked LSTM layer
      dic (dictionary): char to index; only its size is used here
    """
    # Input tensors are kept as attributes so predict() can feed them.
    with tf.variable_scope('input_layer'):
      self._seq_indices = seq_indices
      self._seq_length = seq_length
      self._labels = labels

      # Fixed one-hot "embedding": an identity matrix with one row per token.
      # It is non-trainable because the embedding vectors are not learned.
      vocab_size = len(dic)
      identity = tf.eye(vocab_size, dtype=tf.float32)
      self._one_hot = tf.get_variable(name='one_hot_embedding',
                                      initializer=identity,
                                      trainable=False)
      self._seq_embeddings = tf.nn.embedding_lookup(params=self._one_hot,
                                                    ids=self._seq_indices)

    # Stack one BasicLSTMCell per entry of hidden_dims.
    with tf.variable_scope('multi_lstm_cell'):
      lstm_layers = []
      for units in hidden_dims:
        lstm_layers.append(rnn.BasicLSTMCell(num_units=units, state_is_tuple=True))
      stacked_cell = rnn.MultiRNNCell(lstm_layers)

      # Only the final states are needed; the per-step outputs are discarded.
      _, final_states = tf.nn.dynamic_rnn(cell=stacked_cell,
                                          inputs=self._seq_embeddings,
                                          sequence_length=self._seq_length,
                                          dtype=tf.float32)

    # Linear projection (no activation) of the last layer's hidden state.
    with tf.variable_scope('output_layer'):
      top_layer_state = final_states[-1]
      self._logits = slim.fully_connected(inputs=top_layer_state.h,
                                          num_outputs=num_classes,
                                          activation_fn=None)

    with tf.variable_scope('loss'):
      self.loss = tf.losses.softmax_cross_entropy(onehot_labels=self._labels,
                                                  logits=self._logits)

    # Predicted class = index of the largest logit.
    with tf.variable_scope('prediction'):
      self._prediction = tf.argmax(input=self._logits, axis=-1, output_type=tf.int32)

  def predict(self, sess, seq_indices, seq_length):
    """Return predicted class ids for the given inputs.

    Args:
      sess: session used to evaluate the prediction tensor
      seq_indices: values fed for the character-index input tensor
      seq_length: values fed for the sequence-length input tensor

    Returns:
      The result of running the argmax-over-logits tensor in `sess`.
    """
    return sess.run(self._prediction,
                    feed_dict={self._seq_indices: seq_indices,
                               self._seq_length: seq_length})

### Create a model of CharRNN

In [None]:
# hyper-parameters
# -- model
num_classes = 2        # number of logits produced by the output layer
# -- optimisation
learning_rate = 0.003  # learning rate handed to the optimizer
max_epochs = 20        # number of passes over the training dataset
batch_size = 2         # examples per batch in the tf.data pipeline

#### Print dataset

In [None]:
# Dump the three arrays that make up the training set.
for template, value in (("X_indices: \n{}", X_indices),
                        ("X_length: {}", X_length),
                        ("y: \n{}", y)):
  print(template.format(value))

### Set up dataset with `tf.data`

#### create input pipeline with `tf.data.Dataset`

In [None]:
## create data pipeline with tf.data
# Slice the three arrays example-wise, shuffle, then group into batches.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_indices, X_length, y))
    .shuffle(buffer_size=100)
    .batch(batch_size=batch_size)
)
print(train_dataset)

#### Define Iterator

In [None]:
# The iterator is started by running train_iterator.initializer in a session;
# get_next() returns the tensors that carry one batch.
train_iterator = train_dataset.make_initializable_iterator()
next_batch = train_iterator.get_next()
seq_indices, seq_length, labels = next_batch

In [None]:
# Build the model graph directly on top of the iterator's batch tensors.
char_rnn = CharRNN(
    seq_indices=seq_indices,
    seq_length=seq_length,
    labels=labels,
    num_classes=num_classes,
    hidden_dims=[32, 16],  # two stacked LSTM layers: 32 units, then 16
    dic=char2idx,
)

### Create training op and train model

In [None]:
## create training op
# Adam optimizer minimizing the model's cross-entropy loss.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss=char_rnn.loss)

### `tf.Session()` and train

In [None]:
# Pass sess_config so the session actually uses the GPU options configured
# for this notebook instead of the TensorFlow defaults.
sess = tf.Session(config=sess_config)
sess.run(tf.global_variables_initializer())

num_examples = len(X_indices)  # examples processed in one epoch
loss_history = []              # mean batch loss, one entry per epoch
step = 0                       # total number of training steps so far
for epochs in range(max_epochs):
  start_time = time.time()
  # Restart the iterator at the beginning of every epoch.
  sess.run(train_iterator.initializer)

  avg_loss = []
  while True:
    # Keep the try body to the one call that signals the end of the dataset.
    try:
      _, loss_ = sess.run([train_op, char_rnn.loss])
    except tf.errors.OutOfRangeError:
      break
    avg_loss.append(loss_)
    step += 1

  avg_loss_ = np.mean(avg_loss)
  loss_history.append(avg_loss_)

  # `duration` covers the whole epoch, so throughput is the epoch's example
  # count over it and the per-batch time is the duration split across batches.
  duration = time.time() - start_time
  num_batches = max(len(avg_loss), 1)
  examples_per_sec = num_examples / float(duration)
  sec_per_batch = duration / float(num_batches)
  print("epochs: {}, step: {}, loss: {:g}, ({:.2f} examples/sec; {:.3f} sec/batch)".format(epochs+1, step, avg_loss_, examples_per_sec, sec_per_batch))

In [None]:
# Training-loss curve: one point per epoch (mean of the batch losses).
plt.plot(loss_history, label='train')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()  # without a legend the label= given to plot() is never shown

In [None]:
# Predict on the full training set: predict() feeds these arrays through
# feed_dict in place of the iterator's batch tensors.
y_pred = char_rnn.predict(sess=sess, seq_indices=X_indices, seq_length=X_length)

In [None]:
# Fraction of examples whose predicted class equals the argmax of the one-hot label.
true_classes = np.argmax(y, axis=-1)
is_correct = (y_pred == true_classes)
accuracy = np.mean(is_correct)
print('training accuracy: {:.2%}'.format(accuracy))