# Sequence classification by RNN

* Creating the **data pipeline** with `tf.data`
* Preprocessing word sequences (variable input sequence length) using `tf.keras.preprocessing`
* Using an embedding layer (`tf.keras.layers.Embedding`) to get a vector for each token (e.g. word, character)
* Creating the model with `tf.keras.Sequential`
* Reference
  * https://github.com/golbin/TensorFlow-Tutorials/blob/master/10%20-%20RNN/02%20-%20Autocomplete.py
  * https://github.com/aisolab/TF_code_examples_for_Deep_learning/blob/master/Tutorial%20of%20implementing%20Sequence%20classification%20with%20RNN%20series.ipynb

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

import tensorflow as tf
from tensorflow.keras import layers
# Switch TensorFlow to eager mode; later cells rely on it (e.g. loss_value.numpy()).
tf.enable_eager_execution()

# Expose only GPU 0 to this process.
# NOTE(review): this is set after TensorFlow has been imported — confirm it
# still takes effect, otherwise move it above the TensorFlow import.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

## Prepare example data

In [None]:
# Toy corpus: every entry pairs a short phrase with its label
# (0 = positive, 1 = negative).
_labelled_phrases = [
    ('good', 0), ('bad', 1), ('amazing', 0), ('so good', 0), ('bull shit', 1),
    ('awesome', 0), ('how dare', 1), ('very much', 0), ('nice', 0), ('god damn it', 1),
    ('very very very happy', 0), ('what the fuck', 1),
]
x_train_words = [phrase for phrase, _ in _labelled_phrases]
y_train = np.array([label for _, label in _labelled_phrases], dtype=np.int32)

In [None]:
# Inspect one positive example (label 0).
index = 0
sample_text, sample_label = x_train_words[index], y_train[index]
print("word: {}\nlabel: {}".format(sample_text, sample_label))

In [None]:
# Inspect one negative example (label 1).
index = 1
sample_text, sample_label = x_train_words[index], y_train[index]
print("word: {}\nlabel: {}".format(sample_text, sample_label))

## Tokenizer

In [None]:
# Tokenize at character level: every distinct character gets its own integer id.
tokenizer = Tokenizer(char_level=True)

In [None]:
%%time
# Scan the training phrases and build the character -> id vocabulary.
tokenizer.fit_on_texts(x_train_words)

In [None]:
# Vocabulary size. The +1 accounts for id 0, which the tokenizer never assigns
# to a character and which pad_sequences uses as the padding value.
num_chars = len(tokenizer.word_index) + 1
print("number of characters: {}".format(num_chars))

In [None]:
# Display the character -> id mapping learned by the tokenizer.
tokenizer.word_index

In [None]:
# Encode every training phrase as a list of character ids.
x_train_tokens = tokenizer.texts_to_sequences(x_train_words)

In [None]:
# Compare one phrase with its character-id encoding.
index = 2
for template, samples in (("text: {}", x_train_words),
                          ("token: {}", x_train_tokens)):
  print(template.format(samples[index]))

In [None]:
# Unpadded length (in characters) of every training sequence.
x_train_seq_length = np.array(list(map(len, x_train_tokens)), dtype=np.int32)
num_seq_length = x_train_seq_length

In [None]:
# Longest sequence in the training set; used as the padded length.
max_seq_length = num_seq_length.max()
print(max_seq_length)

### Create pad_seq data

In [None]:
# Side on which pad_sequences pads (and truncates): 'pre' puts the zeros at the
# start of each sequence, so the real characters sit at the end.
pad = 'pre'
# Alternative: pad and truncate at the end of each sequence instead.
#pad = 'post'

In [None]:
# Zero-pad every token sequence to max_seq_length so they all share one length;
# `pad` selects the side used for both padding and truncation.
x_train_pad = pad_sequences(sequences=x_train_tokens, maxlen=max_seq_length,
                            padding=pad, truncating=pad)

In [None]:
# Show one phrase in its three forms: raw text, token ids, padded token ids.
index = 7
for template, samples in (("text: {}\n", x_train_words),
                          ("token: {}\n", x_train_tokens),
                          ("pad: {}", x_train_pad)):
  print(template.format(samples[index]))

### Tokenizer Inverse Map

In [None]:
# Invert the tokenizer vocabulary so ids can be mapped back to characters.
idx = tokenizer.word_index
inverse_map = {token_id: char for char, token_id in idx.items()}
print(inverse_map)

In [None]:
def tokens_to_string(tokens, mapping=None):
  """Decode a sequence of character ids back into a string.

  Args:
    tokens: iterable of integer character ids; id 0 (padding) is skipped.
    mapping: optional dict from id to character. Defaults to the
      module-level `inverse_map`.

  Returns:
    The decoded characters concatenated without a separator.

  Raises:
    KeyError: if a non-zero id is missing from the mapping.
  """
  if mapping is None:
    # Resolved at call time so the function follows later re-assignments
    # of the notebook-level `inverse_map`.
    mapping = inverse_map

  # The tokenizer works at character level, so the pieces are joined with no
  # separator; spaces are ordinary characters with their own id.
  return "".join(mapping[token] for token in tokens if token != 0)

In [None]:
# Round-trip check: decode the token ids of one phrase and compare with the original.
index = 10
original_text = x_train_words[index]
decoded_text = tokens_to_string(x_train_tokens[index])
print("original text:\n{}\n".format(original_text))
print("tokens to string:\n{}".format(decoded_text))

## Create the Recurrent Neural Network

We are now ready to create the Recurrent Neural Network (RNN). We will use the TensorFlow API.

In [None]:
# Hyperparameters for the input pipeline, the model and the optimizer.
batch_size = 4  # examples per training batch
max_epochs = 50  # passes over the training set; converted to a step count before training
# No separate embedding size is used: the Embedding layer is created with
# num_chars output dimensions and an identity initializer.
#embedding_size = 8
num_units = 16 # number of units in the RNN hidden layer
num_classes = 2 # two classes: 0 = positive, 1 = negative
initializer_scale = 0.1  # NOTE(review): not referenced in the visible cells — confirm it is still needed
learning_rate = 1e-3  # learning rate passed to the Adam optimizer

### Set up dataset with `tf.data`

#### create input pipeline with `tf.data.Dataset`

In [None]:
## Training pipeline: (padded tokens, true length, label) triples,
## shuffled, repeated indefinitely and grouped into batches.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train_pad, x_train_seq_length, y_train))
    .shuffle(buffer_size=100)
    .repeat()
    .batch(batch_size=batch_size))
print(train_dataset)

### Define the CharRNN model

In [None]:
# Character-level classifier, built layer by layer.
model = tf.keras.Sequential()
# Frozen identity embedding: each character id is looked up as a fixed one-hot vector.
model.add(layers.Embedding(num_chars, num_chars,
                           embeddings_initializer='identity', trainable=False))
# The recurrent layer returns only its final output for each sequence.
model.add(layers.SimpleRNN(units=num_units))
# One unnormalized score (logit) per class.
model.add(layers.Dense(units=num_classes))

In [None]:
# Adam optimizer used to apply the gradients during training.
optimizer = tf.train.AdamOptimizer(learning_rate)
# Per-step training loss; filled in by the training loop.
loss_history = []

In [None]:
# Number of optimizer steps: (batches per epoch) * max_epochs.
total_steps = int(len(x_train_words) / batch_size * max_epochs)
global_step = tf.train.get_or_create_global_step()

# train_dataset repeats forever, so take() bounds the loop by a step count
# (iterations), NOT by epochs.
for (step, (seq_pad, seq_length, labels)) in enumerate(train_dataset.take(total_steps)):
  start_time = time.time()

  # Record the forward pass so the loss can be differentiated.
  with tf.GradientTape() as tape:
    logits = model(seq_pad)
    # NOTE(review): the two classes are mutually exclusive, so a softmax
    # cross-entropy would be the conventional loss; kept as-is so training
    # behaviour does not change.
    loss_value = tf.losses.sigmoid_cross_entropy(
        multi_class_labels=tf.one_hot(labels, depth=num_classes),
        logits=logits)

  loss_history.append(loss_value.numpy())

  # Differentiate only with respect to trainable weights: the Embedding layer
  # is frozen (trainable=False), so model.variables would pair it with a None
  # gradient.
  trainable_vars = model.trainable_variables
  grads = tape.gradient(loss_value, trainable_vars)
  optimizer.apply_gradients(zip(grads, trainable_vars),
                            global_step=global_step)

  # Refresh the progress line every third step.
  if step % 3 == 0:
    clear_output(wait=True)
    duration = time.time() - start_time
    examples_per_sec = batch_size / float(duration)
    epochs = batch_size * step / float(len(x_train_words))
    print("epochs: {:.2f}, step: {}, loss: {:g}, ({:.2f} examples/sec; {:.3f} sec/batch)".format(epochs+1, step, loss_value, examples_per_sec, duration))

print("training done!")

### Plot the loss

In [None]:
# Plot the per-step training loss collected during training.
loss_history = np.array(loss_history)
plt.plot(loss_history, label='train')
plt.xlabel('step')
plt.ylabel('loss')
# The curve carries a label, so draw the legend that displays it.
plt.legend()

### Train accuracy and prediction

In [None]:
# Evaluation pipeline: the whole training set, unshuffled, as one single batch.
train_dataset_eval = (
    tf.data.Dataset.from_tensor_slices((x_train_pad, x_train_seq_length, y_train))
    .batch(batch_size=len(x_train_pad)))

In [None]:
# Accuracy over the whole training set.
accuracy = tf.contrib.eager.metrics.Accuracy()

# Evaluate on train_dataset_eval (every example, unshuffled) rather than on
# train_dataset, which is shuffled and repeated and would only contribute one
# random batch of `batch_size` examples.
for seq_pad, seq_length, labels in train_dataset_eval:
  logits = model(seq_pad)
  # The predicted class is the index of the largest logit.
  accuracy(labels=labels, predictions=tf.cast(tf.argmax(logits, 1), tf.int32))

# The data evaluated here is the training set, so report it as such.
print("train accuracy: {}".format(accuracy.result()))

In [None]:
# Run the model on the single evaluation batch and keep the predicted class ids.
for (step, (seq_pad, seq_length, labels)) in enumerate(train_dataset_eval.take(1)):
  logits = model(seq_pad)
  class_ids = tf.argmax(logits, axis=1)
  predictions = tf.cast(class_ids, dtype=tf.int32)

In [None]:
# Display the predicted class ids for the evaluation batch.
predictions

In [None]:
# Print every phrase of the evaluation batch next to its predicted sentiment
# (class 0 is reported as positive, anything else as negative).
for padded_tokens, predicted in zip(seq_pad, predictions):
  template = "{} : positive" if predicted.numpy() == 0 else "{} : negative"
  print(template.format(tokens_to_string(padded_tokens.numpy())))