In [1]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import tensorflow as tf

In [3]:
import numpy as np

In [4]:
class DataReader:
  """Loads an encoded classification dataset and serves fixed-size mini-batches.

  Every non-blank line of the input file holds four fields separated by the
  literal string '<fff>': label, doc_id, length and a whitespace-separated
  list of integer token ids. All lines are expected to carry the same number
  of token ids so that the documents stack into a 2-D array.

  Attributes:
    batch_size: number of examples returned by each `next_batch` call.
    data: int32 array of token ids, one row per document.
    labels: integer array of class labels, aligned with `data`.
    sentence_lengths: integer array of the `length` field, aligned with `data`.
    num_epoch: number of completed passes over the data.
    batch_id: index of the next batch inside the current epoch.
  """

  def __init__(self, data_path, batch_size, vocab_size):
    """Reads and parses the whole file at `data_path` into memory.

    Args:
      data_path: path of the encoded dataset file.
      batch_size: number of examples per batch.
      vocab_size: accepted for interface compatibility; not used by the reader.

    Raises:
      ValueError: if a field is not an integer or the rows have unequal
        numbers of token ids.
    """
    self.batch_size = batch_size
    with open(data_path) as f:
      d_lines = f.read().splitlines()

    data, labels, sentence_lengths = [], [], []
    for line in d_lines:
      if not line.strip():
        continue  # tolerate blank lines instead of failing on int('')
      features = line.split('<fff>')
      # doc_id is parsed (and thereby validated) but not stored.
      label, _doc_id, length = int(features[0]), int(features[1]), int(features[2])
      # Convert the ids to real integers; rebinding the loop variable, as the
      # previous code did, left the list full of strings.
      tokens = [int(token) for token in features[3].split()]
      data.append(tokens)
      labels.append(label)
      sentence_lengths.append(length)

    self.data = np.array(data, dtype=np.int32)
    self.labels = np.array(labels)
    self.sentence_lengths = np.array(sentence_lengths)

    self.num_epoch = 0
    self.batch_id = 0

  def next_batch(self):
    """Returns the next (data, labels, sentence_lengths) batch.

    A trailing group of fewer than `batch_size` examples is never served, so
    every batch has exactly `batch_size` rows whenever the dataset holds at
    least that many. When the returned batch is the last full one of the
    epoch, `num_epoch` is incremented, `batch_id` is reset to 0 and the
    dataset is reshuffled for the next epoch.
    """
    start = self.batch_id * self.batch_size
    end = start + self.batch_size
    self.batch_id += 1

    # Slice BEFORE any reshuffle: the batch must come from the ordering the
    # current epoch has been iterating over, otherwise some examples are
    # served twice and others are skipped.
    batch = (self.data[start:end],
             self.labels[start:end],
             self.sentence_lengths[start:end])

    if end + self.batch_size > len(self.data):
      # No further full batch fits: close the epoch and reshuffle.
      self.num_epoch += 1
      self.batch_id = 0
      indices = np.arange(len(self.data))
      # A private generator keeps the shuffle reproducible without resetting
      # the process-wide NumPy random state on every epoch.
      np.random.RandomState(2018).shuffle(indices)
      # Fancy indexing allocates new arrays, so `batch` (views of the old
      # arrays) is unaffected.
      self.data, self.labels = self.data[indices], self.labels[indices]
      self.sentence_lengths = self.sentence_lengths[indices]

    return batch


In [5]:
class RNN:
  """LSTM document classifier built with the TensorFlow 1.x graph API.

  The graph embeds token ids, runs a single LSTM layer over the time axis,
  mean-pools the LSTM outputs over the real (unpadded) positions of each
  document and applies one dense layer to obtain class logits.

  Attributes:
    data: int32 placeholder of token ids, shape [batch_size, MAX_DOC_LENGTHS].
    labels: int32 placeholder of class ids, shape [batch_size].
    sentence_lengths: int32 placeholder of document lengths, shape [batch_size].
    MAX_DOC_LENGTHS: number of time steps every document is padded to.
    num_classes: number of output classes.
  """

  def __init__(self, vocab_size, embedding_size, lstm_size, batch_size,
               max_doc_length=500, num_classes=20):
    """Stores the hyper-parameters and creates the input placeholders.

    Args:
      vocab_size: number of entries in the vocabulary file.
      embedding_size: dimensionality of the token embeddings.
      lstm_size: number of LSTM units.
      batch_size: fixed number of documents per batch.
      max_doc_length: padded document length (defaults to the former
        hard-coded value, 500).
      num_classes: number of target classes (defaults to the former
        hard-coded value, 20).
    """
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.lstm_size = lstm_size
    self.batch_size = batch_size
    self.num_classes = num_classes

    self.MAX_DOC_LENGTHS = max_doc_length
    self.data = tf.placeholder(tf.int32, shape=[batch_size, self.MAX_DOC_LENGTHS])
    self.labels = tf.placeholder(tf.int32, shape=[batch_size,])
    self.sentence_lengths = tf.placeholder(tf.int32, shape=[batch_size,])

  def build_graph(self):
    """Builds the forward pass and the loss.

    Returns:
      A pair (predicted_labels, loss): the arg-max class id per document and
      the mean cross-entropy over the batch.
    """
    embeddings = self.embedding_layer(self.data)
    lstm_outputs = self.LSTM_layer(embeddings)

    weights = tf.get_variable(name='final_layer_weights',
                              shape=(self.lstm_size, self.num_classes),
                              initializer=tf.random_normal_initializer(seed=2018))
    biases = tf.get_variable(name='final_layer_biases',
                             shape=(self.num_classes,),
                             initializer=tf.random_normal_initializer(seed=2018))
    logits = tf.matmul(lstm_outputs, weights) + biases

    # The sparse variant takes the integer class ids directly; it computes the
    # same per-example cross-entropy as one-hot labels fed to the deprecated
    # softmax_cross_entropy_with_logits.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                          logits=logits)
    loss = tf.reduce_mean(loss)

    # Softmax is monotonic, so the arg-max can be taken on the raw logits.
    predicted_labels = tf.argmax(logits, axis=1)
    # Kept from the original graph: a no-op for batch_size > 1, yields a
    # scalar when batch_size == 1.
    predicted_labels = tf.squeeze(predicted_labels)

    return predicted_labels, loss

  def embedding_layer(self, indices):
    """Creates the trainable embedding matrix and looks up `indices` in it.

    The matrix has vocab_size + 2 rows. Row 0 is initialised to zeros
    (presumably the padding id -- confirm against the encoder); the remaining
    rows are drawn from a standard normal distribution.
    """
    # A private generator gives a reproducible initialisation without
    # resetting the process-wide NumPy random state.
    rng = np.random.RandomState(2018)
    random_rows = rng.normal(loc=0, scale=1.,
                             size=(self.vocab_size + 1, self.embedding_size))
    pretrained_vectors = np.vstack([np.zeros((1, self.embedding_size)),
                                    random_rows])

    self.embedding_matrix = tf.get_variable(name='embedding',
                                            shape=(self.vocab_size+2, self.embedding_size),
                                            initializer=tf.constant_initializer(pretrained_vectors))

    return tf.nn.embedding_lookup(self.embedding_matrix, indices)

  def LSTM_layer(self, embeddings):
    """Runs the LSTM over `embeddings` and mean-pools its outputs over time.

    Args:
      embeddings: float tensor of shape
        [batch_size, MAX_DOC_LENGTHS, embedding_size].

    Returns:
      Float tensor of shape [batch_size, lstm_size]: for each document, the
      average LSTM output over its first `sentence_lengths` time steps.
    """
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.lstm_size)
    initial_state = lstm_cell.zero_state(self.batch_size, tf.float32)

    # static_rnn expects a list with one [batch_size, embedding_size] tensor
    # per time step.
    lstm_inputs = tf.unstack(embeddings, axis=1)
    lstm_outputs, _ = tf.nn.static_rnn(cell=lstm_cell,
                                       inputs=lstm_inputs,
                                       initial_state=initial_state,
                                       sequence_length=self.sentence_lengths)
    # Back to a single [batch_size, MAX_DOC_LENGTHS, lstm_size] tensor.
    lstm_outputs = tf.stack(lstm_outputs, axis=1)

    # 1.0 on real positions and 0.0 on padding, broadcast over the unit axis.
    mask = tf.sequence_mask(lengths=self.sentence_lengths,
                            maxlen=self.MAX_DOC_LENGTHS,
                            dtype=tf.float32)
    mask = tf.expand_dims(mask, -1)

    lstm_outputs_sum = tf.reduce_sum(mask * lstm_outputs, axis=1)
    # Clamp to 1 so an empty document gives a zero vector instead of NaN.
    lengths = tf.maximum(tf.cast(self.sentence_lengths, tf.float32), 1.)
    lstm_outputs_average = lstm_outputs_sum / tf.expand_dims(lengths, -1)

    return lstm_outputs_average

  def trainer(self, loss, learning_rate):
    """Returns an op that performs one Adam update step minimising `loss`."""
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return train_op

# Training configuration shared by the model and both data readers.
BATCH_SIZE = 50
MAX_STEP = 5000

# The vocabulary size is the number of lines in the vocabulary file.
with open('vocab-raw.txt', 'rb') as f:
  vocab_size = len(f.read().splitlines())

tf.set_random_seed(2018)
rnn = RNN(vocab_size=vocab_size,
          embedding_size=50,
          lstm_size=50,
          batch_size=BATCH_SIZE)

predicted_labels, loss = rnn.build_graph()
train_op = rnn.trainer(loss=loss, learning_rate=0.1)

with tf.Session() as sess:
  train_data_reader = DataReader(data_path='20news-train-encoded-v2.txt',
                                 batch_size=BATCH_SIZE, vocab_size=vocab_size)
  test_data_reader = DataReader(data_path='20news-test-encoded-v2.txt',
                                batch_size=BATCH_SIZE, vocab_size=vocab_size)

  sess.run(tf.global_variables_initializer())

  for step in range(1, MAX_STEP + 1):
    train_data, train_labels, train_sentence_lengths = train_data_reader.next_batch()
    loss_eval, _ = sess.run([loss, train_op],
                            feed_dict={
                                rnn.data: train_data,
                                rnn.labels: train_labels,
                                rnn.sentence_lengths: train_sentence_lengths
                            })
    if step % 20 == 0:
      print("loss:", loss_eval)

    # The reader resets batch_id to 0 when it has just served the last batch
    # of an epoch, so this evaluates on the test set once per training epoch.
    if train_data_reader.batch_id == 0:
      num_true_preds = 0
      num_evaluated = 0
      while True:
        test_data, test_labels, test_sentence_lengths = test_data_reader.next_batch()

        test_plabels_eval = sess.run(predicted_labels,
                                     feed_dict={
                                         rnn.data: test_data,
                                         rnn.labels: test_labels,
                                         rnn.sentence_lengths: test_sentence_lengths
                                     })
        num_true_preds += int(np.sum(np.equal(test_plabels_eval, test_labels)))
        num_evaluated += len(test_labels)

        if test_data_reader.batch_id == 0:
          break
      print('Epoch:', train_data_reader.num_epoch)
      # Divide by the number of examples actually scored: the reader drops the
      # trailing partial batch, so len(test_data_reader.data) would overcount.
      print('Accuracy on test data:', num_true_preds * 100. / num_evaluated)


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels inpu