In [None]:
%tensorflow_version 1.x
import tensorflow as tf

from collections import defaultdict


from os import listdir
from os.path import isfile
import re
import numpy as np
import random

TensorFlow 1.x selected.


In [None]:
# Every encoded document is truncated or padded to exactly this many tokens
# (see encode_data); it is also the time dimension of the model input.
MAX_DOC_LENGTH = 500
# Width of the final classification layer; expected to equal the number of
# newsgroup directories, since labels are the index of the sorted group name.
NUM_CLASSES = 20
# Token ID written for words that are not in the vocabulary file.
unknown_ID = 1
# Token ID appended to documents shorter than MAX_DOC_LENGTH.
padding_ID = 0

# Build data files

In [None]:
def gen_data_and_vocab(data_root = '../datasets/20news-bydate/', min_word_freq = 10):
  """Build the raw train/test text files and the vocabulary file.

  The directory `data_root` must contain exactly one sub-directory whose name
  contains 'train' and exactly one whose name contains 'test'. Each of those
  holds one sub-directory per newsgroup, which in turn holds one file per
  document. Three files are written to `<data_root>/w2v/`:

    * vocab-raw.txt        - sorted words seen more than `min_word_freq`
                             times in the training split, one per line
    * 20news-train-raw.txt - one line per training document
    * 20news-test-raw.txt  - one line per test document

  Each document line has the form `label<fff>filename<fff>content`, where
  `label` is the index of the newsgroup in the sorted newsgroup list and
  `content` is the lower-cased text with every run of non-word characters
  replaced by a single space.

  Args:
    data_root: directory that holds the train and test splits.
    min_word_freq: a word is kept in the vocabulary only if its training
      frequency is strictly greater than this value.

  Raises:
    ValueError: if the train or test directory cannot be identified uniquely.
  """
  # Local imports: the notebook's import cell only exposes listdir and isfile.
  from os import makedirs
  from os.path import isdir, join

  def colect_data_from(parent_path,newsgroup_list,word_count = None):
    """Read every document below `parent_path`; optionally count its words."""
    data = []
    for group_id, newsgroup in enumerate(newsgroup_list):
      label = group_id
      dir_path = join(parent_path, newsgroup)
      files = [(filename, join(dir_path, filename))
               for filename in listdir(dir_path)
               if isfile(join(dir_path, filename))]
      files.sort()
      print('Processing: {}-{}'.format(group_id,newsgroup))
      for filename, filepath in files:
        with open(filepath,encoding = 'ISO-8859-1') as f:
          text = f.read().lower()
        # re.split yields '' when the text starts or ends with a separator;
        # dropping those keeps the empty string out of the vocabulary.
        words = [word for word in re.split(r'\W+',text) if word]
        if word_count is not None:
          for word in words:
            word_count[word] += 1
        content = ' '.join(words)
        # One document must stay on one line of the output file.
        assert len(content.splitlines()) <= 1
        data.append(str(label) + '<fff>' + filename + '<fff>' + content)
    return data

  def find_split_dir(keyword):
    """Return the single sub-directory of data_root whose name has `keyword`."""
    matches = sorted(name for name in listdir(data_root)
                     if isdir(join(data_root, name)) and keyword in name)
    if len(matches) != 1:
      raise ValueError(
          'Expected exactly one "{}" directory in {}, found {}'.format(
              keyword, data_root, matches))
    return join(data_root, matches[0])

  word_count = defaultdict(int)

  # Select the splits by name rather than by listdir position: listdir order
  # is arbitrary and the output directory lives next to the split directories.
  train_path = find_split_dir('train')
  test_path = find_split_dir('test')
  newsgroup_list = sorted(newsgroup for newsgroup in listdir(train_path)
                          if isdir(join(train_path, newsgroup)))

  output_dir = join(data_root, 'w2v')
  makedirs(output_dir, exist_ok = True)

  # The vocabulary is built from the training split only.
  train_data = colect_data_from(parent_path = train_path,newsgroup_list = newsgroup_list, word_count = word_count)
  vocab = sorted(word for word, freq in word_count.items() if freq > min_word_freq)
  with open(join(output_dir, 'vocab-raw.txt'),'w') as f:
    f.write('\n'.join(vocab))

  test_data = colect_data_from(parent_path = test_path,newsgroup_list = newsgroup_list)
  with open(join(output_dir, '20news-train-raw.txt'),'w') as f:
    f.write('\n'.join(train_data))

  # Write the test documents here (the training documents were written above).
  with open(join(output_dir, '20news-test-raw.txt'),'w') as f:
    f.write('\n'.join(test_data))
  
# Generate the raw train/test text files and the vocabulary file on disk.
gen_data_and_vocab()

In [None]:
def encode_data(data_path,vocab_path):
  """Convert a raw `label<fff>doc_id<fff>text` file into integer token IDs.

  Vocabulary words are numbered from 2 in file order. Each document keeps at
  most MAX_DOC_LENGTH whitespace-separated words; words missing from the
  vocabulary become unknown_ID and short documents are filled up with
  padding_ID. The result is written next to `data_path`, with the part of the
  file name after its last '-' replaced by 'encoded.txt', one line per
  document: `label<fff>doc_id<fff>kept_word_count<fff>space-separated IDs`.
  """
  with open(vocab_path) as vocab_file:
    word_to_id = {
        word: position + 2
        for position, word in enumerate(vocab_file.read().splitlines())
    }

  with open(data_path) as data_file:
    raw_lines = data_file.read().splitlines()

  # Parse every line up front so a malformed line fails before any encoding.
  documents = []
  for raw_line in raw_lines:
    fields = raw_line.split('<fff>')
    documents.append((fields[0], fields[1], fields[2]))

  encoded_lines = []
  for label, doc_id, text in documents:
    kept_words = text.split()[:MAX_DOC_LENGTH]
    token_ids = [
        str(word_to_id[word]) if word in word_to_id else str(unknown_ID)
        for word in kept_words
    ]
    # A non-positive repeat count yields an empty list, so full-length
    # documents receive no padding.
    token_ids.extend([str(padding_ID)] * (MAX_DOC_LENGTH - len(kept_words)))
    encoded_lines.append('<fff>'.join(
        [str(label), str(doc_id), str(len(kept_words)), ' '.join(token_ids)]))

  path_parts = data_path.split('/')
  dir_name = '/'.join(path_parts[:-1])
  name_stem = '-'.join(path_parts[-1].split('-')[:-1])
  file_name = name_stem + '-encoded.txt'
  with open(dir_name + '/' + file_name, 'w') as out_file:
    out_file.write('\n'.join(encoded_lines))
  

In [None]:
# Encode both splits against the same vocabulary file.
vocab_file_path = '../datasets/20news-bydate/w2v/vocab-raw.txt'
for raw_file_path in ('../datasets/20news-bydate/w2v/20news-train-raw.txt',
                      '../datasets/20news-bydate/w2v/20news-test-raw.txt'):
  encode_data(raw_file_path, vocab_file_path)

# RNN class

In [None]:
class RNN:
  """LSTM document classifier built with the TensorFlow 1.x graph API.

  Pipeline: token IDs -> embedding lookup -> statically unrolled LSTM ->
  length-masked mean of the LSTM outputs -> linear layer over NUM_CLASSES.
  """

  def __init__(self,vocab_size,embedding_size, lstm_size,batch_size):
    """Store the hyper-parameters and create the input placeholders.

    Args:
      vocab_size: number of words in the vocabulary file (IDs 0 and 1 are
        reserved for padding and unknown words, so the embedding matrix has
        vocab_size + 2 rows).
      embedding_size: width of each embedding vector.
      lstm_size: number of units of the LSTM cell.
      batch_size: fixed batch size baked into every placeholder shape.
    """
    self._vocab_size = vocab_size
    self._embedding_size = embedding_size
    self._lstm_size = lstm_size
    self._batch_size = batch_size

    self._data = tf.placeholder(tf.int32,shape = [batch_size,MAX_DOC_LENGTH])
    self._labels = tf.placeholder(tf.int32,shape = [batch_size,])
    self._sentence_lengths = tf.placeholder(tf.int32,shape = [batch_size,])
    # Not consumed by any op built in this class; kept so existing feed
    # dictionaries that supply it remain valid.
    self._final_tokens = tf.placeholder(tf.int32,shape =  [batch_size,])

  def embedding_layer(self,indices):
    """Look up `indices` in a trainable embedding matrix.

    Row 0 starts as all zeros; the remaining vocab_size + 1 rows start as
    standard-normal draws from a generator seeded with 2021.
    """
    # A private generator gives the same draws as seeding the global NumPy
    # generator with 2021, but leaves the global random state untouched.
    rng = np.random.RandomState(2021)

    pretrained_vectors = [np.zeros(self._embedding_size)]
    for _ in range(self._vocab_size + 1):
      pretrained_vectors.append(rng.normal(loc = 0., scale = 1., size = self._embedding_size))
    pretrained_vectors = np.array(pretrained_vectors)

    with tf.variable_scope('embedding', reuse = tf.AUTO_REUSE):
      self._embedding_matrix = tf.get_variable(
            name = 'embedding',
            shape = (self._vocab_size + 2, self._embedding_size),
            initializer = tf.constant_initializer(pretrained_vectors)
      )

    return tf.nn.embedding_lookup(self._embedding_matrix, indices)

  def LSTM_layer(self,embeddings):
    """Run the LSTM and return the mean output over each document's real tokens.

    Args:
      embeddings: tensor of shape [batch_size, MAX_DOC_LENGTH, embedding_size].

    Returns:
      Tensor of shape [batch_size, lstm_size].
    """
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(self._lstm_size)
    zero_state = tf.zeros(shape = (self._batch_size, self._lstm_size))
    initial_state = tf.contrib.rnn.LSTMStateTuple(zero_state, zero_state)

    # static_rnn expects a list with one [batch_size, embedding_size] tensor
    # per time step.
    lstm_inputs = tf.unstack(tf.transpose(embeddings, perm = [1, 0, 2]))
    with tf.variable_scope('lstm', reuse = tf.AUTO_REUSE):
      lstm_outputs, last_state = tf.nn.static_rnn (
            cell = lstm_cell,
            inputs = lstm_inputs,
            initial_state = initial_state,
            sequence_length = self._sentence_lengths
      )

    # Per-step outputs -> [batch_size, MAX_DOC_LENGTH, lstm_size].
    lstm_outputs = tf.stack(lstm_outputs, axis = 1)

    # 1.0 for real tokens, 0.0 for padding positions.
    mask = tf.sequence_mask(
        lengths = self._sentence_lengths,
        maxlen = MAX_DOC_LENGTH,
        dtype = tf.float32
    )
    lstm_outputs_sum = tf.reduce_sum(lstm_outputs * tf.expand_dims(mask, -1), axis = 1)

    # Clamp to 1 so a zero-length document yields a zero vector instead of
    # NaN from a division by zero.
    safe_lengths = tf.maximum(tf.cast(self._sentence_lengths, tf.float32), 1.)
    lstm_outputs_average = lstm_outputs_sum / tf.expand_dims(safe_lengths, -1)

    return lstm_outputs_average

  def build_graph(self):
    """Assemble the full model.

    Returns:
      (predicted_labels, loss): the arg-max class per document and the mean
      softmax cross-entropy over the batch.
    """
    embeddings = self.embedding_layer(self._data)
    lstm_outputs = self.LSTM_layer(embeddings)
    with tf.variable_scope('final_layer_weights', reuse = tf.AUTO_REUSE):
      weights = tf.get_variable(
          name = 'final_layer_weights',
          shape = (self._lstm_size, NUM_CLASSES),
          initializer = tf.random_normal_initializer(seed = 2021)
      )
    with tf.variable_scope('final_layer_biases', reuse = tf.AUTO_REUSE):
      biases = tf.get_variable(
            name = 'final_layer_biases',
            shape = (NUM_CLASSES,),
            initializer = tf.random_normal_initializer(seed = 2021)
      )
    logits = tf.matmul(lstm_outputs, weights) + biases

    labels_one_hot = tf.one_hot(
        indices = self._labels,
        depth = NUM_CLASSES,
        dtype = tf.float32
    )

    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels = labels_one_hot,
        logits = logits
    )

    loss = tf.reduce_mean(loss)

    probs = tf.nn.softmax(logits)
    predicted_labels = tf.argmax(probs, axis = 1)
    predicted_labels = tf.squeeze(predicted_labels)

    return predicted_labels, loss

  def trainer(self,loss,learning_rate):
    """Return an Adam training op that minimises `loss`."""
    with tf.variable_scope('optimizer', reuse = tf.AUTO_REUSE):
      train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
      return train_op
  

In [None]:

# The vocabulary size is the number of lines in the vocabulary file.
with open('../datasets/20news-bydate/w2v/vocab-raw.txt',encoding = 'ISO-8859-1') as f:
  vocab_lines = f.read().splitlines()
vocab_size = len(vocab_lines)

# Seed TensorFlow's graph-level generator before any variable is created.
tf.random.set_random_seed(2021)

rnn = RNN(vocab_size = vocab_size, embedding_size = 300, lstm_size = 50, batch_size = 50)

# Build the forward pass first, then attach the optimizer to its loss.
graph_outputs = rnn.build_graph()
predicted_labels, loss = graph_outputs
train_op = rnn.trainer(loss = loss, learning_rate = 0.01)


Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [None]:
class DataReader:
  """Load an encoded data file into memory and serve fixed-size batches.

  Each non-blank line of the file must have the form
  `label<fff>doc_id<fff>length<fff>space-separated token IDs`.
  """

  def __init__(self,data_path,batch_size):
    """Parse `data_path` into NumPy arrays.

    Args:
      data_path: path of the encoded file to read.
      batch_size: number of rows returned by each next_batch() call.
    """
    self._batch_size = batch_size
    with open(data_path) as f:
      d_lines = f.read().splitlines()

    data, labels, sentence_lengths, final_tokens = [], [], [], []
    for line in d_lines:
      if not line.strip():
        # Tolerate blank lines such as a trailing newline.
        continue
      features = line.split('<fff>')
      # features[1] is the document ID; it is not needed for batching.
      label, length = int(features[0]), int(features[2])
      tokens = [int(tmp) for tmp in features[3].split()]

      data.append(tokens)
      labels.append(label)
      sentence_lengths.append(length)
      # Token at the last real (non-padding) position of the document.
      final_tokens.append(tokens[length - 1])

    self._data = np.array(data)
    self._labels = np.array(labels)
    self._sentence_lengths = np.array(sentence_lengths)
    self._final_tokens = np.array(final_tokens)

    self._num_epoch = 0
    self._current_part = 0
    # Private generator: shuffling is reproducible without reseeding the
    # global `random` state on every epoch.
    self._rng = random.Random(2021)

  def next_batch(self):
    """Return the next (data, labels, sentence_lengths, final_tokens) batch.

    When the batch after this one would run past the end of the data, the
    epoch is closed: `_num_epoch` is incremented, `_current_part` is reset to
    0 and all arrays are shuffled with one shared permutation so rows stay
    aligned. Rows after the last full batch are not served in that epoch.
    """
    start = self._current_part * self._batch_size
    end = start + self._batch_size
    self._current_part += 1

    # Slice before shuffling so this batch still comes from the current order.
    batch = (self._data[start:end], self._labels[start:end],
             self._sentence_lengths[start:end], self._final_tokens[start:end])

    if end + self._batch_size > len(self._data):
      self._num_epoch += 1
      self._current_part = 0
      # shuffle() works in place, so it needs a real list that is kept and
      # then used as the index (shuffling a temporary copy has no effect).
      indices = list(range(len(self._data)))
      self._rng.shuffle(indices)
      self._data = self._data[indices]
      self._labels = self._labels[indices]
      self._sentence_lengths = self._sentence_lengths[indices]
      self._final_tokens = self._final_tokens[indices]

    return batch

In [None]:
# Train for MAX_STEP mini-batches; each time the training reader finishes an
# epoch, run one full pass over the test reader and report accuracy.
with tf.Session() as sess:
  # The placeholders have a fixed batch dimension, so both readers must use
  # the model's batch size.
  train_data_reader = DataReader(
      data_path = '../datasets/20news-bydate/w2v/20news-train-encoded.txt',
      batch_size = rnn._batch_size
  )
  test_data_reader = DataReader(
      data_path = '../datasets/20news-bydate/w2v/20news-test-encoded.txt',
      batch_size = rnn._batch_size
  )
  step = 0
  MAX_STEP = 10000

  sess.run(tf.global_variables_initializer())

  while step < MAX_STEP:

    next_train_batch =  train_data_reader.next_batch()

    train_data, train_labels, train_sentence_lengths, train_final_token = next_train_batch
    plabels_eval, loss_eval,_ = sess.run(
        [predicted_labels, loss, train_op],
        feed_dict = {
            rnn._data : train_data,
            rnn._labels: train_labels,
            rnn._sentence_lengths:train_sentence_lengths,
            rnn._final_tokens : train_final_token

        }
    )

    step += 1
    if step % 20 == 0:
      print('step: ' + str(step) +' - loss: ', str(loss_eval))

    # The reader resets _current_part to 0 when it closes an epoch.
    if train_data_reader._current_part == 0:
      num_true_preds = 0
      num_evaluated = 0
      while True:
        next_test_batch = test_data_reader.next_batch()
        test_data, test_labels, test_sentence_lengths, test_final_tokens = next_test_batch

        test_plabels_eval = sess.run(
            predicted_labels,
            feed_dict = {
                rnn._data: test_data,
                rnn._labels: test_labels,
                rnn._sentence_lengths: test_sentence_lengths,
                rnn._final_tokens: test_final_tokens
            }
        )
        matches = np.equal(test_plabels_eval, test_labels)
        num_true_preds += np.sum(matches.astype(float))
        num_evaluated += len(test_labels)

        if test_data_reader._current_part == 0:
          break

      print('Epoch: ', train_data_reader._num_epoch)
      # Divide by the number of documents actually scored: the reader does not
      # serve rows left over after the last full batch, so dividing by the
      # whole test-set size would under-report accuracy.
      print('Accuracy on test data: ', num_true_preds * 100. / num_evaluated)

step: 20 - loss:  0.0012361495
step: 40 - loss:  0.49003345
step: 60 - loss:  5.757242
step: 80 - loss:  0.86787415
step: 100 - loss:  3.892943
step: 120 - loss:  5.416399
step: 140 - loss:  3.0256412
step: 160 - loss:  3.6123781
step: 180 - loss:  3.6552386
step: 200 - loss:  9.024841
step: 220 - loss:  3.508162
Epoch:  1
Accuracy on test data:  5.639031288668906
step: 240 - loss:  3.8007042
step: 260 - loss:  2.9527574
step: 280 - loss:  2.6138244
step: 300 - loss:  2.9545527
step: 320 - loss:  3.2980201
step: 340 - loss:  1.8885423
step: 360 - loss:  3.9621894
step: 380 - loss:  2.4411442
step: 400 - loss:  1.3076682
step: 420 - loss:  2.404712
step: 440 - loss:  2.4470227
Epoch:  2
Accuracy on test data:  14.954923104118791
step: 460 - loss:  2.2466848
step: 480 - loss:  2.2890556
step: 500 - loss:  2.504161
step: 520 - loss:  1.3569804
step: 540 - loss:  2.1602623
step: 560 - loss:  2.0577395
step: 580 - loss:  1.4307588
step: 600 - loss:  2.4023309
step: 620 - loss:  2.932227
ste