In [1]:
%load_ext autoreload 
%autoreload 2

import numpy as np
import tensorflow as tf

from lstm_examples import util

# Seed NumPy's global RNG so the np.random.permutation shuffle further down is reproducible
np.random.seed(1701)

# Loading data

In [2]:
# Load data from CSV file
# Number of target classes; sizes the output layer (W, b) built later in the notebook
n_classes = 4
# util.read_data is a project helper; presumably returns the raw texts and an
# array of integer class labels (y is later indexed with an index array) — TODO confirm
text, y = util.read_data('data/train.csv')

In [3]:
# Shuffle sentences
# A single random permutation reorders texts and labels together, so each
# text keeps its own label.
shuffle_idx = np.random.permutation(len(text))
text = [text[idx] for idx in shuffle_idx]
y = y[shuffle_idx]

# Train and test split (small subsets only, for speed)
n_train, n_test = 1000, 500
train_rows = slice(None, n_train)
test_rows = slice(n_train, n_train + n_test)
text_train, text_test = text[train_rows], text[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [4]:
# Sanity check: show the first training text (after shuffling) and its class label
print(text_train[0])
print("\nClass: ", y_train[0])

AFP - How to create sustainable employment for impoverished, mostly illiterate African populations residing primarily in rural areas is the challenge the African Union aims to tackle at a summit opening Wednesday in Burkina Faso.

Class:  0


# Text processing with spaCy

In [5]:
import spacy

# Load spaCy's English pipeline through the 'en' shortcut name
# NOTE(review): newer spaCy releases dropped shortcut names and expect the full
# package name (e.g. 'en_core_web_sm') — confirm against the installed version
nlp = spacy.load('en')

In [6]:
# Parse all text
# Run the spaCy pipeline over every raw string; each result is a parsed
# document that is later walked sentence by sentence.
text_train_parsed = list(map(nlp, text_train))
text_test_parsed = list(map(nlp, text_test))

In [7]:
# Convert text to integer symbols
# Shared word <-> integer-id table (project helper). Training text is encoded
# through its lookup_add method and test text through lookup (see preprocess_text).
symbol_table = util.SymbolTable()

def preprocess_text(parsed_text, symbol_table, init=True):
    """Encode parsed documents as lists of integer symbols.

    Every document yields one flat list: its sentences (``doc.sents``) are
    walked in order and each token's text is stripped of surrounding
    whitespace, lower-cased and handed to the symbol table.

    Args:
        parsed_text: iterable of parsed documents exposing ``.sents``, where
            each sentence iterates over tokens with a ``.text`` attribute.
        symbol_table: object providing ``lookup_add`` and ``lookup``.
        init: if true, tokens are mapped with ``symbol_table.lookup_add``;
            otherwise with ``symbol_table.lookup``.

    Returns:
        A list with one list of mapped symbols per document.
    """
    if init:
        to_symbol = symbol_table.lookup_add
    else:
        to_symbol = symbol_table.lookup

    encoded = []
    for doc in parsed_text:
        doc_symbols = []
        for sentence in doc.sents:
            for token in sentence:
                doc_symbols.append(to_symbol(token.text.strip().lower()))
        encoded.append(doc_symbols)
    return encoded

In [8]:
# Training text is encoded with lookup_add (init=True); test text only uses
# lookup (init=False) on the same table.
symbols_train = preprocess_text(text_train_parsed, symbol_table, init=True)
symbols_test = preprocess_text(text_test_parsed, symbol_table, init=False)

In [9]:
# Sanity check: first 15 tokens of one parsed training text, the matching
# integer symbols, and its class label
print(text_train_parsed[100][:15])
print(symbols_train[100][:15])
print("\nClass: ", y_train[100])

Congress is now poised to hand President Bush an election-year tax cut victory
[1541, 21, 264, 1454, 5, 1542, 150, 882, 555, 1543, 3, 683, 1544, 1277, 1209]

Class:  2


# Converting to matrix format

In [10]:
# Convert symbol lists to a matrix X
# Text has a fixed maximum length (50 tokens)

MAX_SENTENCE_LENGTH = 50

def symbols_to_matrix(symbol_lists, max_sentence_length=MAX_SENTENCE_LENGTH):
    """Pack variable-length symbol lists into a zero-padded integer matrix.

    Args:
        symbol_lists: sequence of integer-symbol lists, one per text.
        max_sentence_length: number of columns; longer lists are truncated,
            shorter ones are right-padded with 0.

    Returns:
        A pair ``(matrix, lengths)``: ``matrix`` has shape
        ``(len(symbol_lists), max_sentence_length)`` and ``lengths[j]`` is the
        number of symbols actually stored in row ``j``.
    """
    n_rows = len(symbol_lists)
    padded = np.zeros((n_rows, max_sentence_length), dtype=int)
    lengths = np.zeros(n_rows, dtype=int)
    for row, symbols in enumerate(symbol_lists):
        kept = symbols[:max_sentence_length]
        lengths[row] = len(kept)
        padded[row, :len(kept)] = kept
    return padded, lengths

In [11]:
# Example: pad two training texts and display the resulting (matrix, lengths) pair
symbols_to_matrix(symbols_train[60:62])

(array([[1045, 1046,   77,   79, 1047,  555, 1048,   83,   27, 1049,  202,
         1050,   69, 1051,    3, 1052,    3, 1053,  211, 1054,   28, 1055,
         1056,    9,   22,   27, 1049,   41,   34,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0],
        [  22, 1057,   44,   28,  669,    3,   39, 1058, 1059,  186, 1060,
          744,   11,   42,   22, 1061,   44,  436,   21,   28, 1062,   41,
           34,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]]), array([29, 23]))

# Building the graph

In [12]:
# Inputs
# Padded symbol ids, one row per text; the batch dimension is left unspecified
X = tf.placeholder(tf.int32, (None, MAX_SENTENCE_LENGTH))
# Number of real (unpadded) symbols in each row of X
X_len = tf.placeholder(tf.int32, (None,))
# Integer class label per text
# NOTE(review): this rebinds `y`, which until here held the label array loaded
# from the CSV; the shuffle/split cell cannot be re-run after this cell without
# reloading the data first.
y = tf.placeholder(tf.int32, (None,))

In [13]:
# Create embedding table (one row for each word)
# Embedding size; the LSTM cell and the linear layer built afterwards reuse d
# as their feature size
d = 100

# Create zero row for 0 symbol
# Symbol 0 is the value symbols_to_matrix pads with, so row 0 of U is a fixed
# all-zero constant (not a Variable) and is not updated by training.
with tf.variable_scope('embeddings'):
    zero  = tf.constant(0.0, dtype=tf.float32, shape=(1, d))
    # Trainable rows, looked up for symbol ids 1 .. num_words() + 1
    # NOTE(review): the extra "+ 1" row presumably covers an id beyond the known
    # words (e.g. unknown tokens) — confirm against util.SymbolTable
    embed = tf.Variable(tf.random_normal(
        (symbol_table.num_words() + 1, d), stddev=0.1, seed=1701
    ))
    # Full lookup table: the zero row stacked on top of the trainable rows
    U = tf.concat([zero, embed], axis=0, name='embedding_matrix')

In [14]:
# Get out rows from U for each word in each sentence
# Dimensions are [batch_size x MAX_SENTENCE_LENGTH x embedding_dimension]
#                [   ???     x        50           x       100         ]
# Padding positions (symbol 0) pick up the all-zero first row of U

word_feats = tf.nn.embedding_lookup(U, X)

In [15]:
# Build RNN

# Dynamic batch size, read from X when the graph runs
# NOTE: the training cell later rebinds the name `batch_size` to a Python int
# (the minibatch size); this cell rebinds it to the tensor again when re-run.
batch_size = tf.shape(X)[0]

with tf.variable_scope("LSTM"):
    # Graph-level seed for the random ops created from here on
    tf.set_random_seed(1701)
    # LSTM cell architecture
    # Hidden size equals the embedding size d
    cell = tf.contrib.rnn.BasicLSTMCell(d)
    # Set RNN
    # Every sequence starts from an all-zero state
    initial_state = cell.zero_state(batch_size, tf.float32)
    # sequence_length tells the RNN how many steps of each padded row are real;
    # only the per-step outputs are kept, the final state is discarded.
    rnn_out, _ = tf.nn.dynamic_rnn(
        cell,
        word_feats,
        sequence_length=X_len,
        initial_state=initial_state,
        time_major=False               # inputs are batch-major: [batch, time, features]
    )

In [17]:
# Linear layer

# The LSTM produced new features for us in rnn_out
# There is one output per word, and we used the same output dimension as the embedding dimension
# Dimensions are [batch_size x MAX_SENTENCE_LENGTH x embedding_dimension]
#                [   ???     x        50           x       100         ]
# Let's pull out just the output at the last real (unpadded) word to describe the whole sentence

def get_rnn_output(output, dim, lengths):
    """Get last output of RNN

    Picks, for every sequence in the batch, the output vector at its last
    valid time step (``lengths - 1``).

    Args:
        output: RNN outputs indexed as [batch, time, dim] (batch-major).
        dim: size of the last axis of ``output``.
        lengths: integer tensor holding one sequence length per batch row.

    Returns:
        Tensor of shape [batch, dim] with one output vector per sequence.

    A length of 0 is clamped to time step 0. Without the clamp the flat index
    ``row * max_length - 1`` is -1 for the first row (out of range for
    ``tf.gather``) and, for every later row, addresses the last time step of
    the previous row, i.e. another example's output.
    """
    batch_size = tf.shape(output)[0]
    max_length = tf.shape(output)[1]
    # Last valid step per row; never negative, even for empty sequences
    last_step = tf.maximum(lengths - 1, 0)
    # With outputs viewed as [batch * time, dim], element (row, step) sits at
    # flat position row * max_length + step
    index = tf.range(0, batch_size) * max_length + last_step
    flat = tf.reshape(output, [-1, dim])
    h = tf.gather(flat, index)
    return h

# Sentence representation: the LSTM output at each text's last real word
final_feats = get_rnn_output(rnn_out, d, X_len)

with tf.variable_scope("linear"):
    # Affine map from the d-dimensional sentence features to one score per class
    W = tf.Variable(tf.random_normal((d, n_classes), stddev=0.1, seed=1701))
    b = tf.Variable(tf.zeros((n_classes,)))
    # Unnormalised class scores (logits), one row per text
    h = tf.nn.xw_plus_b(final_feats, W, b)

In [18]:
# Output quantities

# Probabilistic predictions: softmax over the logits, one row per text
predictions = tf.nn.softmax(h)
# Hard predictions: index of the largest logit for each text.
# argmax over axis 1 already yields a 1-D [batch_size] tensor. It is not
# wrapped in tf.squeeze: that would be a no-op for larger batches and would
# collapse a single-example batch to a scalar.
predictions_hard = tf.argmax(h, axis=1)

# Loss (mean cross-entropy over the batch) and Adam update step
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h, labels=y)
)
train_op = tf.train.AdamOptimizer(0.0005).minimize(loss)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# Train!

In [19]:
n_epochs = 20
batch_size = 32

session = tf.Session()
session.run(tf.global_variables_initializer())

X_train_all, X_train_len_all = symbols_to_matrix(symbols_train)
X_test_all, X_test_len_all = symbols_to_matrix(symbols_test)

for t in range(n_epochs):
    # Print status
    pred_train = session.run(predictions_hard, {X: X_train_all, X_len: X_train_len_all})
    train_acc = np.mean(pred_train == y_train)
    pred_test = session.run(predictions_hard, {X: X_test_all, X_len: X_test_len_all})
    test_acc = np.mean(pred_test == y_test)
    print("Epoch {0:<4}\tTrain acc={1:.1f}%\t\tTest acc={2:.1f}%".format(t, 100*train_acc, 100*test_acc))
    # Run batches
    for i in range(0, len(symbols_train), batch_size):
        X_batch, X_len_batch = symbols_to_matrix(symbols_train[i : i+batch_size])
        y_batch = y_train[i : i+batch_size]
        loss_batch = session.run([loss, train_op], {X: X_batch, X_len: X_len_batch, y: y_batch})[0]

Epoch 0   	Train acc=23.1%		Test acc=22.0%
Epoch 1   	Train acc=42.5%		Test acc=30.8%
Epoch 2   	Train acc=60.0%		Test acc=45.2%
Epoch 3   	Train acc=65.6%		Test acc=50.8%
Epoch 4   	Train acc=79.9%		Test acc=59.0%
Epoch 5   	Train acc=95.2%		Test acc=63.2%
Epoch 6   	Train acc=81.8%		Test acc=48.8%
Epoch 7   	Train acc=96.6%		Test acc=61.6%
Epoch 8   	Train acc=96.2%		Test acc=60.6%
Epoch 9   	Train acc=98.4%		Test acc=64.8%
Epoch 10  	Train acc=99.1%		Test acc=65.6%
Epoch 11  	Train acc=99.8%		Test acc=66.0%
Epoch 12  	Train acc=99.9%		Test acc=65.0%
Epoch 13  	Train acc=98.7%		Test acc=66.2%
Epoch 14  	Train acc=90.0%		Test acc=61.6%
Epoch 15  	Train acc=99.8%		Test acc=66.8%
Epoch 16  	Train acc=99.8%		Test acc=68.2%
Epoch 17  	Train acc=99.9%		Test acc=68.2%
Epoch 18  	Train acc=99.9%		Test acc=67.8%
Epoch 19  	Train acc=99.9%		Test acc=67.8%
