In [None]:
%load_ext autoreload 
%autoreload 2

import numpy as np
import tensorflow as tf

from lstm_examples import util

# Seed NumPy's global RNG so that the np.random calls made further down
# (the shuffle permutation) are repeatable when the notebook is run top to bottom.
np.random.seed(1701)

# Loading data

In [None]:
# Load data from CSV file
# n_classes is the number of label categories; it sets the width of the
# output layer built further down.
n_classes = 4
# NOTE(review): util.read_data is project-local. From how the results are used
# below, `text` looks like a sequence of raw strings and `y` an array of
# integer labels that supports index-array indexing -- confirm against
# lstm_examples.util.
text, y = util.read_data('data/train.csv')

In [None]:
# Shuffle sentences and labels with one shared permutation.
# The shuffled copies get their own names so that `text` and `y` stay exactly
# as loaded: re-running this cell no longer shuffles already-shuffled data.
assert len(text) == len(y), "texts and labels must have the same length"
shuffle_idx = np.random.permutation(len(text))
text_shuffled = [text[i] for i in shuffle_idx]
y_shuffled = y[shuffle_idx]

# Train and test split (just take a thousand for speed)
n_train, n_test = 1000, 500
# With n_train rows or fewer the test split would be empty and every test
# accuracy computed later would be NaN, so fail here with a clear message.
assert len(text_shuffled) > n_train, "need more than n_train rows to leave a test split"
text_train = text_shuffled[:n_train]
text_test = text_shuffled[n_train : n_train+n_test]
y_train = y_shuffled[:n_train]
y_test = y_shuffled[n_train : n_train+n_test]

In [None]:
# Peek at the first training example and its label
first_text, first_label = text_train[0], y_train[0]
print(first_text)
print("\nClass: ", first_label)

# Text processing with spaCy

In [None]:
import spacy

# Load spaCy's English pipeline; `nlp` is the callable used to parse the text below.
# NOTE(review): the 'en' shortcut name only resolves on older spaCy releases;
# newer ones expect a full package name such as 'en_core_web_sm'. Confirm
# which spaCy version this notebook targets before changing it.
nlp = spacy.load('en')

In [None]:
# Parse all text.
# nlp.pipe streams the texts through the pipeline in batches, which is faster
# than calling nlp() once per string, and yields the parsed documents in the
# same order as the input.
text_train_parsed = list(nlp.pipe(text_train))
text_test_parsed = list(nlp.pipe(text_test))

In [None]:
# Convert text to integer symbols
# A single table is shared by the training and test text so both use the same
# token -> integer mapping.
# NOTE(review): SymbolTable is project-local; the code below relies on it
# providing lookup_add(), lookup() and num_words() -- confirm their exact
# semantics (in particular what lookup() returns for an unseen token).
symbol_table = util.SymbolTable()

def preprocess_text(parsed_text, symbol_table, init=True):
    """Turn parsed documents into lists of integer symbols.

    Every token of every sentence is stripped of surrounding whitespace,
    lower-cased and handed to the symbol table.

    Args:
        parsed_text: iterable of documents; each exposes ``.sents``, an
            iterable of sentences, and each sentence iterates over tokens
            that carry a ``.text`` string.
        symbol_table: object providing ``lookup_add(word)`` and
            ``lookup(word)``.
        init: when true the tokens go through ``lookup_add``, otherwise
            through ``lookup``.

    Returns:
        A list with one list of symbols per document, tokens in document order.
    """
    to_symbol = symbol_table.lookup_add if init else symbol_table.lookup
    encoded = []
    for doc in parsed_text:
        doc_symbols = []
        for sentence in doc.sents:
            for token in sentence:
                doc_symbols.append(to_symbol(token.text.strip().lower()))
        encoded.append(doc_symbols)
    return encoded

In [None]:
# init=True routes the training tokens through lookup_add;
# init=False routes the test tokens through lookup only.
symbols_train = preprocess_text(text_train_parsed, symbol_table, init=True)
symbols_test = preprocess_text(text_test_parsed, symbol_table, init=False)

In [None]:
# Show one training example: its first 15 parsed tokens, the first 15
# symbols produced for it, and its label.
example_idx = 100
print(text_train_parsed[example_idx][:15])
print(symbols_train[example_idx][:15])
print("\nClass: ", y_train[example_idx])

# Converting to matrix format

In [None]:
# Convert symbol lists to a matrix X
# Text has a fixed maximum length (50 tokens)

MAX_SENTENCE_LENGTH = 50

def symbols_to_matrix(symbol_lists, max_sentence_length=MAX_SENTENCE_LENGTH):
    """Pack variable-length symbol lists into a zero-padded integer matrix.

    Args:
        symbol_lists: sequence of sequences of integer symbols.
        max_sentence_length: number of columns of the result; longer lists
            are truncated to this many symbols.

    Returns:
        A pair ``(matrix, lengths)``. ``matrix`` has shape
        ``(len(symbol_lists), max_sentence_length)`` with unused cells left
        at 0; ``lengths[j]`` is the number of symbols stored in row ``j``.
    """
    n_rows = len(symbol_lists)
    padded = np.zeros((n_rows, max_sentence_length), dtype=int)
    kept = np.zeros(n_rows, dtype=int)
    for row, symbols in enumerate(symbol_lists):
        n_kept = min(len(symbols), max_sentence_length)
        padded[row, :n_kept] = symbols[:n_kept]
        kept[row] = n_kept
    return padded, kept

In [None]:
# Sanity check: display the padded matrix and the length vector for two training examples
symbols_to_matrix(symbols_train[60:62])

# Building the graph

In [None]:
# Inputs
# X receives the padded symbol matrix, one row per example; the batch
# dimension is left open (None) so any number of rows can be fed.
X = tf.placeholder(tf.int32, (None, MAX_SENTENCE_LENGTH))
# X_len receives the number of real (unpadded) symbols in each row of X.
X_len = tf.placeholder(tf.int32, (None,))
# y receives one integer class label per example.
# NOTE: this rebinds the name `y`, which until now referred to the label array
# returned by util.read_data. y_train / y_test were sliced out earlier and are
# not affected, but the raw label array is no longer reachable through `y`.
y = tf.placeholder(tf.int32, (None,))

In [None]:
# Create embedding table (one row for each word)
# d is the embedding dimension; the LSTM cell and the linear layer below
# reuse it as their feature size.
d = 100

# Create zero row for 0 symbol
with tf.variable_scope('embeddings'):
    # Constant all-zero vector for symbol 0 (the padding value written by
    # symbols_to_matrix). Being a constant, it is not updated by training.
    zero  = tf.constant(0.0, dtype=tf.float32, shape=(1, d))
    # Trainable rows for the remaining symbols.
    # NOTE(review): the row count is num_words() + 1; whether that extra row
    # is required depends on how SymbolTable numbers its ids -- confirm.
    embed = tf.Variable(tf.random_normal(
        (symbol_table.num_words() + 1, d), stddev=0.1, seed=1701
    ))
    # Row 0 of U is the zero vector; the trainable rows follow it.
    U = tf.concat([zero, embed], axis=0, name='embedding_matrix')

In [None]:
# Get out rows from U for each word in each sentence
# Dimensions are [batch_size x MAX_SENTENCE_LENGTH x embedding_dimension]
#                [   ???     x        50           x       100          ]

word_feats = tf.nn.embedding_lookup(U, X)

In [None]:
# Build RNN

# The batch size is read from the fed input at run time, so the graph accepts
# any number of rows.
# NOTE: the training cell later rebinds the Python name `batch_size` to an
# int; the graph built here keeps using this tensor.
batch_size = tf.shape(X)[0]

with tf.variable_scope("LSTM") as scope:
    # Set TensorFlow's graph-level random seed before the LSTM weights are created
    tf.set_random_seed(1701)
    # LSTM cell architecture
    # The cell's hidden size equals the embedding size d.
    cell = tf.contrib.rnn.BasicLSTMCell(d)
    # Set RNN
    # Every sequence starts from an all-zero state.
    initial_state = cell.zero_state(batch_size, tf.float32)
    # sequence_length tells the RNN how many steps of each row are real;
    # the final state is discarded, only the per-step outputs are kept.
    rnn_out, _ = tf.nn.dynamic_rnn(
        cell,
        word_feats,
        sequence_length=X_len,
        initial_state=initial_state,
        time_major=False               # inputs are laid out [batch, time, features]
    )

In [None]:
# Linear layer

# The LSTM produced new features for us in rnn_out
# There is one output per word, and we used the same out dimension as the embedding dimension
# Dimensions are [batch_size x MAX_SENTENCE_LENGTH x embedding_dimension]
#                [   ???     x        50           x       100          ]
# Let's pull out just the last one to describe the whole sentence

def get_rnn_output(output, dim, lengths):
    """Get last output of RNN

    For every sequence in the batch, picks the output at its final valid
    time step (position ``lengths - 1``).

    Args:
        output: RNN outputs indexed as [batch, time, dim].
        dim: size of the last axis of ``output``.
        lengths: integer tensor holding the number of valid steps of each
            sequence in the batch.

    Returns:
        Tensor of shape [batch, dim] with one feature vector per sequence.
        A zero-length sequence yields its step-0 output.
    """
    batch_size = tf.shape(output)[0]
    max_length = tf.shape(output)[1]
    # A zero-length sequence would give position -1, which after flattening
    # points at the previous sequence's last step (or at an invalid row for
    # the first sequence in the batch). Clamp to step 0 instead.
    # NOTE(review): with tf.nn.dynamic_rnn and sequence_length set, that step
    # should be zero-filled for an empty sequence -- confirm.
    last_step = tf.maximum(lengths - 1, 0)
    # Row b * max_length + t of the flattened tensor is output[b, t].
    index = tf.range(0, batch_size) * max_length + last_step
    flat = tf.reshape(output, [-1, dim])
    return tf.gather(flat, index)

# Sentence-level features: the LSTM output selected by get_rnn_output for each
# row, i.e. the one at position X_len - 1.
final_feats = get_rnn_output(rnn_out, d, X_len)

with tf.variable_scope("linear"):
    # Affine map from the d LSTM features to one score per class.
    W = tf.Variable(tf.random_normal((d, n_classes), stddev=0.1, seed=1701))
    b = tf.Variable(tf.zeros((n_classes,)))
    # h holds the unnormalised class scores (logits): final_feats @ W + b
    h = tf.nn.xw_plus_b(final_feats, W, b)

In [None]:
# Output quantities

# Probabilistic predictions
predictions = tf.nn.softmax(h)
# Hard predictions: index of the largest logit for each example.
# No tf.squeeze here: argmax over axis 1 already yields shape [batch], and
# squeezing would collapse a batch of one to a scalar.
predictions_hard = tf.argmax(h, axis=1)

# Loss and optimizer: mean cross-entropy over the batch, minimised with Adam
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h, labels=y)
)
train_op = tf.train.AdamOptimizer(0.0005).minimize(loss)

# Train!

In [None]:
n_epochs = 20
# NOTE: from here on the Python name `batch_size` is the mini-batch size; the
# graph keeps using the tensor that was bound to this name when it was built.
batch_size = 32

session = tf.Session()
session.run(tf.global_variables_initializer())

# Pad every example once up front; the mini-batches below are plain row slices
# of these arrays, so nothing is re-padded inside the training loop.
X_train_all, X_train_len_all = symbols_to_matrix(symbols_train)
X_test_all, X_test_len_all = symbols_to_matrix(symbols_test)

# One extra iteration so that the weights produced by the last training epoch
# are evaluated too: the line printed for epoch t reports accuracy after t
# completed epochs.
for t in range(n_epochs + 1):
    # Print status
    pred_train = session.run(predictions_hard, {X: X_train_all, X_len: X_train_len_all})
    train_acc = np.mean(pred_train == y_train)
    pred_test = session.run(predictions_hard, {X: X_test_all, X_len: X_test_len_all})
    test_acc = np.mean(pred_test == y_test)
    print("Epoch {0:<4}\tTrain acc={1:.1f}%\t\tTest acc={2:.1f}%".format(t, 100*train_acc, 100*test_acc))
    if t == n_epochs:
        # Final pass only reports accuracy; no further training.
        break
    # Run batches
    for i in range(0, len(X_train_all), batch_size):
        X_batch = X_train_all[i : i+batch_size]
        X_len_batch = X_train_len_all[i : i+batch_size]
        y_batch = y_train[i : i+batch_size]
        # Running train_op applies one Adam update; the batch loss is fetched alongside it.
        loss_batch, _ = session.run([loss, train_op], {X: X_batch, X_len: X_len_batch, y: y_batch})