In [None]:
# Auto-reload imported project modules before each cell runs, so edits to
# lstm_examples.util are picked up without restarting the kernel.
%load_ext autoreload 
%autoreload 2

import numpy as np

from lstm_examples import util

# Seed NumPy's global RNG so the np.random.permutation shuffle is repeatable
# when the notebook is run top to bottom on a fresh kernel.
np.random.seed(1701)

# Loading data

In [None]:
# Load data from CSV file
# n_classes sets the width of the softmax output layer of the model.
# Presumably the labels in y are integer ids in [0, n_classes) — TODO confirm against the data.
n_classes = 4
# NOTE(review): util.read_data is project code that is not shown here. From its later use,
# `text` has a length and is indexable, and `y` can be indexed with an integer index array.
text, y = util.read_data('data/train.csv')

In [None]:
# Shuffle sentences
# Draw one random ordering of the corpus and index through it, instead of overwriting
# `text` and `y` with shuffled copies. The cell therefore no longer mutates its own
# inputs, and only the rows that are actually used get materialised.
shuffle_idx = np.random.permutation(len(text))

# Train and test split (just take a thousand for speed)
n_train, n_test = 1000, 500
# Slicing past the end would silently produce a short (or empty) split, so fail loudly.
assert len(text) >= n_train + n_test, (
    "need at least %d samples, got %d" % (n_train + n_test, len(text))
)
train_idx = shuffle_idx[:n_train]
test_idx = shuffle_idx[n_train : n_train+n_test]

text_train = [text[i] for i in train_idx]
text_test = [text[i] for i in test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

In [None]:
# Sanity check: show the first training text together with its label.
print(text_train[0])
print("\nClass: ", y_train[0])

# Text processing with spaCy

In [None]:
import spacy

# 'en' is a shortcut link. spaCy v3 removed shortcut links, and loading by shortcut
# raises OSError there, so fall back to the packaged English model name in that case.
# Installations where the 'en' link exists keep loading exactly what they did before.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [None]:
# Parse all text
# nlp.pipe processes the texts as a batched stream, which is faster than calling nlp()
# once per string. It yields one Doc per input text, in input order, so the resulting
# lists line up with text_train / text_test exactly as before.
text_train_parsed = list(nlp.pipe(text_train))
text_test_parsed = list(nlp.pipe(text_test))

In [None]:
# Convert text to integer symbols
# One shared table (project helper; implementation not shown here) is passed to
# preprocess_text for both the training and the test texts, and its num_symbols()
# later sizes the embedding layer.
symbol_table = util.SymbolTable()

def preprocess_text(parsed_text, symbol_table, init=True):
    """Encode parsed documents as lists of symbols.

    Args:
        parsed_text: iterable of documents; each exposes ``.sents``, an iterable of
            sentences, and each sentence is an iterable of tokens with a ``.text``
            attribute.
        symbol_table: object providing ``lookup_add`` and ``lookup``; each is called
            with a normalised token string and its return value is stored as-is.
        init: if true, tokens are mapped with ``symbol_table.lookup_add``; otherwise
            with ``symbol_table.lookup``.

    Returns:
        A list with one inner list per document, holding the mapped value of every
        token in sentence order.
    """
    if init:
        to_symbol = symbol_table.lookup_add
    else:
        to_symbol = symbol_table.lookup

    encoded_docs = []
    for doc in parsed_text:
        doc_symbols = []
        for sentence in doc.sents:
            for token in sentence:
                # Normalise before lookup: trim surrounding whitespace, then lower-case.
                normalised = token.text.strip().lower()
                doc_symbols.append(to_symbol(normalised))
        encoded_docs.append(doc_symbols)
    return encoded_docs

In [None]:
# Training texts are mapped through symbol_table.lookup_add (init=True); test texts are
# mapped through symbol_table.lookup (init=False).
# NOTE(review): presumably this means the vocabulary is built from the training data only —
# confirm how SymbolTable.lookup treats words it has not seen before.
symbols_train = preprocess_text(text_train_parsed, symbol_table, True)
symbols_test = preprocess_text(text_test_parsed, symbol_table, False)

In [None]:
# Sanity check: first 15 tokens of one training document, the first 15 symbols it was
# encoded to, and its label.
print(text_train_parsed[100][:15])
print(symbols_train[100][:15])
print("\nClass: ", y_train[100])

# Converting to matrix format

In [None]:
from keras.preprocessing import sequence

# Number of symbols per example after padding/truncation.
MAX_LENGTH = 50

# NOTE(review): pad_sequences is called with its defaults. Per the Keras documentation that
# means shorter sequences are padded with 0 at the front and longer ones lose their leading
# symbols — confirm this is intended, and that SymbolTable never assigns id 0 to a real word.
x_train = sequence.pad_sequences(symbols_train, maxlen=MAX_LENGTH)
x_test = sequence.pad_sequences(symbols_test, maxlen=MAX_LENGTH)

# Building the graph

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import Adam

# Shared width: used both as the embedding output size and as the LSTM size.
d = 100

# Stack: symbol ids -> embedding -> single LSTM -> softmax over the n_classes labels.
# NOTE(review): the embedding input size is symbol_table.num_symbols(); this assumes every
# id in x_train / x_test (including the padding value) is below that number — confirm.
model = Sequential([
    Embedding(symbol_table.num_symbols(), output_dim=d),
    LSTM(d),
    Dense(n_classes, activation='softmax'),
])

# The sparse variant of the loss is used, so y is passed as-is rather than one-hot encoded.
model.compile(
    optimizer=Adam(lr=0.0005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train!

In [None]:
# Keep the History object returned by fit so the per-epoch loss/accuracy can be inspected
# or plotted after training (it was previously discarded and only echoed as a repr).
# NOTE(review): the held-out test split is also used as the validation data here.
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=32, epochs=20)