# Verb Tense Recognition

In [1]:
import os

from keras.layers import Embedding, LSTM, GRU, Conv1D, Dense, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import numpy as np

Using TensorFlow backend.


### Load Idiap Tense-Annotation Corpus
We are going to train our models on a parallel corpus from https://www.idiap.ch/dataset/tense-annotation

In [2]:
def get_verb_info(verb_lines):
    verbs_info = dict()
    for verb_ln in verb_lines:
        verb_data = verb_ln.rstrip('\t').split('\t')
        verb_positions = verb_data[0].split(" ")
        verb_tense = verb_data[2]
        if verb_tense in {'cond other', 'pres other'}:  # we don't want these rare & strange tenses
            verb_tense = 'other'
        for verb_position in verb_positions:
            verbs_info[int(verb_position)] = verb_tense
    return verbs_info

def get_tagged_tokens(sentence, verbs_information):
    tagged_tokens = []
    for token_index, token in enumerate(sentence.split(" "), 1):
        tag = verbs_information[token_index] if token_index in verbs_information else 'O'
        tagged_tokens.append((token, tag))
    return tagged_tokens

def tagged_sentence_generator(ds_path, start_sent_nb=0, end_sent_nb_excl=None):
    counter = 0
    with open(ds_path, 'r') as f:
        sent_data = []
        for line in f.readlines():
            if line == '\n' and sent_data:
                english_sent, french_sent = sent_data[0], sent_data[1]
                verbs_info_dict = get_verb_info(sent_data[2:])
                tagged_sentence = get_tagged_tokens(english_sent, verbs_info_dict)
                if start_sent_nb <= counter:
                    if end_sent_nb_excl is None or counter < end_sent_nb_excl:
                        yield tagged_sentence
                    else:
                        break
                counter += 1
                sent_data = []
            else:
                sent_data.append(line.rstrip('\n'))

def get_all_ds_tags_and_len(ds_path):
    ds_tags = set()
    counter = 0
    with open(ds_path, 'r') as f:
        sent_data = []
        for line in f.readlines():
            if line == '\n' and sent_data:
                verbs_info_dict = get_verb_info(sent_data[2:])
                ds_tags.update(verbs_info_dict.values())
                counter += 1
                sent_data = []
            else:
                sent_data.append(line.rstrip('\n'))
    return ds_tags, counter

In [3]:
PATH = "./CorpusAnnotatedTenseVoice.txt"
SENT_NB = 30            
sent_generator = tagged_sentence_generator(PATH, SENT_NB)
print("Sentence {} example: ".format(SENT_NB), next(sent_generator))
sent_generator = tagged_sentence_generator(PATH)
max_sent_len = 0
for sent in sent_generator:
    max_sent_len = max(max_sent_len, len(sent))
print("Max. sentence length: {} tokens".format(max_sent_len))
ds_tags_set, ds_size = get_all_ds_tags_and_len(PATH)
print("Number of different tenses: {}, Nb. of sentences in the corpus: {}".format(len(ds_tags_set), ds_size))
ds_tags_set.add('O')  # a default tag for all non-verbs

Sentence 30 example:  [('Yes', 'O'), (',', 'O'), ('Mrs', 'O'), ('Schroedter', 'O'), (',', 'O'), ('I', 'O'), ('shall', 'fut'), ('be', 'fut'), ('pleased', 'fut'), ('to', 'O'), ('look', 'infinitif'), ('into', 'O'), ('the', 'O'), ('facts', 'O'), ('of', 'O'), ('this', 'O'), ('case', 'O'), ('when', 'O'), ('I', 'O'), ('have', 'pres_perf'), ('received', 'pres_perf'), ('your', 'O'), ('letter', 'O'), ('.', 'O')]
Max. sentence length: 858 tokens
Number of different tenses: 19, Nb. of sentences in the corpus: 411319


## Word Level Models

We are going to use pre-trained "GloVe" word embeddings that can be downloaded from https://nlp.stanford.edu/data/glove.6B.zip

Note: Words will be kept with their original capitalization. We might want to try to make all words lower-case, too. "word.lower()"

In [4]:
# create a vocabulary of all words in our dataset
words = set([])
[[words.add(token) for token, _ in sent] for sent in tagged_sentence_generator(PATH)]
print("vocabulary size: {}".format(len(words)))
print("does" in words, "Does" in words)

vocabulary size: 61760
True True


In [5]:
# create a dictionary, an index for each word
dictionary = dict()
for i, word in enumerate(words):
    dictionary[word] = i
print("index of `hello`: {}".format(dictionary["hello"]))

# a mapping for indexes back into words
idx2word = {}
for word, i in dictionary.items():
    idx2word[i] = word

index of `hello`: 32667


In [6]:
# convert tags to numerical labels, create a dictionary, an index for each token
tag2lab = dict()
for i, tag in enumerate(ds_tags_set):
    tag2lab[tag] = i
# a mapping for labels back into tags
lab2tag = {}
for tag, i in tag2lab.items():
    lab2tag[i] = tag
lab2tag

{0: 'fut_perf',
 1: 'other',
 2: 'past_perf_cont',
 3: 'past_perf',
 4: 'cond_cont',
 5: 'sim_past other',
 6: 'sim_past',
 7: 'cond_perf_cont',
 8: 'pres_perf',
 9: 'past_cont',
 10: 'pres',
 11: 'pres_cont',
 12: 'fut',
 13: 'pres_perf_cont',
 14: 'cond',
 15: 'infinitif',
 16: 'cond_perf',
 17: 'O',
 18: 'fut_perf_cont',
 19: 'fut_cont'}

In [7]:
# let's create a dictionary of embeddings from each word embedding vector in the pre-trained GloVe embeddings file
GLOVE_DIR = "."
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
print("embedding for the word `word`:")
print(embeddings_index.get("word"))

Found 400000 word vectors.
embedding for the word `word`:
[-0.1643     0.15722   -0.55021   -0.3303     0.66463   -0.1152
 -0.2261    -0.23674   -0.86119    0.24319    0.074499   0.61081
  0.73683   -0.35224    0.61346    0.0050975 -0.62538   -0.0050458
  0.18392   -0.12214   -0.65973   -0.30673    0.35038    0.75805
  1.0183    -1.7424    -1.4277     0.38032    0.37713   -0.74941
  2.9401    -0.8097    -0.66901    0.23123   -0.073194  -0.13624
  0.24424   -1.0129    -0.24919   -0.06893    0.70231   -0.022177
 -0.64684    0.59599    0.027092   0.11203    0.61214    0.74339
  0.23572   -0.1369   ]


In [8]:
# let's try to extract the GloVe embeddings for each word from our dataset vocabulary
EMBEDDING_DIM = 50
embedding_matrix = np.zeros((len(dictionary) + 1, EMBEDDING_DIM))
for word, i in dictionary.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
del embeddings_index

Checking how many words have no pre-trained GloVe word embeddings:

In [9]:
oov_percentage = 100. * np.count_nonzero(np.all(embedding_matrix == 0, axis=1)) / len(dictionary)  # OOV portion
print("percentage of words out of vocabulary: %s percent" % oov_percentage)
outta_vocab_idxs = set(np.where(np.all(embedding_matrix == 0, axis=1))[0])
outta_vocab_words = [word for word, i in dictionary.items() if i in outta_vocab_idxs]
print("examples of words without pre-trained GloVe embeddings:")
print(outta_vocab_words[:15])

percentage of words out of vocabulary: 43.78400259067357 percent
examples of words without pre-trained GloVe embeddings:
['', 'quick-acting', 'B5-0160', '5178', 'specifically-channelled', 'service-level', 'McDuck', 'Moher', 'Ã“', 'V', 'Head', 'A5-0400', 'fish-catching', 'B5-0846', 'Al-Sabah']


In [10]:
def prepare_batch(x_batch, y_batch, max_sentence_len, nb_labels, tok_dict, tag2label):
    # pad sequences with zeros to make them same length: we need it for vectorized computations
    x_batch = pad_sequences(x_batch, maxlen=max_sentence_len, padding='post', value=tok_dict[''])            
    # convert labels to categorical one-hot vectors
    y_batch = pad_sequences(y_batch, maxlen=max_sentence_len, padding='post', value=tag2label['O'])
    y_batch = to_categorical(y_batch, nb_labels)
    return np.array(x_batch), np.array(y_batch, dtype="float64")

def batch_generator(
        start_idx, end_idx_excl,
        tok_dict, tag2label, nb_labels, ds_path,
        max_sentence_len, batch_size=32
):
    """infinitely yield batches of sequences and labels"""
    x_batch, y_batch = [], []
    while True:
        sentence_generator = tagged_sentence_generator(ds_path, start_idx, end_idx_excl)
        for sentence in sentence_generator:
            toks, tags = zip(*sentence[:max_sent_len])  # !!! we cut off too long sentences !!!
            # convert sentence into sequences of word indexes
            sequence = [tok_dict[tok] for tok in toks]  # TODO: we might want to try lower-cased tokens
            labels = [tag2label[tag] for tag in tags]
            if len(x_batch) == batch_size:
                yield prepare_batch(x_batch, y_batch, max_sentence_len, nb_labels, tok_dict, tag2label)
                x_batch, y_batch = [], []
            x_batch.append(sequence)
            y_batch.append(labels)
        if len(x_batch) != 0:
            yield prepare_batch(x_batch, y_batch, max_sentence_len, nb_labels, tok_dict, tag2label)

In [11]:
MAX_SEQUENCE_LENGTH = 200  # cutting off too long sentences
BATCH_SIZE = 32

# our dataset will be split into a traing part and a validation part,
# where we measure our model's performance during training

# we will further keep a testing part to evaluate predictions 

# TEST_SPLIT = .1
# nb_test_samples = int(TEST_SPLIT * ds_size)
# print("number of validation and test samples: %s" % nb_test_samples)

# train_generator = batch_generator(
#     0, ds_size-2*nb_test_samples, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
# )
# val_generator = batch_generator(
#     ds_size-2*nb_test_samples, ds_size-nb_test_samples, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
# )
# val_generator = batch_generator(
#     ds_size-nb_test_samples, ds_size, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
# )

### RNN Model
Embeddings layer will be using the weights from the pre-trained GloVe vectors. We don't want to change them so we set `trainable=False`,

we add a Bidirectional layer of LSTM cells or GRU cells after the Embedding layer,
if we set `return_sequences` True, we will get the output of the cells in each timestep of the sequence, that's what we want :)

you can change the complexity of the model by setting `HIDDEN_SIZE_LSTM` which changes the number of `units`.


In [12]:
train_generator = batch_generator(
    0, 50000, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
)
val_generator = batch_generator(
    50000, 55000, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
)
test_generator = batch_generator(
    55000, 60000, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
)

NB_EPOCHS = 1
HIDDEN_SIZE_LSTM = 50
model = Sequential()
model.add(Embedding(input_dim=len(dictionary) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Bidirectional(LSTM(HIDDEN_SIZE_LSTM, return_sequences=True), input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)))
model.add(Dense(len(ds_tags_set), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(train_generator, validation_data=val_generator,
                    epochs=NB_EPOCHS, steps_per_epoch=50000, validation_steps=5000)
model.save('rnn_model.h5') 
score, acc = model.evaluate_generator(test_generator, steps=5000, callbacks=None)
print('Test accuracy:', acc)

Epoch 1/1
Test accuracy: 0.9966673851013184


In [13]:
def print_predictions(x_test, y_pred, idx2word, lab2tag):
    """
    print the results of our model's predictions after converting them back to tokens and tags
    """
    pad_symbol = ''
    for seq, preds in zip(x_test, y_pred):
        sentence = []
        pad_removed = False
        for i in range(len(seq)-1, -1, -1):
            word_id, pred = seq[i], preds[i]
            word, tag = idx2word[word_id], lab2tag[np.argmax(pred)]
            if word == pad_symbol and not pad_removed:
                continue
            else:
                pad_removed = True
                sentence.append((word, tag))
        print(list(reversed(sentence)))
        print('\n')

In [14]:
# let's see what our model predicts
x_test, _ = next(test_generator)
y_pred = model.predict(x_test)
print_predictions(x_test[:4], y_pred[:4], idx2word, lab2tag)

[('-', 'O'), ('(', 'O'), ('SV', 'O'), (')', 'O'), ('Finland', 'O'), ('has', 'pres_perf'), ('been', 'pres_perf'), ('hit', 'pres_perf'), ('by', 'O'), ('serious', 'O'), ('problems', 'O'), ('in', 'O'), ('the', 'O'), ('form', 'O'), ('of', 'O'), ('a', 'O'), ('drastic', 'O'), ('increase', 'O'), ('in', 'O'), ('the', 'O'), ('amount', 'O'), ('of', 'O'), ('beer', 'O'), ('imported', 'other'), ('from', 'O'), ('third', 'O'), ('countries', 'O'), (',', 'O'), ('mainly', 'O'), ('from', 'O'), ('Estonia', 'O'), ('and', 'O'), ('Russia', 'O'), ('.', 'O')]


[('It', 'O'), ('is', 'pres'), ('therefore', 'O'), ('a', 'O'), ('very', 'O'), ('positive', 'O'), ('development', 'O'), ('that', 'O'), ('the', 'O'), ('Commission', 'O'), ('is', 'pres_cont'), ('now', 'O'), ('granting', 'pres_cont'), ('Finland', 'O'), ('a', 'O'), ('six-year', 'O'), ('derogation', 'O'), ('to', 'O'), ('enable', 'infinitif'), ('it', 'O'), ('to', 'O'), ('introduce', 'infinitif'), ('a', 'O'), ('new', 'O'), ('limit', 'O'), ('on', 'O'), ('beer', 'O

### CNN Model

Use Conv1d instead of RNN layers:

`Conv1D(filters=EMBEDDING_DIM, kernel_size=..., activation='relu', padding=...)`

We need to preserve the sequence legth, when going from one layer to another, so we set padding='same'

kernel_size (window size) is a parameter setting the scope of view for our convolutional filter, how many words we look at.

We want a filter for each index of our word embedding vector

Try setting padding='causal'. This will make our window (kernel) wider, but we will look only at every other word in a sequence inside the window. This is also called a dilated convolution.




In [15]:
train_generator = batch_generator(
    0, 50000, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
)
val_generator = batch_generator(
    50000, 55000, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
)
test_generator = batch_generator(
    55000, 60000, dictionary, tag2lab, len(ds_tags_set), PATH, MAX_SEQUENCE_LENGTH, BATCH_SIZE
)

NB_EPOCHS = 1
WINDOW_SIZES = [5, 5]
model = Sequential()
model.add(Embedding(input_dim=len(dictionary) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Conv1D(filters=EMBEDDING_DIM, kernel_size=WINDOW_SIZES[0], activation='relu', padding="causal"))
model.add(Conv1D(filters=EMBEDDING_DIM, kernel_size=WINDOW_SIZES[1], activation='relu', padding="same"))
model.add(Dense(len(ds_tags_set), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(train_generator, validation_data=val_generator,
                    epochs=NB_EPOCHS, steps_per_epoch=50000, validation_steps=5000)
model.save('cnn_model.h5') 
score, acc = model.evaluate_generator(test_generator, steps=5000, callbacks=None)
print('Test accuracy:', acc)

Epoch 1/1
Test accuracy: 0.9954240322113037


In [16]:
# let's see what our model predicts
x_test, _ = next(test_generator)
y_pred = model.predict(x_test)
print_predictions(x_test[:4], y_pred[:4], idx2word, lab2tag)

[('-', 'O'), ('(', 'O'), ('SV', 'O'), (')', 'O'), ('Finland', 'O'), ('has', 'pres_perf'), ('been', 'pres_perf'), ('hit', 'pres_perf'), ('by', 'O'), ('serious', 'O'), ('problems', 'O'), ('in', 'O'), ('the', 'O'), ('form', 'O'), ('of', 'O'), ('a', 'O'), ('drastic', 'O'), ('increase', 'O'), ('in', 'O'), ('the', 'O'), ('amount', 'O'), ('of', 'O'), ('beer', 'O'), ('imported', 'O'), ('from', 'O'), ('third', 'O'), ('countries', 'O'), (',', 'O'), ('mainly', 'O'), ('from', 'O'), ('Estonia', 'O'), ('and', 'O'), ('Russia', 'O'), ('.', 'O')]


[('It', 'O'), ('is', 'pres'), ('therefore', 'O'), ('a', 'O'), ('very', 'O'), ('positive', 'O'), ('development', 'O'), ('that', 'O'), ('the', 'O'), ('Commission', 'O'), ('is', 'pres_cont'), ('now', 'O'), ('granting', 'pres_cont'), ('Finland', 'O'), ('a', 'O'), ('six-year', 'O'), ('derogation', 'O'), ('to', 'O'), ('enable', 'other'), ('it', 'O'), ('to', 'O'), ('introduce', 'infinitif'), ('a', 'O'), ('new', 'O'), ('limit', 'O'), ('on', 'O'), ('beer', 'O'), ('im