In [53]:
!pip install -qU gluonnlp awscli botocore boto3 nltk sacremoses --upgrade

[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [57]:
import io
import random
import numpy as np
import mxnet as mx
from mxnet import gluon
import gluonnlp as nlp

In [50]:
elmo_intro = """
Extensive experiments demonstrate that ELMo representations work extremely well in practice.
We first show that they can be easily added to existing models for six diverse and challenging language understanding problems, including textual entailment, question answering and sentiment analysis.
The addition of ELMo representations alone significantly improves the state of the art in every case, including up to 20% relative error reductions.
For tasks where direct comparisons are possible, ELMo outperforms CoVe (McCann et al., 2017), which computes contextualized representations using a neural machine translation encoder.
Finally, an analysis of both ELMo and CoVe reveals that deep representations outperform those derived from just the top layer of an LSTM.
Our trained models and code are publicly available, and we expect that ELMo will provide similar gains for many other NLP problems.
"""

elmo_intro_file = 'elmo_intro.txt'
with io.open(elmo_intro_file, 'w', encoding='utf8') as f:
    f.write(elmo_intro)

dataset = nlp.data.TextLineDataset(elmo_intro_file, 'utf8')
print(len(dataset))
print(dataset[2]) # print an example sentence from the input data

7
We first show that they can be easily added to existing models for six diverse and challenging language understanding problems, including textual entailment, question answering and sentiment analysis.


In [54]:
tokenizer = nlp.data.SacreMosesTokenizer()
dataset = dataset.transform(tokenizer)
dataset = dataset.transform(lambda x: ['<bos>'] + x + ['<eos>'])
print(dataset[2]) # print the same tokenized sentence as above

['<bos>', 'We', 'first', 'show', 'that', 'they', 'can', 'be', 'easily', 'added', 'to', 'existing', 'models', 'for', 'six', 'diverse', 'and', 'challenging', 'language', 'understanding', 'problems', ',', 'including', 'textual', 'entailment', ',', 'question', 'answering', 'and', 'sentiment', 'analysis', '.', '<eos>']


Now, let's transform each word into a series of tokens. 

0-255 values come from UTF-8, and some tokens have a special meaning:
  * bos_id (256) – The index of beginning of the sentence character
  * eos_id (257) – The index of end of the sentence character
  * bow_id (258) – The index of beginning of the word character
  * eow_id (259) – The index of end of the word character
  * pad_id (260) – The index of padding character is 260

In [55]:
vocab = nlp.vocab.ELMoCharVocab()
dataset = dataset.transform(lambda x: (vocab[x], len(x)), lazy=False)

In [63]:
print(dataset[2])

([[258, 256, 259, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260], [258, 87, 101, 259, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260], [258, 102, 105, 114, 115, 116, 259, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260], [258, 115, 104, 111, 119, 259, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 26

In [58]:
batch_size = 2
dataset_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
                                              nlp.data.batchify.Stack())
data_loader = gluon.data.DataLoader(dataset,
                                    batch_size=batch_size,
                                    batchify_fn=dataset_batchify_fn)

  'Padding value is not given and will be set automatically to 0 '


In [59]:
elmo_bilm, _ = nlp.model.get_model('elmo_2x1024_128_2048cnn_1xhighway',
                                   dataset_name='gbw',
                                   pretrained=True,
                                   ctx=mx.cpu())
print(elmo_bilm)

Downloading /home/ec2-user/.mxnet/models/elmo_2x1024_128_2048cnn_1xhighway_gbw-8c9257d9.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/elmo_2x1024_128_2048cnn_1xhighway_gbw-8c9257d9.zip...
ELMoBiLM(
  (_elmo_char_encoder): ELMoCharacterEncoder(
    (_char_embedding): Embedding(262 -> 16, float32)
    (_convolutions): ConvolutionalEncoder(
      (_convs): HybridConcurrent(
        (0): HybridSequential(
          (0): Conv1D(16 -> 32, kernel_size=(1,), stride=(1,))
          (1): HybridLambda(<lambda>)
          (2): Activation(relu)
        )
        (1): HybridSequential(
          (0): Conv1D(16 -> 32, kernel_size=(2,), stride=(1,))
          (1): HybridLambda(<lambda>)
          (2): Activation(relu)
        )
        (2): HybridSequential(
          (0): Conv1D(16 -> 64, kernel_size=(3,), stride=(1,))
          (1): HybridLambda(<lambda>)
          (2): Activation(relu)
        )
        (3): HybridSequential(
          (0): Conv1D(16 -> 128, kerne

In [60]:
def get_features(data, valid_lengths):
    length = data.shape[1]
    hidden_state = elmo_bilm.begin_state(mx.nd.zeros, batch_size=batch_size)
    mask = mx.nd.arange(length).expand_dims(0).broadcast_axes(axis=(0,), size=(batch_size,))
    mask = mask < valid_lengths.expand_dims(1).astype('float32')
    output, hidden_state = elmo_bilm(data, hidden_state, mask)
    return output

batch = next(iter(data_loader))
features = get_features(*batch)
print([x.shape for x in features])

[(2, 14, 256), (2, 14, 256), (2, 14, 256)]
