### AllenNLP Tutorial
---

In [1]:
from typing import Iterator, List, Dict

import torch
import torch.optim as optim
import numpy as np

from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path

from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits

from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

# Seed PyTorch's random number generator so that runs of this notebook are
# repeatable (the call returns the generator object, which the cell displays).
torch.manual_seed(1)

<torch._C.Generator at 0x10b3a9630>

In [67]:
class PosDatasetReader(DatasetReader):
    """Dataset reader for POS-tagging files with ``word###TAG`` pairs.

    Every non-blank line of the input file is one sentence made of
    whitespace-separated ``word###TAG`` pairs, for example::

        The###DET dog###NN ate###V the###DET apple###NN
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        """Create the reader.

        Parameters
        ----------
        token_indexers:
            Indexers passed to every ``TextField`` this reader builds. When
            omitted (or empty), a single ``SingleIdTokenIndexer`` registered
            under the key ``"tokens"`` is used.
        """
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
        """Build an ``Instance`` from a tokenized sentence.

        Parameters
        ----------
        tokens:
            The tokens of the sentence; stored under the field name ``"sentence"``.
        tags:
            Optional tag sequence aligned with ``tokens``. When given (and
            non-empty) it is stored under the field name ``"labels"``; when
            omitted the instance carries only the sentence, which is what
            prediction on raw text needs.

        Returns
        -------
        An ``Instance`` with a ``"sentence"`` field and, if tags were supplied,
        a ``"labels"`` field.
        """
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"sentence": sentence_field}

        if tags:
            label_field = SequenceLabelField(labels=tags, sequence_field=sentence_field)
            fields["labels"] = label_field

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one ``Instance`` per non-blank line of ``file_path``.

        Blank (or whitespace-only) lines are skipped instead of crashing the
        ``zip`` unpacking below, so files with trailing newlines or blank
        separator lines can be read.
        """
        with open(file_path) as f:
            for line in f:
                pairs = line.split()
                if not pairs:
                    # Nothing to unpack on an empty line; move on.
                    continue
                # Split on the LAST "###" only, so a token that itself contains
                # the separator keeps its text and still gets exactly one tag.
                sentence, tags = zip(*(pair.rsplit("###", 1) for pair in pairs))
                yield self.text_to_instance([Token(word) for word in sentence], list(tags))

In [81]:
class LstmTagger(Model):
    """Sequence tagger: embed tokens, encode them, and project to tag logits."""

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        """Assemble the tagger.

        Parameters
        ----------
        word_embeddings:
            Embedder applied to the ``sentence`` text-field tensors.
        encoder:
            Sequence encoder run over the embeddings; its ``get_output_dim()``
            sets the input size of the projection layer.
        vocab:
            Vocabulary; the size of its ``'labels'`` namespace sets the number
            of output classes.
        """
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        # One logit per entry of the 'labels' namespace for every token.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Compute tag logits and, when labels are given, the loss.

        Parameters
        ----------
        sentence:
            Indexed text-field tensors for the batch
            (assumed to be the output of the ``"sentence"`` ``TextField`` - TODO confirm).
        labels:
            Optional gold tag indices aligned with the tokens.

        Returns
        -------
        A dict with ``"tag_logits"`` and, if ``labels`` was supplied, ``"loss"``.
        """
        # The mask is passed to the encoder, the metric and the loss alongside
        # the token-level tensors.
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}

        if labels is not None:
            # Update the running accuracy as a side effect, then attach the loss.
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the running accuracy.

        ``reset`` defaults to ``False`` so the method can also be called with
        no arguments; existing callers that pass ``reset`` are unaffected.
        """
        return {"accuracy": self.accuracy.get_metric(reset)}
    

In [69]:
# No indexers passed: the reader falls back to a single SingleIdTokenIndexer
# registered under the key "tokens".
reader = PosDatasetReader()

In [70]:
# Locations of the tutorial's two data splits.
train_url = (
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/training.txt'
)
validation_url = (
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/validation.txt'
)

# cached_path turns each URL into something reader.read can consume
# (presumably a locally cached copy of the file); each split is read into instances.
train_dataset = reader.read(cached_path(train_url))
validation_dataset = reader.read(cached_path(validation_url))


0it [00:00, ?it/s][A
2it [00:00, 831.30it/s][A
0it [00:00, ?it/s][A
2it [00:00, 724.97it/s][A

In [71]:
# Build the vocabulary from both splits combined, so tokens and tags that only
# occur in the validation data are included as well.
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

02/02/2020 09:01:41 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.

  0%|          | 0/4 [00:00<?, ?it/s][A
100%|██████████| 4/4 [00:00<00:00, 2890.13it/s][A

In [72]:
# Model sizes used by the cells below: the token embedding width (also the
# LSTM input size) and the LSTM hidden size.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [73]:
# One embedding row per entry in the vocabulary's "tokens" namespace.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                            embedding_dim=EMBEDDING_DIM)
# The "tokens" key here mirrors the key the reader uses for its token indexers.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [82]:
# Wrap a batch-first PyTorch LSTM so it can be used as the tagger's encoder
# (LstmTagger calls encoder.get_output_dim() and encoder(embeddings, mask)).
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmTagger(word_embeddings, lstm, vocab)

In [83]:
# Device index handed to the Trainer: 0 when CUDA is available, -1 otherwise.
cuda_device = 0 if torch.cuda.is_available() else -1

# Only move the model when a GPU was actually selected.
if cuda_device >= 0:
    model = model.cuda(cuda_device)

In [84]:
# Adam over all model parameters with a learning rate of 0.1.
optimizer = optim.Adam(model.parameters(), lr=0.1)
# Batches of two instances, ordered by the number of tokens in the "sentence"
# field (presumably to group similar lengths and limit padding - confirm
# against the BucketIterator docs).
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
# Give the iterator the vocabulary it uses to index instances.
iterator.index_with(vocab)

In [86]:
# Wire model, optimizer, data and device together for training.
# NOTE(review): patience equals num_epochs, so early stopping presumably can
# never trigger within this run - confirm whether that is intended.
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=10,
                  cuda_device=cuda_device)
# As the last expression of the cell, the value returned by train() (a dict
# of metrics) is displayed as the cell output.
trainer.train()

02/02/2020 09:02:33 - INFO - allennlp.training.trainer -   Beginning training.
02/02/2020 09:02:33 - INFO - allennlp.training.trainer -   Epoch 0/9
02/02/2020 09:02:33 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 203.85792
02/02/2020 09:02:33 - INFO - allennlp.training.trainer -   Training

  0%|          | 0/1 [00:00<?, ?it/s][A
accuracy: 1.0000, loss: 0.0875 ||: 100%|██████████| 1/1 [00:00<00:00, 74.80it/s][A02/02/2020 09:02:33 - INFO - allennlp.training.trainer -   Validating

  0%|          | 0/1 [00:00<?, ?it/s][A
accuracy: 1.0000, loss: 0.0573 ||: 100%|██████████| 1/1 [00:00<00:00, 162.91it/s][A02/02/2020 09:02:33 - INFO - allennlp.training.trainer -                     Training |  Validation
02/02/2020 09:02:33 - INFO - allennlp.training.trainer -   cpu_memory_MB |   203.858  |       N/A
02/02/2020 09:02:33 - INFO - allennlp.training.trainer -   accuracy      |     1.000  |     1.000
02/02/2020 09:02:33 - INFO - allennlp.training.trainer -   loss         

{'peak_cpu_memory_MB': 203.85792,
 'training_duration': '00:00:00',
 'training_start_epoch': 0,
 'training_epochs': 9,
 'epoch': 9,
 'training_accuracy': 1.0,
 'training_loss': 0.004572445061057806,
 'training_cpu_memory_MB': 203.85792,
 'validation_accuracy': 1.0,
 'validation_loss': 0.003880667733028531,
 'best_epoch': 9,
 'best_validation_accuracy': 1.0,
 'best_validation_loss': 0.003880667733028531}

In [87]:
# Pair the trained model with the reader so raw sentences can be turned into
# instances for prediction (presumably via reader.text_to_instance - confirm).
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

In [88]:
# 'tag_logits' is the key LstmTagger.forward puts in its output dict.
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
# Highest-scoring entry along the last axis, i.e. one tag index per token.
tag_ids = np.argmax(tag_logits, axis=-1)

In [90]:
# Show the predicted tag indices.
tag_ids

array([1, 0, 2, 1, 0])

In [91]:
# Map each predicted index back to its tag string via the 'labels' namespace.
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

['DET', 'NN', 'V', 'DET', 'NN']
