In [1]:
import os
import torch
import allennlp
from allennlp.data import Vocabulary, Instance
from allennlp.data.fields import TextField, LabelField
from simple_classifier.dataset_reader import YelpReviewJsonLinesReader
from allennlp.data.data_loaders import SimpleDataLoader

from simple_classifier.model import SimpleClassifier
from allennlp.modules import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.models import Model

# Intro
This code is a simplified implementation of what happens during an AllenNLP training loop.
Normally, you will NOT need to write code like this, since the "train" command will 
handle all of it for you based on what's in your config file. But we are going to show you
the essence of what happens when you call the "train" command.

## Reading
Here, we'll instantiate the `DatasetReader` we made for our Yelp review dataset. 
Note that we will not pass it any arguments, and instead use its default values for `tokenizer` and `token_indexers`.
A `tokenizer` turns a string into a list of tokens. 
Don't worry about `token_indexers` for now--just know that it's used to turn tokens into integers.

In [2]:
reader = YelpReviewJsonLinesReader()
print("Reader tokenizer:", reader.tokenizer)
print("Reader token indexers:", reader.token_indexers)

# Load our dev data and use list() to force it all into memory
instances = list(reader.read('data/dev_500.jsonl'))
sample_instance = instances[0]
print("\nExample instance:\n\n", sample_instance)

Reader tokenizer: <allennlp.data.tokenizers.letters_digits_tokenizer.LettersDigitsTokenizer object at 0x7f5cdef997c0>
Reader token indexers: {'tokens': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer object at 0x7f5cdef99af0>}

Example instance:

 Instance with fields:
 	 text: TextField of length 48 with text: 
 		[I, took, a, sampler, plate, which, was, good, but, it, was, pricy, ,, almost, about, CAD, 23, ,,
		personally, I, didn, ', t, like, the, salad, ,, there, was, a, kind, of, sea, vegetable, and, not,
		my, type, at, all, ., Place, is, quite, and, good, service, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 label: LabelField with label: 3 in namespace: 'labels'. 



## Vocabulary Setup
Our vocabulary helps us keep in a single place our mappings between strings and integers. 
Vocabularies can be thought of very simply as nested dictionaries with a depth of 2: the first key is the **namespace**, which helps keep logically distinct string values (e.g. POS tags and tokens) distinct; the second key is either the token, in which case the final value will be the token's corresponding integer value, or the integer value, in which case the value is the token's string value.

In [3]:
vocab = Vocabulary.from_instances(instances)
print("Vocabulary summary:\n\n", vocab)

print("Label mapping:\n", vocab.get_index_to_token_vocabulary("labels"))
print("\nFirst 10 token mappings:\n", {k: v for k, v in vocab.get_index_to_token_vocabulary("tokens").items() if k < 10})

print("\nIndex for \"dog\" in namespace \"tokens\":", vocab.get_token_index("dog", namespace="tokens"))
print("\nToken at index 540 in namespace \"tokens\":", vocab.get_token_from_index(540))

building vocab:   0%|          | 0/500 [00:00<?, ?it/s]

Vocabulary summary:

 Vocabulary with namespaces:
 	Non Padded Namespaces: {'*labels', '*tags'}
 	Namespace: tokens, Size: 6751 
 	Namespace: labels, Size: 5 

Label mapping:
 {0: '5', 1: '4', 2: '2', 3: '1', 4: '3'}

First 10 token mappings:
 {0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: '.', 3: 'the', 4: ',', 5: 'and', 6: 'I', 7: 'a', 8: 'to', 9: "'"}

Index for "dog" in namespace "tokens": 556

Token at index 540 in namespace "tokens": buns


## Data Loading
The next step in a training pipeline is to create a `DataLoader` which will sit on top of a list of `Instance`s and manage turning them into tensors.
Note the complicated structure of `text`: don't worry about it for now, we'll handle it in our next section.

In [4]:
loader = SimpleDataLoader(instances, batch_size=4, vocab=vocab)
batches = [batch for batch in loader]
sample_batch = batches[0]
print(sample_batch)

{'text': {'tokens': {'tokens': tensor([[   6,  219,    7,  ...,    0,    0,    0],
        [   6,  157,   19,  ...,    0,    0,    0],
        [3321,   77,  530,  ...,    0,    0,    0],
        [   6,   10,   47,  ..., 3345,   32,    2]])}}, 'label': tensor([4, 0, 2, 3])}


## Model Setup
Here we will load the model we trained in the previous section and use it to make predictions for our sample batch.
Note the format of the `probs` key in the output: there is one row per item in the batch, and every column corresponds to a particular value in the `"labels"` namespace in `vocab`.

In [6]:
embedding_dim = 50
archive_path = 'model' + os.sep + 'model.tar.gz'
model = Model.from_archive(archive_path)

sample_output = model(text=sample_batch["text"], label=sample_batch["label"])
print(sample_output)

{'probs': tensor([[0.2314, 0.2194, 0.1948, 0.1842, 0.1701],
        [0.2309, 0.1965, 0.1664, 0.1935, 0.2127],
        [0.2301, 0.2283, 0.2022, 0.1832, 0.1561],
        [0.2341, 0.2208, 0.2294, 0.1703, 0.1453]], grad_fn=<SoftmaxBackward>), 'loss': tensor(1.6514, grad_fn=<NllLossBackward>)}


In [9]:
# Exercise 13
# ==============================
new_instances = [
    reader.text_to_instance("Bad and slow service, burger tastes like cardboard", 0),
    reader.text_to_instance("The food here is always so yummy and our kid always has a great time!", 4)
]
small_loader = SimpleDataLoader(new_instances, batch_size=4, vocab=vocab)
small_batch = [batch for batch in small_loader][0]

def print_probs(output):
    for i, class_probs in enumerate(output['probs']):
        print(class_probs)
        print(f"Instance {i}:")
        print("  Text:", new_instances[i]["text"].tokens)
        print("  Text:", new_instances[i]["label"].label) 
        for rating in ["1", "2", "3", "4", "5"]:
            index = vocab.get_token_index(rating, namespace="labels")
            print(f"  {rating} stars: {class_probs[index].item() * 100:.2f}%")
            
print_probs(model(**small_batch))


tensor([0.2158, 0.2018, 0.1912, 0.2086, 0.1826], grad_fn=<UnbindBackward>)
Instance 0:
  Text: [Bad, and, slow, service, ,, burger, tastes, like, cardboard]
  Text: 1
  1 stars: 20.86%
  2 stars: 19.12%
  3 stars: 18.26%
  4 stars: 20.18%
  5 stars: 21.58%
tensor([0.2146, 0.1959, 0.1989, 0.2045, 0.1861], grad_fn=<UnbindBackward>)
Instance 1:
  Text: [The, food, here, is, always, so, yummy, and, our, kid, always, has, a, great, time, !]
  Text: 5
  1 stars: 20.45%
  2 stars: 19.89%
  3 stars: 18.61%
  4 stars: 19.59%
  5 stars: 21.46%


In [None]:
# Exercise 14
# ==============================
vocab