In [1]:
from allennlp.data.tokenizers import Tokenizer, LettersDigitsTokenizer, SpacyTokenizer, CharacterTokenizer, PretrainedTransformerTokenizer, WhitespaceTokenizer
from simple_classifier.dataset_reader import YelpReviewJsonLinesReader

# Instantiate one tokenizer of each kind (no indexers are created here) so the
# following cells can compare how each one splits the same sentence.
whitespace_tokenizer = WhitespaceTokenizer()
letters_digits_tokenizer = LettersDigitsTokenizer()
character_tokenizer = CharacterTokenizer()
spacy_tokenizer = SpacyTokenizer()
# Tokenizer loaded by pretrained model name; the recorded output below shows
# lower-cased word pieces wrapped in [CLS] ... [SEP].
distilbert_tokenizer = PretrainedTransformerTokenizer(model_name='distilbert-base-uncased')

In [2]:
# Exercise 16
# ==============================
# Run the same sentence through every tokenizer and print each result as a
# single space-joined line so the different splits can be compared by eye.
text = "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29."

# The tuple order fixes the order of the printed lines.
tokenizers_in_print_order = (
    letters_digits_tokenizer,
    spacy_tokenizer,
    whitespace_tokenizer,
    character_tokenizer,
    distilbert_tokenizer,
)
for current_tokenizer in tokenizers_in_print_order:
    token_texts = [token.text for token in current_tokenizer.tokenize(text)]
    print(" ".join(token_texts))

Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 29 .
Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.
P i e r r e   V i n k e n ,   6 1   y e a r s   o l d ,   w i l l   j o i n   t h e   b o a r d   a s   a   n o n e x e c u t i v e   d i r e c t o r   N o v .   2 9 .
[CLS] pierre vin ##ken , 61 years old , will join the board as a none ##x ##ec ##utive director nov . 29 . [SEP]


In [17]:
# Exercise 17
# ==============================
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data import Vocabulary

# Read the training split with a reader whose only token indexer is a
# SingleIdTokenIndexer registered under the key "tokens".
reader = YelpReviewJsonLinesReader(
    tokenizer=LettersDigitsTokenizer(),
    token_indexers={"tokens": SingleIdTokenIndexer()},
)
instances = reader.read('data/train_5000.jsonl')

# Build the vocabulary (the token <-> integer index mapping) from the instances
# that were just read. The printed summary below lists the namespaces it ends
# up containing.
vocab = Vocabulary.from_instances(instances)

print(vocab)

# Two indexers, each looking tokens up in its own vocabulary namespace.
single_id_indexer = SingleIdTokenIndexer(namespace="tokens")
character_indexer = TokenCharactersIndexer(namespace="token_characters")

# Word-level path: letters/digits tokens -> one id per token.
ldt_output = letters_digits_tokenizer.tokenize(text)
print("letters_digits_tokenizer output:\n\t", ldt_output)
ldt_indices = single_id_indexer.tokens_to_indices(ldt_output, vocab)
print("single_id_indexer output:\n\t", ldt_indices)
print()

# Character-level path: single-character tokens -> a list of character ids per token.
# NOTE(review): the vocabulary summary printed above shows only the "tokens" and
# "labels" namespaces, and the recorded output has every character id equal to 1.
# Presumably 1 is the out-of-vocabulary id of the unpopulated "token_characters"
# namespace; the final print looks that index up to show which token it is.
ct_output = character_tokenizer.tokenize(text)
print("character_tokenizer output:\n\t", ct_output)
ct_indices = character_indexer.tokens_to_indices(ct_output, vocab)
print("character_indexer output:\n\t", ct_indices)
token_at_index_one = vocab.get_token_from_index(1, "token_characters")
print("token in \"token_characters\" at index 1:\n\t", token_at_index_one)

building vocab: 0it [00:00, ?it/s]

Vocabulary with namespaces:
 	Non Padded Namespaces: {'*labels', '*tags'}
 	Namespace: tokens, Size: 24300 
 	Namespace: labels, Size: 5 

letters_digits_tokenizer output:
	 [Pierre, Vinken, ,, 61, years, old, ,, will, join, the, board, as, a, nonexecutive, director, Nov, ., 29, .]
single_id_indexer output:
	 {'tokens': [1, 1, 4, 5139, 251, 257, 4, 93, 3402, 3, 2087, 44, 7, 1, 1, 12633, 2, 4671, 2]}

character_tokenizer output:
	 [P, i, e, r, r, e,  , V, i, n, k, e, n, ,,  , 6, 1,  , y, e, a, r, s,  , o, l, d, ,,  , w, i, l, l,  , j, o, i, n,  , t, h, e,  , b, o, a, r, d,  , a, s,  , a,  , n, o, n, e, x, e, c, u, t, i, v, e,  , d, i, r, e, c, t, o, r,  , N, o, v, .,  , 2, 9, .]
character_indexer output:
	 {'token_characters': [[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]