In [31]:
from allennlp.data.tokenizers import Tokenizer, LettersDigitsTokenizer, SpacyTokenizer, CharacterTokenizer, PretrainedTransformerTokenizer, WhitespaceTokenizer
from simple_classifier.dataset_reader import YelpReviewJsonLinesReader

# Instantiate one tokenizer of each kind so that later cells can compare how
# they split the same sentence. Only tokenizers are created in this cell.

# Tokenizers constructed with their default settings.
letters_digits_tokenizer = LettersDigitsTokenizer()
spacy_tokenizer = SpacyTokenizer()
character_tokenizer = CharacterTokenizer()
whitespace_tokenizer = WhitespaceTokenizer()

# Tokenizer backed by a pretrained transformer, selected by its model name.
distilbert_tokenizer = PretrainedTransformerTokenizer(model_name='distilbert-base-uncased')

In [32]:
# Exercise 16
# ==============================
# Run one sentence through every tokenizer built earlier in the notebook and
# print each result as a single space-joined line for side-by-side comparison.
text = "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29."

# The tuple order fixes the order of the printed lines.
tokenizers_to_compare = (
    letters_digits_tokenizer,
    spacy_tokenizer,
    whitespace_tokenizer,
    character_tokenizer,
    distilbert_tokenizer,
)
for current_tokenizer in tokenizers_to_compare:
    pieces = [token.text for token in current_tokenizer.tokenize(text)]
    print(" ".join(pieces))

Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 29 .
Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.
P i e r r e   V i n k e n ,   6 1   y e a r s   o l d ,   w i l l   j o i n   t h e   b o a r d   a s   a   n o n e x e c u t i v e   d i r e c t o r   N o v .   2 9 .
[CLS] pierre vin ##ken , 61 years old , will join the board as a none ##x ##ec ##utive director nov . 29 . [SEP]


In [39]:
# Exercise 17
# ==============================
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data import Vocabulary

# Configure a reader whose instances are indexed under two keys: "tokens"
# (one id per token) and "chars" (character ids per token).
train_token_indexers = {
    "tokens": SingleIdTokenIndexer(),
    "chars": TokenCharactersIndexer(),
}
reader = YelpReviewJsonLinesReader(
    tokenizer=LettersDigitsTokenizer(),
    token_indexers=train_token_indexers,
)

# Read the training file and build the vocabulary from its instances.
instances = reader.read('data/train_5000.jsonl')
vocab = Vocabulary.from_instances(instances)
print(vocab)

# Stand-alone indexers bound to the vocabulary namespaces they look up.
single_id_indexer = SingleIdTokenIndexer(namespace="tokens")
character_indexer = TokenCharactersIndexer(namespace="token_characters")

# Word level: `text` and `letters_digits_tokenizer` come from earlier cells.
word_tokens = letters_digits_tokenizer.tokenize(text)
print(word_tokens)
print(single_id_indexer.tokens_to_indices(word_tokens, vocab))

# Show which entry sits at index 1 of the character namespace.
print(vocab.get_token_from_index(1, "token_characters"))

# Character level: `character_tokenizer` also comes from an earlier cell.
char_tokens = character_tokenizer.tokenize(text)
print(char_tokens)
print(character_indexer.tokens_to_indices(char_tokens, vocab))

building vocab: 0it [00:00, ?it/s]

Vocabulary with namespaces:
 	Non Padded Namespaces: {'*tags', '*labels'}
 	Namespace: tokens, Size: 24300 
 	Namespace: token_characters, Size: 93 
 	Namespace: labels, Size: 5 

[Pierre, Vinken, ,, 61, years, old, ,, will, join, the, board, as, a, nonexecutive, director, Nov, ., 29, .]
{'tokens': [1, 1, 4, 5139, 251, 257, 4, 93, 3402, 3, 2087, 44, 7, 1, 1, 12633, 2, 4671, 2]}
@@UNKNOWN@@
[P, i, e, r, r, e,  , V, i, n, k, e, n, ,,  , 6, 1,  , y, e, a, r, s,  , o, l, d, ,,  , w, i, l, l,  , j, o, i, n,  , t, h, e,  , b, o, a, r, d,  , a, s,  , a,  , n, o, n, e, x, e, c, u, t, i, v, e,  , d, i, r, e, c, t, o, r,  , N, o, v, .,  , 2, 9, .]
{'token_characters': [[37], [6], [2], [9], [9], [2], [1], [65], [6], [8], [24], [2], [8], [25], [1], [72], [53], [1], [16], [2], [4], [9], [7], [1], [5], [12], [11], [25], [1], [15], [6], [12], [12], [1], [36], [5], [6], [8], [1], [3], [10], [2], [1], [22], [5], [4], [9], [11], [1], [4], [7], [1], [4], [1], [8], [5], [8], [2], [32], [2], [14], [13], [3