In [86]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [87]:
# The vocab is an ordered dictionary - key/value pairs.
# This is how to see which tokens are associated with a particular word.

bert_vocab = tokenizer.vocab

print(bert_vocab['[CLS]'])
print(bert_vocab['[SEP]'])
print(bert_vocab['[PAD]'])

print(bert_vocab['hello'])
print(bert_vocab['world'])

101
102
0
7592
2088


In [88]:
## Displaying format of inputs to model

MAX_LEN = 10  # This value could be set as 256, 512 etc.

sentence1 = 'Hello there.'

encoded_dict = tokenizer.encode_plus(
    sentence1,  # Sentence to encode.
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    max_length=MAX_LEN,  # Pad or truncate.
    padding='max_length',
    return_tensors='pt',  # Return pytorch tensors.
)

encoded_dict

{'input_ids': tensor([[ 101, 7592, 2045, 1012,  102,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [89]:
input_ids = encoded_dict['input_ids'][0]
token_type_ids = encoded_dict['token_type_ids'][0]
att_mask = encoded_dict['attention_mask'][0]

# These are torch tensors.
print(input_ids)
print(token_type_ids)
print(att_mask)


tensor([ 101, 7592, 2045, 1012,  102,    0,    0,    0,    0,    0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])


In [90]:
## Decoding a sequence of tokens

# skip_special_tokens – if this is set to True, then special tokens will be replaced.

# Note that do_lower_case=True in the tokenizer.
# This is why all text is lower case.

a = tokenizer.decode(input_ids,
                     skip_special_tokens=False)

b = tokenizer.decode(input_ids,
                     skip_special_tokens=True)

print(a)
print(b)

[CLS] hello there. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]
hello there.


In [91]:
from transformers import XLMRobertaTokenizerFast

MODEL_TYPE = 'xlm-roberta-base'
tokenizer = XLMRobertaTokenizerFast.from_pretrained(MODEL_TYPE)

In [92]:
tokenizer.vocab_size

250002

In [93]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [94]:
sentence1 = 'Hello there.'

encoded_dict = tokenizer.encode_plus(
    sentence1,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding="max_length",
    return_attention_mask=True,
    return_tensors='pt'  # return pytorch tensors
)

print(encoded_dict)

{'input_ids': tensor([[    0, 35378,  2685,     5,     2,     1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}


In [95]:
input_ids = encoded_dict['input_ids'][0]
att_mask = encoded_dict['attention_mask'][0]

print(input_ids)
print(att_mask)

tensor([    0, 35378,  2685,     5,     2,     1,     1,     1,     1,     1])
tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])


In [96]:
sentence1 = 'Hello there.'
sentence2 = 'How are you?'

encoded_dict = tokenizer.encode_plus(
    sentence1, sentence2,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt'  # return pytorch tensors
)

print(encoded_dict)

{'input_ids': tensor([[    0, 35378,  2685,     5,     2,     2, 11249,   621,   398,    32,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [97]:
input_ids = encoded_dict['input_ids'][0]
att_mask = encoded_dict['attention_mask'][0]

# These are torch tensors.
print(input_ids)
print(att_mask)

tensor([    0, 35378,  2685,     5,     2,     2, 11249,   621,   398,    32,
            2])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [98]:
a = tokenizer.decode(input_ids,
                     skip_special_tokens=False)

b = tokenizer.decode(input_ids,
                     skip_special_tokens=True)

print(a)
print(b)

<s> Hello there.</s></s> How are you?</s>
Hello there. How are you?


In [103]:
MAX_LEN = 15  # This value could be set as 256, 512 etc.

sentence1 = 'Hello there. How are you? Have a nice day. This is a test?'

encoded_dict = tokenizer(
    sentence1,
    max_length=MAX_LEN,
    stride=3,
    pad_to_max_length=True,
    return_overflowing_tokens=True,
)
print(encoded_dict)
# print(encoded_dict['input_ids'])
# print(encoded_dict['overflowing_tokens'])

decoded_line = tokenizer.decode(
    token_ids=encoded_dict['input_ids'][0],
)
print(decoded_line)

{'input_ids': [[0, 35378, 2685, 5, 11249, 621, 398, 32, 31901, 10, 26267, 5155, 5, 3293, 2], [0, 5155, 5, 3293, 83, 10, 3034, 32, 2, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], 'overflow_to_sample_mapping': [0, 0]}
<s> Hello there. How are you? Have a nice day. This</s>
