In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%%HTML
<style type="text/css">
/* Force a solid black border and black text on every cell of tables with class "dataframe" */
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [3]:
from collections import defaultdict, Counter
import json

import matplotlib.pyplot as plt
import numpy as np
import torch
import transformers
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def print_encoding(model_inputs, indent=4):
    """Pretty-print a mapping (e.g. a tokenizer encoding) in a brace-wrapped layout.

    Every key is printed on its own line, indented by ``indent`` spaces and
    followed by a colon; its value is printed on the following line, indented
    by twice that amount.

    Args:
        model_inputs: Any object exposing ``.items()`` that yields
            ``(key, value)`` pairs.
        indent: Number of spaces used for one indentation level.

    Returns:
        None. The formatted text is written to stdout.
    """
    key_pad = " " * indent
    value_pad = key_pad * 2
    print("{")
    for key, value in model_inputs.items():
        # str() on the key keeps non-string keys (e.g. ints) from raising
        # TypeError during concatenation.
        print(key_pad + str(key) + ":")
        # Explicit str() (not an f-string) so objects with a custom __format__
        # are still rendered through their str() form.
        print(value_pad + str(value))
    print("}")

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the classifier are loaded from the same checkpoint name,
# so keep that name in a single place.
SENTIMENT_CHECKPOINT = 'siebert/sentiment-roberta-large-english'

# tokenizer: turns raw text into the tensors the model consumes
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
# model: sequence-classification head on top of the pretrained encoder
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

In [6]:
inputs = "I'm excited to learn about Hugging Face Transformers!"
tokenized_inputs = tokenizer(inputs, return_tensors='pt')
tokenized_inputs

# Inference only: disable autograd so the forward pass does not build a graph
with torch.no_grad():
    outputs = model(**tokenized_inputs)
outputs

# Read the label names from the checkpoint's own config instead of hard-coding
# their order; `labels` stays a list indexed by class id.
labels = [model.config.id2label[class_id] for class_id in sorted(model.config.id2label)]

# arg-max over the class axis (not over the flattened logits tensor); the batch
# holds exactly one sentence, so .item() yields a plain Python int
predicted_class_id = outputs.logits.argmax(dim=-1).item()
prediction = labels[predicted_class_id]

print(f'the prediction is: {prediction}')

{'input_ids': tensor([[    0,   100,   437,  2283,     7,  1532,    59, 30581,  3923, 12346,
         34379,   328,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

SequenceClassifierOutput(loss=None, logits=tensor([[-3.7605,  2.9262]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

the prediction is: POSITIVE


In [7]:
# tokenizers

from transformers import DistilBertTokenizer, DistilBertTokenizerFast, AutoTokenizer

# Load the tokenizer of one checkpoint through three different classes and print
# each repr so they can be compared (see the `is_fast` field in the output).
# When the loop finishes, `tokenizer` is the one produced by AutoTokenizer.
for tokenizer_cls in (DistilBertTokenizer, DistilBertTokenizerFast, AutoTokenizer):
    tokenizer = tokenizer_cls.from_pretrained('distilbert-base-cased')
    print(tokenizer)

DistilBertTokenizer(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


In [8]:
# Calling the tokenizer directly on a string encodes it in one step; without
# return_tensors the printed result holds plain Python lists.
input_str = "Hugging Face transformers is great!"
tokenized_inputs = tokenizer(input_str)

print(tokenized_inputs)

{'input_ids': [101, 20164, 10932, 10289, 11303, 1468, 1110, 1632, 106, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [9]:
# Pretty-print the encoding: each key on its own line with its value beneath it
print_encoding(tokenized_inputs)

{
    input_ids:
        [101, 20164, 10932, 10289, 11303, 1468, 1110, 1632, 106, 102]
    attention_mask:
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}


In [23]:
# Tokenization happens in a few steps; do each one by hand here.

# 1) split the raw text into sub-word tokens
input_tokens = tokenizer.tokenize(input_str)
# 2) look up the vocabulary id of every token
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
# 3) surround the ids with the [CLS] / [SEP] marker ids
cls, sep = [tokenizer.cls_token_id], [tokenizer.sep_token_id]
input_ids_special_tokens = [*cls, *input_ids, *sep]

# ...and back again: ids -> text
decoded_str = tokenizer.decode(input_ids_special_tokens)


for caption, shown in (
    ("start:                   ", input_str),
    ("tokenize:                ", input_tokens),
    ("convert_tokens_to_ids:   ", input_ids),
    ("add special tokens:      ", input_ids_special_tokens),
):
    print(caption, shown)
print("---------")
print("decode:                  ", decoded_str)

start:                    Hugging Face transformers is great!
tokenize:                 ['Hu', '##gging', 'Face', 'transform', '##ers', 'is', 'great', '!']
convert_tokens_to_ids:    [20164, 10932, 10289, 11303, 1468, 1110, 1632, 106]
add special tokens:       [101, 20164, 10932, 10289, 11303, 1468, 1110, 1632, 106, 102]
---------
decode:                   [CLS] Hugging Face transformers is great! [SEP]


In [49]:
# for fast tokenizers there is another option too: encode with the underlying
# backend tokenizer, whose result exposes ids, tokens and masks as attributes.
# Use the public `backend_tokenizer` property rather than the private `_tokenizer`.
inputs = tokenizer.backend_tokenizer.encode(input_str)

print(input_str)
print("-" * 5)
print(f"Number of tokens: {len(inputs)}")
print(f"Ids: {inputs.ids}")
print(f"Tokens: {inputs.tokens}")
print(f"special_tokens_mask: {inputs.special_tokens_mask}")

Hugging Face transformers is great!
-----
Number of tokens: 10
Ids: [101, 20164, 10932, 10289, 11303, 1468, 1110, 1632, 106, 102]
Tokens: ['[CLS]', 'Hu', '##gging', 'Face', 'transform', '##ers', 'is', 'great', '!', '[SEP]']
special_tokens_mask: [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [50]:
# the tokenizer can return pytorch tensors
# (with return_tensors='pt' each field is a 2-D tensor, one row per input, instead of a plain list)
model_inputs = tokenizer(input_str, return_tensors='pt')
print('PyTorch Tensors:')
print_encoding(model_inputs)

PyTorch Tensors:
{
    input_ids:
        tensor([[  101, 20164, 10932, 10289, 11303,  1468,  1110,  1632,   106,   102]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
}


In [77]:
# A list of strings is encoded as one batch. padding=True pads the shorter
# sequence up to the longest one in the batch, which is what allows the result
# to be returned as a single rectangular tensor.
padding_demo_batch = [
    "Hugging Face Transfomers is great!",
    "The quick brown fox jumps over the lazy dog. Then the dog got up and ran away because she didn't like foxes.",
]
model_inputs = tokenizer(
    padding_demo_batch,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# show which token is used as filler, then the padded encoding itself
print(f"Pad token: {tokenizer.pad_token} | Pad token id: {tokenizer.pad_token_id}")
print("Padding:")
print_encoding(model_inputs)

Pad token: [PAD] | Pad token id: 0
Padding:
{
    input_ids:
        tensor([[  101, 20164, 10932, 10289, 13809, 14467, 19134,  1110,  1632,   106,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1109,  3613,  3058, 17594, 15457,  1166,  1103, 16688,  3676,
           119,  1599,  1103,  3676,  1400,  1146,  1105,  1868,  1283,  1272,
          1131,  1238,   112,   189,  1176, 17594,  1279,   119,   102]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])
}


In [79]:
# Decode every row of ids back to text; special tokens ([CLS], [SEP], [PAD]) are kept
tokenizer.batch_decode(model_inputs.input_ids)

['[CLS] Hugging Face Transfomers is great! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 "[CLS] The quick brown fox jumps over the lazy dog. Then the dog got up and ran away because she didn't like foxes. [SEP]"]

In [80]:
# skip_special_tokens=True drops [CLS], [SEP] and [PAD] from the decoded text
tokenizer.batch_decode(model_inputs.input_ids, skip_special_tokens=True)

['Hugging Face Transfomers is great!',
 "The quick brown fox jumps over the lazy dog. Then the dog got up and ran away because she didn't like foxes."]