In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
# Later cells rely on this to show several intermediate objects per cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%%HTML
<!-- Give every rendered DataFrame cell (td/th) a solid black border and black text. -->
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [3]:
from collections import defaultdict, Counter
import json

import matplotlib.pyplot as plt
import numpy as np
import torch
import transformers
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def print_encoding(model_inputs, indent=4):
    """Print a tokenizer encoding as a brace-delimited, indented listing.

    Each entry is written on two lines: the key followed by a colon at one
    indentation level, then ``str(value)`` at two indentation levels.

    Args:
        model_inputs: Object exposing ``.items()`` that yields
            ``(key, value)`` pairs; keys must be strings because they are
            concatenated with the indentation.
        indent: Number of spaces that make up one indentation level.
    """
    pad = " " * indent
    nested_pad = pad + pad
    print("{")
    for field, value in model_inputs.items():
        print(pad + field + ":")
        print(nested_pad + str(value))
    print("}")

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and model are loaded from the same checkpoint name so that the
# ids produced by the tokenizer match what the model was trained on.
# initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('siebert/sentiment-roberta-large-english')
# initialize the model (with its sequence-classification head; the next cell
# reads `.logits` from its output)
model = AutoModelForSequenceClassification.from_pretrained('siebert/sentiment-roberta-large-english')

In [6]:
inputs = "I'm excited to learn about Hugging Face Transformers!"
# Encode the sentence as PyTorch tensors; the result is a batch holding one sequence.
tokenized_inputs = tokenizer(inputs, return_tensors='pt')
tokenized_inputs

# Forward pass; the bare expression below displays the raw output object.
outputs = model(**tokenized_inputs)
outputs

# Class names indexed by logit position.
# NOTE(review): assumed to match this checkpoint's label order - confirm
# against model.config.id2label.
labels = ['NEGATIVE', 'POSITIVE']
# Reduce over the class dimension (not the flattened tensor) and convert the
# one-element result to a plain int before indexing the Python list.
prediction = labels[torch.argmax(outputs.logits, dim=-1).item()]

print(f'the prediction is: {prediction}')

{'input_ids': tensor([[    0,   100,   437,  2283,     7,  1532,    59, 30581,  3923, 12346,
         34379,   328,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

SequenceClassifierOutput(loss=None, logits=tensor([[-3.7605,  2.9262]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

the prediction is: POSITIVE


In [7]:
# tokenizers

from transformers import DistilBertTokenizer, DistilBertTokenizerFast, AutoTokenizer

# Load the same checkpoint through three entry points and print each result:
# the slow tokenizer class, the fast tokenizer class, and AutoTokenizer.
# After the loop `tokenizer` is bound to the AutoTokenizer result, which is
# the tokenizer the following cells use.
for tokenizer_cls in (DistilBertTokenizer, DistilBertTokenizerFast, AutoTokenizer):
    tokenizer = tokenizer_cls.from_pretrained('distilbert-base-cased')
    print(tokenizer)

DistilBertTokenizer(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


In [8]:
input_str = "Hugging Face transformers is great!"
# Calling the tokenizer directly (no return_tensors argument) yields plain
# Python lists under the keys input_ids and attention_mask.
tokenized_inputs = tokenizer(input_str)

print(tokenized_inputs)

{'input_ids': [101, 20164, 10932, 10289, 11303, 1468, 1110, 1632, 106, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [9]:
# Show the same encoding one field per line using the print_encoding helper.
print_encoding(tokenized_inputs)

{
    input_ids:
        [101, 20164, 10932, 10289, 11303, 1468, 1110, 1632, 106, 102]
    attention_mask:
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}


In [10]:
# Ids of the special tokens, wrapped in single-element lists so they can be
# concatenated with the list of token ids below.
cls = [tokenizer.cls_token_id]
sep = [tokenizer.sep_token_id]

# tokenization happens in a few steps
# 1) split the text into (sub)word tokens
input_tokens = tokenizer.tokenize(input_str)
# 2) map each token to its vocabulary id
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
# 3) add the special tokens by hand: [CLS] in front, [SEP] at the end
input_ids_special_tokens = cls + input_ids + sep

# Round trip: turn the ids (including the special tokens) back into text.
decoded_str = tokenizer.decode(input_ids_special_tokens)


print("start:                   ", input_str)
print("tokenize:                ", input_tokens)
print("convert_tokens_to_ids:   ", input_ids)
print("add special tokens:      ", input_ids_special_tokens)
print("---------")
print("decode:                  ", decoded_str)

start:                    Hugging Face transformers is great!
tokenize:                 ['Hu', '##gging', 'Face', 'transform', '##ers', 'is', 'great', '!']
convert_tokens_to_ids:    [20164, 10932, 10289, 11303, 1468, 1110, 1632, 106]
add special tokens:       [101, 20164, 10932, 10289, 11303, 1468, 1110, 1632, 106, 102]
---------
decode:                   [CLS] Hugging Face transformers is great! [SEP]


In [11]:
# For fast tokenizers there is another option too: the underlying tokenizer
# object can encode the string directly. It is reached through the public
# `backend_tokenizer` property rather than the private `_tokenizer` attribute.
inputs = tokenizer.backend_tokenizer.encode(input_str)

# The returned encoding exposes its length, ids, tokens and a mask that
# marks which positions are special tokens.
print(input_str)
print("-" * 5)
print(f"Number of tokens: {len(inputs)}")
print(f"Ids: {inputs.ids}")
print(f"Tokens: {inputs.tokens}")
print(f"special_tokens_mask: {inputs.special_tokens_mask}")

Hugging Face transformers is great!
-----
Number of tokens: 10
Ids: [101, 20164, 10932, 10289, 11303, 1468, 1110, 1632, 106, 102]
Tokens: ['[CLS]', 'Hu', '##gging', 'Face', 'transform', '##ers', 'is', 'great', '!', '[SEP]']
special_tokens_mask: [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [12]:
# With return_tensors='pt' the tokenizer returns PyTorch tensors instead of
# Python lists; a single string still comes back with a leading batch
# dimension of size 1.
model_inputs = tokenizer(input_str, return_tensors='pt')
print('PyTorch Tensors:')
print_encoding(model_inputs)

PyTorch Tensors:
{
    input_ids:
        tensor([[  101, 20164, 10932, 10289, 11303,  1468,  1110,  1632,   106,   102]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
}


In [13]:
# Several strings can be encoded in one call. With padding=True the shorter
# sequence is filled with the pad token id (and zeros in the attention mask)
# so both rows have the same length and fit in one tensor.
# NOTE(review): "Transfomers" is misspelled; it is kept as-is because the
# recorded outputs of this and the following cells were produced from it.
model_inputs = tokenizer(
    [
        "Hugging Face Transfomers is great!",
        "The quick brown fox jumps over the lazy dog. Then the dog got up and ran away because she didn't like foxes.",
    ],
    padding=True,
    truncation=True,
    return_tensors='pt',
)
print(f"Pad token: {tokenizer.pad_token} | Pad token id: {tokenizer.pad_token_id}")
print("Padding:")
print_encoding(model_inputs)

Pad token: [PAD] | Pad token id: 0
Padding:
{
    input_ids:
        tensor([[  101, 20164, 10932, 10289, 13809, 14467, 19134,  1110,  1632,   106,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1109,  3613,  3058, 17594, 15457,  1166,  1103, 16688,  3676,
           119,  1599,  1103,  3676,  1400,  1146,  1105,  1868,  1283,  1272,
          1131,  1238,   112,   189,  1176, 17594,  1279,   119,   102]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])
}


In [14]:
# Decode every row of the padded batch back to text; the special tokens
# ([CLS], [SEP], [PAD]) are kept in the decoded strings.
tokenizer.batch_decode(model_inputs.input_ids)

['[CLS] Hugging Face Transfomers is great! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 "[CLS] The quick brown fox jumps over the lazy dog. Then the dog got up and ran away because she didn't like foxes. [SEP]"]

In [15]:
# Same decoding, but skip_special_tokens=True drops [CLS], [SEP] and [PAD]
# so only the original text remains.
tokenizer.batch_decode(model_inputs.input_ids, skip_special_tokens=True)

['Hugging Face Transfomers is great!',
 "The quick brown fox jumps over the lazy dog. Then the dog got up and ran away because she didn't like foxes."]

In [16]:
# Models

In [17]:
# AutoModelForSequenceClassification is imported here as well so that this
# cell does not depend on an earlier cell's import having been executed.
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

# initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('siebert/sentiment-roberta-large-english')
# initialize the model
# Bare encoder: use this when only the model's representations are needed.
# The checkpoint's classifier weights are not used by this class, which is
# what the "weights ... were not used" loading message reports.
model_rep = AutoModel.from_pretrained('siebert/sentiment-roberta-large-english')

# Same checkpoint loaded with a particular head.
# For instance, in this case we have the sequence classification head for sentiment analysis.
model_head = AutoModelForSequenceClassification.from_pretrained('siebert/sentiment-roberta-large-english')

Some weights of the model checkpoint at siebert/sentiment-roberta-large-english were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
inputs = "I'm excited to learn about Hugging Face Transformers!"
# One encoding is fed to both model variants so their outputs can be compared.
model_inputs = tokenizer(inputs, return_tensors="pt")
model_inputs

# Bare encoder: the output carries last_hidden_state and pooler_output, no logits.
outputs_representation = model_rep(**model_inputs)
outputs_representation

# Model with the classification head: the output carries logits.
outputs_sequence_classification_head = model_head(**model_inputs)
outputs_sequence_classification_head

{'input_ids': tensor([[    0,   100,   437,  2283,     7,  1532,    59, 30581,  3923, 12346,
         34379,   328,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0525, -0.1342, -0.5488,  ...,  0.1036,  0.0879, -0.2249],
         [-0.0568, -0.1349, -0.5527,  ...,  0.1023,  0.0883, -0.2277],
         [-0.0568, -0.1349, -0.5525,  ...,  0.1026,  0.0884, -0.2279],
         ...,
         [-0.0576, -0.1346, -0.5536,  ...,  0.1007,  0.0885, -0.2269],
         [-0.0565, -0.1349, -0.5526,  ...,  0.1028,  0.0883, -0.2282],
         [-0.0578, -0.1352, -0.5541,  ...,  0.1014,  0.0881, -0.2279]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.1733, -0.9172,  0.3356,  ...,  0.5361, -0.4033, -0.8523]],
       grad_fn=<TanhBackward0>), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.7605,  2.9262]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [133]:
# AutoTokenizer is imported here too, so the cell does not rely on an earlier
# cell's import having been executed.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DistilBertForSequenceClassification

check_point = 'distilbert-base-cased'
# NOTE: for this checkpoint the loader warns that the classifier and
# pre_classifier weights are newly initialized, i.e. the classification head
# is untrained. The logits and the label printed below are therefore not a
# meaningful sentiment prediction (presumably they also vary between runs
# unless a seed is set).
model_auto = AutoModelForSequenceClassification.from_pretrained(check_point)
# Explicit-class alternative to the Auto loader above:
# model_bert = DistilBertForSequenceClassification.from_pretrained(check_point, num_labels = 2)

tokenizer = AutoTokenizer.from_pretrained(check_point)

input_str = "the movie was good"
tokenized_inputs = tokenizer(input_str, return_tensors='pt')
tokenized_inputs

model_outputs = model_auto(**tokenized_inputs)
model_outputs

# Softmax over the class dimension turns the logits into a distribution.
print(f"Distribution over labels: {torch.softmax(model_outputs.logits, dim=1)}")

labels = ['NEGATIVE', 'POSITIVE']
# Reduce over the class dimension (not the flattened tensor) and convert the
# one-element result to a plain int before indexing the Python list.
print(labels[torch.argmax(model_outputs.logits, dim=-1).item()])

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias

{'input_ids': tensor([[ 101, 1103, 2523, 1108, 1363,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0588, -0.0790]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Distribution over labels: tensor([[0.5344, 0.4656]], grad_fn=<SoftmaxBackward0>)
NEGATIVE


In [135]:
# AutoTokenizer is imported here too, so the cell does not rely on an earlier
# cell's import having been executed.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DistilBertForSequenceClassification

# Checkpoint that ships with a sequence-classification head for sentiment.
check_point = 'siebert/sentiment-roberta-large-english'
# The Auto loader picks the model class for the checkpoint. The
# DistilBert-specific class imported above is not an alternative here: this
# checkpoint loads as a RoBERTa model.
model_auto = AutoModelForSequenceClassification.from_pretrained(check_point)

tokenizer = AutoTokenizer.from_pretrained(check_point)

input_str = "the movie was good"
tokenized_inputs = tokenizer(input_str, return_tensors='pt')
tokenized_inputs

model_outputs = model_auto(**tokenized_inputs)
model_outputs

# Softmax over the class dimension turns the logits into a distribution.
print(f"Distribution over labels: {torch.softmax(model_outputs.logits, dim=1)}")

# NOTE(review): label order assumed to match the checkpoint - confirm against
# model_auto.config.id2label.
labels = ['NEGATIVE', 'POSITIVE']
# Reduce over the class dimension (not the flattened tensor) and convert the
# one-element result to a plain int before indexing the Python list.
print(labels[torch.argmax(model_outputs.logits, dim=-1).item()])

{'input_ids': tensor([[   0,  627, 1569,   21,  205,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

SequenceClassifierOutput(loss=None, logits=tensor([[-3.6781,  2.8360]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Distribution over labels: tensor([[0.0015, 0.9985]], grad_fn=<SoftmaxBackward0>)
POSITIVE
