In [61]:
from rich import print
from transformers import pipeline
import torch.nn.functional as F

Personal playground containing recreations of examples from the Hugging Face tutorial course.

### Table of Contents
* [`pipeline` demos](#pipeline-demos)
* [`pipeline` under the hood](#pipeline-breakdown)
* [`Model` intro](#model-intro)
* [`Tokenizer` intro](#tokenizer-intro)
* [Towards multiple sentences](#multiple-sentences)

In [63]:
#### TLDR Code ####
# End-to-end sentiment classification in a single cell: tokenize a small batch
# and run it through a classification model loaded from the same checkpoint.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]

# Tokenizer and model are both loaded from one checkpoint name.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# return_tensors="pt" asks for PyTorch tensors; padding/truncation are enabled
# so the two sentences can share one batch.
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)  # result is kept in `output` (singular)
print(output)

In [64]:
# Turn the logits of the TLDR forward pass into class probabilities.
# The TLDR cell stores its result in `output` (singular). Reading `outputs`
# here raised NameError on a fresh kernel, or silently reused a stale result
# left behind by a different cell.
predictions = F.softmax(output.logits, dim=-1)  # normalize over the last axis
print(predictions)
print(model.config.id2label)  # index -> label name, to interpret the columns

### Quick Access to Task-specific Models with `pipeline`<a class="anchor" id="pipeline-demos"></a>

In [3]:
# Basic usage of pipeline: build it from a task name, then call it on text.
# No model is named, so the library falls back to its default for the task.
classifier = pipeline("sentiment-analysis")
sample_text = "I've been waiting for a HuggingFace course my whole life"
classifier(sample_text)  # bare last expression -> displayed as the cell output

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9516071081161499}]

In [4]:
# The same pipeline object also accepts a list of texts in a single call.
texts = [
    "I've been waiting for a HuggingFace course my whole life",
    "I hate this so much!",
]
classifier(texts)

[{'label': 'POSITIVE', 'score': 0.9516071081161499},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [5]:
# Zero-shot classification pipeline: the candidate labels are supplied at call
# time instead of being fixed by the model.
classifier = pipeline("zero-shot-classification")
candidate_labels = ["education", "politics", "business"]
classifier(
    "This is a course about the Transformers library",
    candidate_labels=candidate_labels,
)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'sequence': 'This is a course about the Transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8445985913276672, 0.11197447776794434, 0.04342697560787201]}

In [6]:
# Text generation: the pipeline continues the given prompt.
generator = pipeline("text-generation")
prompt = "In this course, we will teach you how to"
generator(prompt)

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to build a complete Java application using J2EE 8 and Java EE 8, both of which include cross platform execution frameworks. We will cover both Java EE and Java 8 to implement code generation and a series'}]

In [7]:
# Text generation with an explicitly chosen model:
# pipeline(<task>, model=<model>)
generator = pipeline("text-generation", model="distilgpt2")

# Generation options are collected in one place and unpacked into the call.
generation_kwargs = {"max_length": 30, "num_return_sequences": 2}
generator("In this course, we will teach you how to", **generation_kwargs)

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to make a good bet. It gives you the information that you need to follow the course. If you'},
 {'generated_text': 'In this course, we will teach you how to be a good programmer. With this course, we will make you more able to learn how to use'}]

In [8]:
# The fill-mask pipeline predicts the word hidden behind the mask token;
# top_k limits how many candidates are returned.
unmasker = pipeline("fill-mask")
masked_sentence = "This course will teach you all about <mask> models."
unmasker(masked_sentence, top_k=2)

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[{'score': 0.196197971701622,
  'token': 30412,
  'token_str': ' mathematical',
  'sequence': 'This course will teach you all about mathematical models.'},
 {'score': 0.04052729904651642,
  'token': 38163,
  'token_str': ' computational',
  'sequence': 'This course will teach you all about computational models.'}]

In [9]:
# NER (Named Entity Recognition) pipeline identifies entities such as persons, orgs, locations.
# `grouped_entities=True` is deprecated in transformers. Its documented
# replacement is `aggregation_strategy="simple"`, which likewise merges the
# tokens of one entity into a single `entity_group` span.
ner = pipeline("ner", aggregation_strategy="simple")
ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]



[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.97960186,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [10]:
# The question-answering pipeline extracts the answer span from the context.
question_answerer = pipeline("question-answering")
qa_example = {
    "question": "Where do I work?",
    "context": "My name is Sylvain and I work at Hugging Face in Brooklyn",
}
question_answerer(**qa_example)  # unpacked as question=..., context=...

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.6949759125709534, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}

### Breaking Down Pipeline<a class="anchor" id="pipeline-breakdown"></a>
The `pipeline` function is an abstraction of three separate steps:
* Preprocessing with a Tokenizer (Raw Text => Input IDs)
* Process Inputs with a Model (Input IDs => Logits)
* Postprocessing (Logits => Predictions)

We demonstrate this with the following example of the sentiment analysis task

#### Preprocessing

In [14]:
from transformers import AutoTokenizer

In [15]:
# Load the tokenizer that belongs to the sentiment-analysis checkpoint, so the
# text is preprocessed the way this checkpoint expects.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [18]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
# Pad/truncate so both sentences fit one batch, and return PyTorch tensors.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

# The result holds two entries:
# * input_ids: two rows of integers that are unique identifiers of tokens in each sentence
# * attention_mask: same layout as input_ids; 1 marks a real token, 0 marks a
#   padding position that the attention layers should ignore
print(inputs)

#### Model

In [19]:
from transformers import AutoModel

# This architecture is the base Transformer module: inputs => hidden states.
# It is loaded without a task head, which is why the library reports the
# checkpoint's classifier weights as unused.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModel.from_pretrained(checkpoint) # Outputs hidden states (a.k.a. features)

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# For every input the base model returns a high-dimensional vector
# representation (the hidden states).
outputs = model(**inputs)

# Shape is (batch size, sequence length, hidden size):
# - Batch size: # of sequences processed at a time (2)
# - Sequence length: Length of numerical repr. of sequence (16)
# - Hidden size: Vector dimension of each model input (768)
last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)

In [23]:
# HF also has models that include task-specific heads (output logits, not hidden states)
# List: ForCausalLM, ForMaskedLM, ForMultipleChoice, ForQuestionAnswering,
#   ForSequenceClassification, ForTokenClassification

# Re-run the same tokenized `inputs`, this time through the model with a
# sequence-classification head; `model` and `outputs` are rebound here.
from transformers import AutoModelForSequenceClassification
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)

In [26]:
# Show the full output object of the classification model; its `.logits`
# field is what the post-processing step reads.
print(outputs)

#### Post Processing
* Model predicted logits (raw, unnormalized scores outputted by model's last layer)
* To convert to a probability, they must go through a `softmax` layer

In [28]:
# Convert logits to probabilities by normalizing over the last axis.
logits = outputs.logits
predictions = F.softmax(logits, dim=-1)
print(predictions)

In [29]:
# Map the indices to the corresponding label: position i in each row of
# `predictions` belongs to the label stored under key i.
# Left as a bare expression so the notebook displays the dict.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

### Intro to `Model`<a class="anchor" id="model-intro"></a>

In [30]:
from transformers import BertConfig, BertModel

# Build the config (default BERT hyperparameters, no arguments overridden)
config = BertConfig()

# Build model from config
model = BertModel(config)

# The above creates a model initialized with random values (untrained)

In [31]:
# Show the hyperparameters held by the default BertConfig built above.
print(config)

In [32]:
# Continuing from the above, we could train the randomly initialized model
# from scratch, but that is very costly.
# Instead, reuse existing weights by loading a saved checkpoint.
model = BertModel.from_pretrained("bert-base-cased")

# Loading a checkpoint should not interfere with the rest of the pipeline, assuming the architecture remains the same

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
import os

# The target directory used to be a hard-coded, machine-specific absolute
# path. It is kept as the default so existing behavior is unchanged, but it can
# now be redirected with the HF_MISC_DIR environment variable so this cell also
# runs on other machines.
save_dir = os.environ.get("HF_MISC_DIR", "/n/fs/nlp-jy1682/.cache/huggingface/misc/")
model.save_pretrained(save_dir)
# Saves the model locally as two files in `save_dir`:
# * `config.json`: attributes of the model architecture plus metadata
# * `pytorch_model.bin`: state dictionary (contains all model weights)
#   NOTE(review): newer transformers releases may write `model.safetensors`
#   instead; confirm against the installed version.

In [34]:
# Inference
# - Models only consume numbers, so text must be turned into token IDs first.
sequences = ["Hello!", "Cool.", "Nice!"]

# One row of token IDs per entry in `sequences`. All rows have equal length,
# so the nested list converts directly into a rectangular tensor.
# NOTE(review): these IDs are hard-coded rather than produced by a tokenizer in
# this notebook; confirm they match the vocabulary of the loaded checkpoint.
encoded_sequences = [
    [101, 7592, 999, 102],
    [101, 4658, 1012, 102],
    [101, 3835, 999, 102],
]
model_inputs = torch.tensor(encoded_sequences)
output = model(input_ids=model_inputs)
print(output)  # hidden-state representation, since this is the base BERT model

### Intro to `Tokenizer`<a class="anchor" id="tokenizer-intro"></a>
Encoding: Translate text to numbers
1. Tokenization: split text into tokens (words, subwords, or symbols)
2. Conversion to input IDs

In [35]:
# Load the BERT-specific tokenizer class from a checkpoint name.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased") # Same checkpoint can be used for BERT Model, Tokenizer

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [36]:
# Alternative: load through the generic AutoTokenizer entry point with the
# same checkpoint name. This rebinds `tokenizer` for the cells below.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [40]:
# Calling the tokenizer directly performs the whole text -> model-input
# encoding in one step.
encoding = tokenizer("Using a Transformer network is simple")
print(encoding)

In [42]:
# Step 1 of encoding on its own: split the text into tokens.
# This is a subword tokenizer, so a single word may be split into pieces.
text = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(text)
print(tokens)

In [43]:
# Step 2 of encoding: map the tokens from the previous cell to input IDs.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

In [44]:
# Decoding goes the other way: turn the input IDs back into a text string.
decoded_string = tokenizer.decode(ids)
print(decoded_string)

### Towards Multiple Sentences<a class="anchor" id="multiple-sentences"></a>
* How to handle multiple sequences (of different lengths)?
* Are vocab indices the only inputs that allow a model to work well?

In [46]:
# - Models expect a batch of inputs -
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classification model, both loaded from the same checkpoint
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Sentence -> Tokens -> IDs, done as two explicit steps
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Wrapping `ids` in an outer list adds the batch dimension: a batch of one.
input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

# Forward pass on the one-element batch
output = model(input_ids=input_ids)
print("Logits:", output.logits)

In [48]:
# - Padding Inputs w/ Different Lengths -
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
# The same two sequences as one rectangular batch; the shorter row is filled
# up with the tokenizer's padding token.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Logits for each sequence on its own, then for the padded batch.
for id_rows in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(id_rows)).logits)

# The second row of the batched result differs from the result for
# sequence2_ids alone: representations are contextualized, so the padding
# token influences the other positions unless it is masked out.

In [50]:
# To recover the unpadded result, tell the attention layers to ignore padding:
# 1 = attend to this position, 0 = skip it (the padded slot of row two).
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]
batch_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)
outputs = model(batch_tensor, attention_mask=mask_tensor)

# The second row should now match the logits obtained for sequence2_ids alone.
print(outputs.logits)