In [2]:
import torch
import torch.nn as nn
from transformers import (
    BertTokenizer,
    BertModel,
    BertForMaskedLM,
    BertForSequenceClassification,
    # BertForMultipleChoice,
    # BertForTokenClassification,
    # BertForQuestionAnswering,
)


### Base Model

For complete guide, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForQuestionAnswering).

In [3]:
base_model = BertModel.from_pretrained("google-bert/bert-base-uncased")
print(base_model)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### Masked language modeling

Example: `[CLS] the quick brown fox [MASK] over the wall [SEP]`

In [4]:
mlm_model = BertForMaskedLM.from_pretrained("google-bert/bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
print(mlm_model)


Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [None]:
inputs = [
    "canberra is the [MASK] of australia",
    "the quick brown fox [MASK] over the wall",
    "apple [MASK] is delicious",
    ]
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, max_length=50, return_tensors="pt")
print("Text encoded as tokens:")
print(encoded_inputs.input_ids)
print("Special characters used by the model:")
print(tokenizer.decode(101))
print(tokenizer.decode(102))
print(tokenizer.decode(103))


Text encoded as tokens:
tensor([[  101, 13107,  2003,  1996,   103,  1997,  2660,   102,     0,     0],
        [  101,  1996,  4248,  2829,  4419,   103,  2058,  1996,  2813,   102],
        [  101,  6207,   103,  2003, 12090,   102,     0,     0,     0,     0],
        [  101,  4940,  2003,  1996,   103,  2103,  1997,  3848,   102,     0]])
Special characters used by the model:
[CLS]
[SEP]
[MASK]


In [6]:
mask_positions = encoded_inputs.input_ids == 103
output = mlm_model(**encoded_inputs)
masked_output = output.logits[mask_positions]
masked_texts = tokenizer.decode(masked_output.argmax(dim=-1))
print("Predictions for the MASK:")
print(masked_texts)


Predictions for the MASK:
capital jumped pie capital


### Sequence classification

Example: fine-tune to predict the class label of input text, such as `[CLS] hello, my dog is cute [SEP]`

In [11]:
cls_model = BertForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=3
    )
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
print(cls_model)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
inputs = [
    "hello, my dog is cute",
    "the weather is nice"
]
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
output         = cls_model(**encoded_inputs)
for i in range(output.logits.shape[0]):
    logits = output.logits[i]
    pred_proba = nn.functional.softmax(logits, dim=0)
    pred_class = torch.argmax(pred_proba)
    print(pred_proba, pred_class)


tensor([0.3836, 0.3341, 0.2823], grad_fn=<SoftmaxBackward0>) tensor(0)
tensor([0.4005, 0.3623, 0.2373], grad_fn=<SoftmaxBackward0>) tensor(0)
