# Chapter 8: Pretrained Models for Natural Language Processing

In [None]:
!pip install portalocker transformers

## 8.3 Natural Language Processing

### 8.3.1 Model

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step1.png)

In [15]:
import torch
import torchtext

# Pre-built bundle for the RoBERTa base encoder. Displaying it shows the
# encoder configuration, the URL of the weights and that no head is attached.
roberta_base = torchtext.models.ROBERTA_BASE_ENCODER
roberta_base

RobertaBundle(_encoder_conf=RobertaEncoderConf(vocab_size=50265, embedding_dim=768, ffn_dimension=3072, padding_idx=1, max_seq_len=514, num_attention_heads=12, num_encoder_layers=12, dropout=0.1, scaling=None, normalize_before=False), _path='https://download.pytorch.org/models/text/roberta.base.encoder.pt', _head=None, transform=<function <lambda> at 0x7f5d970f10d0>)

In [2]:
# Instantiate the model described by the bundle and display its architecture.
model = roberta_base.get_model()
model

RobertaModel(
  (encoder): RobertaEncoder(
    (transformer): TransformerEncoder(
      (token_embedding): Embedding(50265, 768, padding_idx=1)
      (layers): TransformerEncoder(
        (layers): ModuleList(
          (0-11): 12 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (linear1): Linear(in_features=768, out_features=3072, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=3072, out_features=768, bias=True)
            (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (positional_embedding): PositionalEmbedding(
        (embedding): Embedding(51

In [3]:
# The bundle was defined with `_head=None`, so nothing is expected to be
# displayed here: the model is a bare encoder.
model.head

### 8.3.2 Transforms

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step3.png)

In [4]:
# `transform` on the bundle is a factory function: calling it builds the
# preprocessing pipeline (tokenizer, vocabulary lookup, truncation and the
# two special-token steps shown below).
transform_fn = roberta_base.transform()
transform_fn

Sequential(
  (0): GPT2BPETokenizer()
  (1): VocabTransform(
    (vocab): Vocab()
  )
  (2): Truncate()
  (3): AddToken()
  (4): AddToken()
)

#### 8.3.2.1 Tokenizer

In [5]:
input_batch = ["I am really liking this course!", "This course is too complicated!"]

# Stage 0 of the pipeline is the BPE tokenizer. It turns each sentence into a
# list of BPE token ids, represented as strings.
tokenizer = transform_fn[0]
tokenized = tokenizer(input_batch)
tokenized

[['40', '716', '1107', '24976', '428', '1781', '0'],
 ['1212', '1781', '318', '1165', '8253', '0']]

In [6]:
# Round trip on a single word: encode it, then decode the result back to text.
tokenizer('I'), tokenizer.decode(tokenizer('I'))

(['40'], 'I')

In [7]:
# Decoding the tokens of the first sentence recovers the original text.
tokenizer.decode(tokenized[0])

'I am really liking this course!'

#### 8.3.2.2 Vocabulary

In [8]:
# Stage 1 of the pipeline is the vocabulary transform; keep a handle on the
# Vocab object it wraps so it can be inspected directly.
to_vocab = transform_fn[1]
vocab_obj = to_vocab.vocab

In [9]:
# Mapping from token string to vocabulary index.
# NOTE(review): displaying it dumps the whole vocabulary; consider showing
# only a handful of entries to keep the output readable.
tokens_to_idx = vocab_obj.get_stoi()
tokens_to_idx

{'madeupword0002': 50263,
 'madeupword0000': 50261,
 '50256': 50260,
 '50009': 50259,
 '49731': 50257,
 '48069': 50255,
 '47654': 50253,
 '47571': 50252,
 '47198': 50251,
 '46600': 50250,
 '45545': 50248,
 '45003': 50245,
 '44320': 50244,
 '43361': 50242,
 '43177': 50241,
 '42728': 50238,
 '42496': 50236,
 '42090': 50233,
 '42089': 50232,
 '41551': 50231,
 '41383': 50230,
 '41297': 50229,
 '40242': 50228,
 '40241': 50227,
 '40240': 50226,
 '39820': 50223,
 '39803': 50221,
 '39655': 50213,
 '39253': 50210,
 '39172': 50208,
 '37444': 50201,
 '36174': 50198,
 '35496': 50195,
 '34027': 50190,
 '33813': 50189,
 '33477': 50188,
 '33434': 50186,
 '32047': 50184,
 '31957': 50183,
 '31783': 50181,
 '31765': 50180,
 '31576': 50178,
 '31032': 50175,
 '30906': 50174,
 '30905': 50173,
 '30212': 50168,
 '30209': 50165,
 '30202': 50163,
 '29372': 50162,
 '29342': 50161,
 '25992': 50159,
 '25502': 50158,
 '24934': 50156,
 '23282': 50152,
 '22757': 50151,
 '22686': 50150,
 '18472': 50149,
 '12781': 501

In [10]:
# Vocabulary index of the BPE token '40', which is how the tokenizer encoded 'I'.
tokens_to_idx['40']

100

In [11]:
# The Vocab object is callable: it maps a list of tokens to their indices.
vocab_obj(tokenized[0])

[100, 524, 269, 25896, 42, 768, 328]

In [12]:
# The vocabulary transform performs the same lookup for the whole batch.
indices = to_vocab(tokenized)
indices

[[100, 524, 269, 25896, 42, 768, 328], [713, 768, 16, 350, 6336, 328]]

#### 8.3.2.3 Max Length

In [13]:
# Stage 2 of the pipeline is the truncation transform; inspect its configured
# maximum sequence length.
truncate = transform_fn[2]
truncate.max_seq_len

254

In [14]:
# Both sequences are much shorter than the maximum length, so they come back
# unchanged.
truncated = truncate(indices)
truncated

[[100, 524, 269, 25896, 42, 768, 328], [713, 768, 16, 350, 6336, 328]]

#### 8.3.2.4 Special Tokens

In [15]:
# Stage 3 adds a special token. Show whether it goes at the beginning, its
# index, and the text that index stands for in the vocabulary.
prepend_token = transform_fn[3]
prepend_token.begin, prepend_token.token, vocab_obj.lookup_token(prepend_token.token)

(True, 0, '<s>')

In [16]:
# Stage 4 adds the other special token; here `begin` is False, so it is
# placed at the end of the sequence.
append_token = transform_fn[4]
append_token.begin, append_token.token, vocab_obj.lookup_token(append_token.token)

(False, 2, '</s>')

In [17]:
# Every sequence now starts with index 0 ('<s>').
prepended = prepend_token(truncated)
prepended

[[0, 100, 524, 269, 25896, 42, 768, 328], [0, 713, 768, 16, 350, 6336, 328]]

In [18]:
# Every sequence now also ends with index 2 ('</s>').
appended = append_token(prepended)
appended

[[0, 100, 524, 269, 25896, 42, 768, 328, 2],
 [0, 713, 768, 16, 350, 6336, 328, 2]]

In [19]:
input_batch = ["I am really liking this course!", "This course is too complicated!"]
# Running the whole pipeline in one call gives the same result as applying
# the five stages one by one.
transformed = transform_fn(input_batch)
transformed

[[0, 100, 524, 269, 25896, 42, 768, 328, 2],
 [0, 713, 768, 16, 350, 6336, 328, 2]]

### 8.3.3 Inference

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step5.png)

In [20]:
from torchtext.functional import to_tensor

# The two sequences have different lengths, so they cannot be stacked into a
# tensor without padding. The failure is the point of this cell, so it is
# caught and displayed instead of aborting a "Restart & Run All".
try:
    to_tensor(transformed)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: expected sequence of length 9 at dim 1 (got 8)

In [21]:
# Vocabulary index of the padding token, to be used as the padding value.
vocab_obj.lookup_indices(['<pad>'])

[1]

In [22]:
# With a padding value, the shorter sequence is filled on the right with the
# pad index so both rows have the same length.
model_input = to_tensor(transformed, padding_value=1)
model_input, model_input.shape

(tensor([[    0,   100,   524,   269, 25896,    42,   768,   328,     2],
         [    0,   713,   768,    16,   350,  6336,   328,     2,     1]]),
 torch.Size([2, 9]))

In [23]:
# Evaluation mode disables dropout; inference mode skips autograd bookkeeping,
# which is not needed when only the encoder output is inspected.
model.eval()
with torch.inference_mode():
    output = model(model_input)
output.shape

torch.Size([2, 9, 768])

### 8.3.4 Attaching a Head

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step1.png)

In [26]:
# Seed before creating the head, presumably so its initial weights are
# reproducible. The head takes the encoder's 768-dimensional output and
# produces two class logits.
torch.manual_seed(11)
classifier_head = torchtext.models.RobertaClassificationHead(num_classes=2, input_dim=768)

In [27]:
# Display the layers that make up the classification head.
classifier_head

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
  (activation_fn): ReLU()
)

In [28]:
# Build the model from the same bundle again, this time with the
# classification head attached.
model_with_head = roberta_base.get_model(head=classifier_head)
model_with_head

RobertaModel(
  (encoder): RobertaEncoder(
    (transformer): TransformerEncoder(
      (token_embedding): Embedding(50265, 768, padding_idx=1)
      (layers): TransformerEncoder(
        (layers): ModuleList(
          (0-11): 12 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (linear1): Linear(in_features=768, out_features=3072, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=3072, out_features=768, bias=True)
            (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (positional_embedding): PositionalEmbedding(
        (embedding): Embedding(51

In [29]:
# With the head attached, the output is one pair of logits per sentence.
# NOTE(review): gradients are tracked here (the displayed tensor carries a
# grad_fn); wrap the call in torch.inference_mode() if only predictions are
# needed.
model_with_head.eval()
output = model_with_head(model_input)
output, output.shape

(tensor([[-0.0355, -0.0045],
         [-0.0422, -0.0039]], grad_fn=<AddmmBackward0>),
 torch.Size([2, 2]))

### 8.3.5 Logits and Loss Functions

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step2.png)

#### 8.3.5.1 One Logit or Two Logits?

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/one_logit.png)

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/two_logits.png)


## 8.4 TensorBoard

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step4.png)

In [None]:
# Load the TensorBoard notebook extension and start it, reading logs from
# the `runs` directory.
%load_ext tensorboard
%tensorboard --logdir runs

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/empty_tensorboard.png)

In [None]:
from torch.utils.tensorboard import SummaryWriter
# Log to a sub-folder of the `runs` directory that TensorBoard is reading.
writer = SummaryWriter('runs/test')

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/tensorboard_losses.png)

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/smooth_slider.png)

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/tensorboard_losses_smooth.png)

## 8.6 HuggingFace Pipelines

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/hf_nlp_tasks.png)

In [1]:
from transformers.pipelines import SUPPORTED_TASKS
# Look up the default checkpoint registered for the text-classification task.
SUPPORTED_TASKS['text-classification']['default']

{'model': {'pt': ('distilbert-base-uncased-finetuned-sst-2-english',
   'af0f99b'),
  'tf': ('distilbert-base-uncased-finetuned-sst-2-english', 'af0f99b')}}

In [2]:
from transformers import pipeline

# Name the checkpoint explicitly (it is the same one listed as the task's
# default) instead of relying on the implicit default.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
classifier = pipeline('text-classification', model=model_name)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step5.png)

In [3]:
input_batch = ["I am really liking this course!", "This course is too complicated!"]

# The pipeline takes raw strings and returns one label/score dictionary per
# sentence.
classifier(input_batch)

[{'label': 'POSITIVE', 'score': 0.9997199177742004},
 {'label': 'NEGATIVE', 'score': 0.9996912479400635}]

### 8.6.1 Transforms / Tokenizer

In [4]:
# The tokenizer bundled with the pipeline, including its special tokens.
classifier.tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [5]:
# Called with default arguments, the tokenizer returns plain Python lists and
# applies no padding, so the two sequences have different lengths.
tokenized_dict = classifier.tokenizer(input_batch)
tokenized_dict

{'input_ids': [[101, 1045, 2572, 2428, 16663, 2023, 2607, 999, 102], [101, 2023, 2607, 2003, 2205, 8552, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [6]:
# Decoding shows the special tokens added around the lower-cased sentence.
classifier.tokenizer.decode(tokenized_dict['input_ids'][0])

'[CLS] i am really liking this course! [SEP]'

In [7]:
from transformers import AutoTokenizer

# Load a tokenizer on its own, from the base checkpoint rather than the
# fine-tuned one used by the pipeline.
hf_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

In [8]:
# Padding is enabled and PyTorch tensors are requested, so both rows have the
# same length; the attention mask is 0 at the padded position.
tokenized_output = hf_tokenizer(input_batch, add_special_tokens=True, padding=True, return_tensors='pt')
tokenized_output

{'input_ids': tensor([[  101,  1045,  2572,  2428, 16663,  2023,  2607,   999,   102],
        [  101,  2023,  2607,  2003,  2205,  8552,   999,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0]])}

### 8.6.2 Model

In [9]:
# The model wrapped by the pipeline.
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [10]:
# The configuration holds, among other settings, the id2label mapping that
# names the classes reported by the pipeline.
classifier.model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.29.2",
  "vocab_size": 30522
}

In [11]:
from transformers import AutoModel
# Load the base checkpoint as a plain DistilBertModel, i.e. without a
# task-specific head.
headless_model = AutoModel.from_pretrained('distilbert-base-uncased')

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Display the architecture of the headless model.
headless_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [16]:
import torch
headless_model.eval()

# The batch was padded, so the attention mask must be passed along with the
# input ids; otherwise the padding position is attended to like a real token
# and changes the hidden states of the shorter sentence.
with torch.inference_mode():
    output = headless_model(**tokenized_output)

output['last_hidden_state'].shape

torch.Size([2, 9, 768])

## 8.7 Generative Models

In [78]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Load GPT-2 with its language-modeling head and the matching tokenizer.
# NOTE(review): this rebinds `model` and `tokenizer`, which earlier cells used
# for the RoBERTa encoder and its BPE tokenizer.
model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [79]:
# Prompt whose next-token predictions are inspected below.
sentence = "Hello, how are you"

In [80]:
# Tokenize the prompt into PyTorch tensors (a batch with a single row).
# NOTE(review): this rebinds `tokenized` from the RoBERTa section.
tokenized = tokenizer(sentence, return_tensors="pt")
tokenized

{'input_ids': tensor([[15496,    11,   703,   389,   345]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [81]:
# Forward pass: the logits hold one score per vocabulary entry for each of
# the five input positions.
outputs = model(**tokenized)
outputs['logits'].shape

torch.Size([1, 5, 50257])

In [82]:
# Turn the logits of the single sequence in the batch into a probability
# distribution over the vocabulary at every position, then keep the most
# likely entry (probability and index) per position.
probabilities = outputs['logits'][0].softmax(dim=-1)
values, indices = probabilities.topk(1)
values, indices

(tensor([[0.0960],
         [0.1005],
         [0.0908],
         [0.6630],
         [0.2651]], grad_fn=<TopkBackward0>),
 tensor([[  11],
         [ 314],
         [ 546],
         [ 345],
         [1804]]))

In [83]:
# Decode the top index of every position into a single string.
predictions = tokenizer.decode(indices[:, 0])
predictions

', I about you doing'

In [84]:
# Decode every id on its own so that each entry lines up with exactly one
# position. Splitting the joined `predictions` string on spaces only works
# when every predicted token happens to start with a space and contains no
# other whitespace.
tokens = [tokenizer.decode(t) for t in tokenized['input_ids'][0]]
# Drop the leading space marking a word boundary in the decoded token.
predicted_tokens = [tokenizer.decode(t).lstrip(' ') for t in indices[:, 0]]

# NOTE(review): the decoded input tokens keep their own leading space, so
# joining them with ' ' yields doubled spaces in the "Tokens so far" text.
for i, p in enumerate(predicted_tokens):
    print(f"{i+1}. Tokens so far: {' '.join(tokens[:i+1])}\n   Predicted token to follow: {p}")

1. Tokens so far: Hello
   Predicted token to follow: ,
2. Tokens so far: Hello ,
   Predicted token to follow: I
3. Tokens so far: Hello ,  how
   Predicted token to follow: about
4. Tokens so far: Hello ,  how  are
   Predicted token to follow: you
5. Tokens so far: Hello ,  how  are  you
   Predicted token to follow: doing
