# Fine-tune a Transformer

Using Huggingface and Pytorch we can very easily take what we've learnt here and fine-tune a transformer.

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification

# this is the model we're going to use
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# set up the model with the number of labels in the dataset
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"


dataset = load_dataset("yelp_review_full")
tokenizer = AutoTokenizer.from_pretrained(model_name)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# let's have a look at the dataset
dataset.set_format(type='pandas')
df = dataset['train'][:]
df.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [10]:
# let's tokenize one item to see what it looks like
one = tokenizer(df['text'][0], max_length=512, padding="max_length", return_tensors="pt", truncation=True)
one

{'input_ids': tensor([[  101,  2852,  1012, 18522,  4107,  2673,  1045,  2298,  2005,  1999,
          1037,  2236, 18742,  1012,  2002,  1005,  1055,  3835,  1998,  3733,
          2000,  2831,  2000,  2302,  2108,  9161,  6026,  1025,  2002,  1005,
          1055,  2467,  2006,  2051,  1999,  3773,  2010,  5022,  1025,  2002,
          1005,  1055,  6989,  2007,  1037,  2327,  1011, 18624,  2902,  1006,
         27935,  1007,  2029,  2026,  3008,  2031,  4541,  2000,  2033,  2003,
          2200,  2590,  1999,  2553,  2242,  6433,  1998,  2017,  2342,  5970,
          1025,  1998,  2017,  2064,  2131,  6523,  7941,  2015,  2000,  2156,
         15744,  2302,  2383,  2000,  2156,  2032,  2034,  1012,  2428,  1010,
          2054,  2062,  2079,  2017,  2342,  1029,  1045,  1005,  1049,  3564,
          2182,  2667,  2000,  2228,  1997,  2151, 10821,  1045,  2031,  2055,
          2032,  1010,  2021,  1045,  1005,  1049,  2428,  5059,  1037,  8744,
          1012,   102,     0,     0,  

The input_ids are the tokens generated from the text and the attention_mask is telling the transformer which tokens to pay attention to. Don't worry too much about this. It's just the context. 

In [14]:
one['input_ids'].shape
one['input_ids'].squeeze(0).shape

torch.Size([512])

In [41]:
# let's put this in a torch dataset 
from torch.utils.data import Dataset, DataLoader, random_split

class TrainingSet(Dataset):
    def __init__(self, text, labels, tokenizer=None):
        self.texts = text
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # here we generate the encodings
        encoding = self.tokenizer(self.texts[idx], max_length=512, padding="max_length", return_tensors="pt", truncation=True)
        # which will give us a tensor shap (1,512) - because of return_tensors="pt" i.e pytorch tensors
        # we actually want a shape (512) so we apply squeeze (which reshapes the tensor at 0), which removes the dimension
        # let's also move the data to the gpu each time we get one
        input_ids = encoding['input_ids'].squeeze(0).to(device)
        attention_mask = encoding['attention_mask'].squeeze(0).to(device)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": self.labels[idx]
        }


In [42]:
train = TrainingSet(df['text'], df['label'], tokenizer)

In [43]:
# now we can just get any item in our training set
train[0]

{'input_ids': tensor([  101,  2852,  1012, 18522,  4107,  2673,  1045,  2298,  2005,  1999,
          1037,  2236, 18742,  1012,  2002,  1005,  1055,  3835,  1998,  3733,
          2000,  2831,  2000,  2302,  2108,  9161,  6026,  1025,  2002,  1005,
          1055,  2467,  2006,  2051,  1999,  3773,  2010,  5022,  1025,  2002,
          1005,  1055,  6989,  2007,  1037,  2327,  1011, 18624,  2902,  1006,
         27935,  1007,  2029,  2026,  3008,  2031,  4541,  2000,  2033,  2003,
          2200,  2590,  1999,  2553,  2242,  6433,  1998,  2017,  2342,  5970,
          1025,  1998,  2017,  2064,  2131,  6523,  7941,  2015,  2000,  2156,
         15744,  2302,  2383,  2000,  2156,  2032,  2034,  1012,  2428,  1010,
          2054,  2062,  2079,  2017,  2342,  1029,  1045,  1005,  1049,  3564,
          2182,  2667,  2000,  2228,  1997,  2151, 10821,  1045,  2031,  2055,
          2032,  1010,  2021,  1045,  1005,  1049,  2428,  5059,  1037,  8744,
          1012,   102,     0,     0,   

In [26]:
# but we want to do batch loading so
# a batch is the number of samples to pass through the model BEFORE its weights get updated. 
# This is important because it constraints memory to just the batch size. It will also allow the gpu to run in parallel, 
# which obviously gpus are good at

# to achieve this we use a dataloader
train_loader = DataLoader(train, batch_size=8)
next(iter(train_loader))['input_ids'].shape (8,512)

torch.Size([8, 512])

In [30]:
# and the model being called on a batch 
batch = next(iter(train_loader))
output = model(batch['input_ids'], attention_mask=batch['attention_mask'])
output.logits.shape # (8, 5) - 8 inference results with 5 probabilities

torch.Size([8, 5])

In [32]:
output.logits.softmax(1) # convert to actual probabilites

tensor([[0.2952, 0.1827, 0.1856, 0.1433, 0.1932],
        [0.3687, 0.1883, 0.1639, 0.1263, 0.1528],
        [0.1975, 0.1857, 0.1205, 0.1459, 0.3505],
        [0.3693, 0.1968, 0.1571, 0.1313, 0.1455],
        [0.3579, 0.1892, 0.1626, 0.1265, 0.1637],
        [0.1958, 0.1911, 0.1153, 0.1475, 0.3505],
        [0.1955, 0.1898, 0.1075, 0.1477, 0.3595],
        [0.3555, 0.1825, 0.1674, 0.1325, 0.1622]], grad_fn=<SoftmaxBackward0>)

In [33]:
# now let's make a loss function 
# since we're doing classification we'll use CrossEntropyLoss
loss_fn = torch.nn.CrossEntropyLoss()

# test it with a batch 
test = output.logits
out = batch['labels']
loss = loss_fn(test, out)
loss # so this is our loss

tensor(1.4253, grad_fn=<NllLossBackward0>)

In [40]:
# let's have a quick look at the model and move it to the gpu as well
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [35]:
# note that it's final layer (classifier) has a dimension 768 -> 5 i.e. 5 possible labels

In [36]:
loss.backward() # can only do this once - one round of backpropagation

In [37]:
# let's now make an optimize and define a training loop 
optim = torch.optim.Adam(model.parameters(), lr=1e2)

In [38]:
# training loop 
epochs = 5 

# init the loss function 
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(epochs):

    # 1. zero the optimizer gradients
    optim.zero_grad()

    # 2. get a batch 
    batch = next(iter(train_loader))

    # 3. get some outputs (and the labels)
    outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'])
    labels = batch['labels']

    # 4. Calculate the loss and do back propagation
    loss = loss_fn(outputs.logits, labels)
    loss.backward() 
    
    # 5. Step the optimizer
    optim.step()

    # 6. Do it all again 
    print(f"Epoch {epoch+1} - Loss: {loss.item()}")


Epoch 1 - Loss: 1.4253073930740356
Epoch 2 - Loss: 22801625088.0
Epoch 3 - Loss: 356660150272.0
Epoch 4 - Loss: 95742312448.0
Epoch 5 - Loss: nan


And there we go. We've fine-tuned a huggingface model