In [None]:
# Install Huggingface libraries: transformers, datasets
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https:

## Data preprocessing

Before training the model, we first do some basic data preprocessing with the tokenizer that matches the pretrained model.

In [None]:
from transformers import RobertaTokenizer
# Load the tokenizer published with the "roberta-base" checkpoint; the same
# checkpoint name is used later when the model itself is loaded.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

First, test the tokenizer on a simple sentence, then print the special tokens and their IDs.

In [None]:
# Maximum input length (in tokens) reported by the tokenizer itself.
# `model_max_length` avoids the per-checkpoint `max_model_input_sizes` lookup
# table, which newer transformers releases may no longer provide
# (NOTE(review): confirm against the installed transformers version).
max_input_length = tokenizer.model_max_length
print("Max input size: ", max_input_length)
print("-"*30)

# Example of processing the sentence: split into sub-word tokens, then map to ids
tokens = tokenizer.tokenize("How are you?")
print("Example of processing the sentence:")
print(tokens)
print(tokenizer.convert_tokens_to_ids(tokens))
print("-"*30)

# Special tokens and the vocabulary ids they map to
print("Special tokens")
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
print(init_token, eos_token, pad_token, unk_token)
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

Max input size:  512
------------------------------
Example of processing the sentence:
['How', 'Ġare', 'Ġyou', '?']
[6179, 32, 47, 116]
------------------------------
Special tokens
<s> </s> <pad> <unk>
0 2 1 3


Then, we can process our dataset "ag_news". Here we use the function "map" to numericalize tokens automatically.

In [None]:
def tokenize_function(examples):
  """Encode the "text" field of a batch with the notebook-level tokenizer.

  The tokenizer is called with truncation enabled and padding="max_length",
  and its output is returned unchanged.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, truncation=True, padding="max_length")
  return encoded

In [None]:
from datasets import load_dataset
# load the ag_news dataset (the output below shows a 120000-row train split and a 7600-row test split)
dataset = load_dataset("ag_news")
# Apply tokenize_function to every split, one batch of examples at a time
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading and preparing dataset ag_news/default to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

We can also drop the raw text column from `tokenized_datasets`, since only the tokenized fields are needed from here on.

In [None]:
# Drop the raw "text" column and rename "label" to "labels", then switch the
# datasets to the "torch" format.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

Since the original dataset is large, training on all of it would take a long time. Instead, we randomly select 10000 samples as our training set and 2000 samples as our test set.

In [None]:
# Shuffle each split with a fixed seed (123) and keep the first 10000 / 2000 rows.
train_dataset = tokenized_datasets["train"].shuffle(seed=123).select(range(10000))
test_dataset = tokenized_datasets["test"].shuffle(seed=123).select(range(2000))

# Disabled alternative: smaller subsets (1000 rows each, seed 42).
#train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
#test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

Print the information of dataset to make sure it is what we want.

In [None]:
# Show the full tokenized dataset and the two sub-sampled splits
print(tokenized_datasets)
print(train_dataset)
print(test_dataset)
# NOTE: the two disabled lines below are stale — no `train_data` variable is
# defined in this notebook, and the "text" column has already been removed.
#print(train_data["text"][0])
#print(train_data["label"][0])

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 7600
    })
})
Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 10000
})
Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 2000
})


Finally, we will set up the data-loader with batch size of 8.

In [None]:
from torch.utils.data import DataLoader
# Only the training loader is asked to shuffle; no shuffle flag is passed for
# the evaluation loader. Both use batches of 8 examples.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(test_dataset, batch_size=8)

## Model building

Now we will load our pretrained RoBERTa model.

Label 0: World

Label 1: Sports

Label 2: Business

Label 3: Sci/Tech

In [None]:
from transformers import RobertaForSequenceClassification

# Load the "roberta-base" checkpoint into a sequence-classification model with
# 4 output labels. The warning printed below reports that the classifier
# weights are newly initialized, i.e. they are not part of the checkpoint.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=4)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Choose the optimizer and set the learning rate.

In [None]:
from torch.optim import AdamW
# AdamW over all parameters of the model, with a learning rate of 5e-5
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler
# Set the epoch number
num_epochs = 3
# Build the learning-rate scheduler: a "linear" schedule with no warm-up steps.
# The total step count is one step per training batch over all epochs, which
# matches train() calling lr_scheduler.step() once per batch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", 
    optimizer=optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_training_steps
)

## Train and evaluate the Model


In [None]:
import torch

# Prefer the GPU when CUDA is available, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the chosen device
model.to(device)
# Cross-entropy loss, moved to the same device
criterion = torch.nn.CrossEntropyLoss().to(device)


Define the following functions: 
* Calculating accuracy
* Training for a single epoch
* Evaluating for a single epoch

In [None]:
import evaluate as f1_evaluate

# Correct-prediction counter (used to build the accuracy)
def calcuate_accuracy(preds, labels):
  """Count the rows of `preds` whose arg-max over the last dimension equals the label.

  Returns the count as a plain Python number (a count, not a ratio).
  """
  predicted_classes = preds.argmax(dim=-1)
  matches = predicted_classes.eq(labels)
  return matches.sum().item()


# train for one epoch
def train(model, iterator, optimizer, criterion):
  """Run one training epoch and return ``(mean_loss, accuracy_percent)``.

  Args:
    model: called as ``model(input_ids, attention_mask)``; its ``.logits`` are used.
    iterator: yields batches holding "labels", "input_ids" and "attention_mask" tensors.
    optimizer: stepped once per batch; gradients are cleared after each step.
    criterion: loss called as ``criterion(logits, labels)``.

  Besides its arguments, the function reads the notebook-level ``device`` and
  ``lr_scheduler``; the scheduler is stepped once per batch, right after the
  optimizer.

  Returns:
    The per-sample mean loss and the accuracy in percent. Both are NaN when
    ``iterator`` yields no batches.
  """
  num_correct = 0
  num_total = 0
  loss_sum = 0.0
  model.train()
  for batch in iterator:

    labels = batch["labels"].to(device)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    y_preds = model(input_ids, attention_mask).logits
    loss = criterion(y_preds, labels)

    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

    batch_size = labels.size(0)
    num_correct += calcuate_accuracy(y_preds, labels)
    num_total += batch_size
    # The criterion returns the mean over the batch (assumes the default
    # reduction="mean"), so weight it by the batch size; dividing by the
    # sample count afterwards then yields a true per-sample mean instead of a
    # value that is too small by a factor of the batch size.
    loss_sum += loss.item() * batch_size

  if num_total == 0:
    # Nothing was processed: the averages are undefined
    undefined = float("nan")
    return undefined, undefined

  epoch_train_loss = loss_sum/num_total
  epoch_train_acc = (num_correct*100)/num_total

  return epoch_train_loss, epoch_train_acc


# Evaluate for one epoch
def evaluating(model, iterator, criterion):
  """Evaluate the model on every batch of ``iterator`` without computing gradients.

  Args:
    model: called as ``model(input_ids, attention_mask)``; its ``.logits`` are used.
    iterator: yields batches holding "labels", "input_ids" and "attention_mask" tensors.
    criterion: loss called as ``criterion(logits, labels)``.

  Besides its arguments, the function reads the notebook-level ``device`` and
  loads the "f1" metric through ``f1_evaluate``.

  Returns:
    ``(mean_loss, accuracy_percent, f1_result)`` where ``f1_result`` is the dict
    returned by the metric (the caller reads its "f1" entry). All three values
    are NaN when ``iterator`` yields no batches.
  """
  num_correct = 0
  num_total = 0
  loss_sum = 0.0
  model.eval()
  preds_all = []
  label_all = []
  f1_metric = f1_evaluate.load("f1")
  with torch.no_grad():
    for batch in iterator:

      labels = batch["labels"].to(device)
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)

      y_preds = model(input_ids, attention_mask).logits
      loss = criterion(y_preds, labels)

      batch_size = labels.size(0)
      num_correct += calcuate_accuracy(y_preds, labels)
      num_total += batch_size
      # The criterion returns the mean over the batch (assumes the default
      # reduction="mean"); weight by the batch size so that dividing by the
      # sample count gives a per-sample mean.
      loss_sum += loss.item() * batch_size

      # Hand the metric plain Python ints rather than a list of 0-dim tensors
      preds_all.extend(torch.argmax(y_preds, dim=-1).tolist())
      label_all.extend(labels.tolist())

  if num_total == 0:
    # Nothing was evaluated: the averages and the metric are undefined
    undefined = float("nan")
    return undefined, undefined, {"f1": undefined}

  epoch_eval_loss = loss_sum/num_total
  epoch_eval_acc = (num_correct*100)/num_total
  # NOTE: with exactly one label per example, micro-averaged F1 coincides with
  # accuracy, which is why the printed F1 mirrors the validation accuracy.
  f1_res = f1_metric.compute(predictions=preds_all,
                references=label_all,
                average="micro")

  return epoch_eval_loss, epoch_eval_acc, f1_res

In [None]:
# Fine-tune `model`: each epoch runs one training pass and one evaluation pass,
# then prints the loss/accuracy of both plus the "f1" entry of the metric result.
for epoch in range(num_epochs):

  epoch_train_loss, epoch_train_acc = train(model, train_dataloader, optimizer, criterion)
  epoch_eval_loss, epoch_eval_acc, f1_res = evaluating(model, eval_dataloader, criterion)
  print("Epoch: ", epoch)
  print(f'\tTrain Loss: {epoch_train_loss:.5f} | Train Acc: {epoch_train_acc:.2f}%')
  print(f'\t Val. Loss: {epoch_eval_loss:.5f} |  Val. Acc: {epoch_eval_acc:.2f}%')
  print(f'\t F1. Score: {f1_res["f1"]:.5f}')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Epoch:  0
	Train Loss: 0.04842 | Train Acc: 87.41%
	 Val. Loss: 0.04258 |  Val. Acc: 89.70%
	 F1. Score: 0.89700
Epoch:  1
	Train Loss: 0.03061 | Train Acc: 92.35%
	 Val. Loss: 0.03402 |  Val. Acc: 91.30%
	 F1. Score: 0.91300
Epoch:  2
	Train Loss: 0.01856 | Train Acc: 95.24%
	 Val. Loss: 0.03182 |  Val. Acc: 91.90%
	 F1. Score: 0.91900


## Optional: Finetune last 2 transformer layers

First, let's print out the names of all the parameters. Since RoBERTa-base has 12 transformer layers (indexed 0–11), `roberta.encoder.layer.10` and `roberta.encoder.layer.11` are the last two. So, we need to freeze the other layers when training.

In [None]:
# Print the name of every parameter of the model
for param_name, _ in model.named_parameters():
  print(param_name)

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

Let's define a new model named "model_2" as we did in previous code.

In [None]:
# Second model for the partial fine-tuning experiment, loaded from the same checkpoint
model_2 = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=4)
# NOTE: `optimizer`, `lr_scheduler`, `device` and `criterion` are rebound here.
# train() reads `lr_scheduler` from the notebook namespace, so from this cell
# on it steps the scheduler created below.
optimizer = AdamW(model_2.parameters(), lr=5e-5)
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", 
    optimizer=optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
criterion = torch.nn.CrossEntropyLoss()
model_2.to(device)
criterion = criterion.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Here we freeze all the layers except the last 2 transformer layers and the classifier layers.

In [None]:
# Name fragments of the parameters that stay trainable: the classifier head
# and encoder layers 10 and 11. Every other parameter is frozen.
trainable_markers = ("classifier", "roberta.encoder.layer.10", "roberta.encoder.layer.11")
for param_name, tensor in model_2.named_parameters():
  if not any(marker in param_name for marker in trainable_markers):
    tensor.requires_grad = False

Print the names of the parameters that will still be trained, to make sure the selection is correct.

In [None]:
# List only the parameters that still require gradients
for param_name, tensor in model_2.named_parameters():
  if not tensor.requires_grad:
    continue
  print(param_name)

roberta.encoder.layer.10.attention.self.query.weight
roberta.encoder.layer.10.attention.self.query.bias
roberta.encoder.layer.10.attention.self.key.weight
roberta.encoder.layer.10.attention.self.key.bias
roberta.encoder.layer.10.attention.self.value.weight
roberta.encoder.layer.10.attention.self.value.bias
roberta.encoder.layer.10.attention.output.dense.weight
roberta.encoder.layer.10.attention.output.dense.bias
roberta.encoder.layer.10.attention.output.LayerNorm.weight
roberta.encoder.layer.10.attention.output.LayerNorm.bias
roberta.encoder.layer.10.intermediate.dense.weight
roberta.encoder.layer.10.intermediate.dense.bias
roberta.encoder.layer.10.output.dense.weight
roberta.encoder.layer.10.output.dense.bias
roberta.encoder.layer.10.output.LayerNorm.weight
roberta.encoder.layer.10.output.LayerNorm.bias
roberta.encoder.layer.11.attention.self.query.weight
roberta.encoder.layer.11.attention.self.query.bias
roberta.encoder.layer.11.attention.self.key.weight
roberta.encoder.layer.11.atte

Train the model_2

In [None]:
# Same train/evaluate/print loop as for `model`, applied to the partially
# frozen `model_2` with the optimizer and scheduler created for it.
for epoch in range(num_epochs):

  epoch_train_loss, epoch_train_acc = train(model_2, train_dataloader, optimizer, criterion)
  epoch_eval_loss, epoch_eval_acc, f1_res = evaluating(model_2, eval_dataloader, criterion)
  print("Epoch: ", epoch)
  print(f'\tTrain Loss: {epoch_train_loss:.5f} | Train Acc: {epoch_train_acc:.2f}%')
  print(f'\t Val. Loss: {epoch_eval_loss:.5f} |  Val. Acc: {epoch_eval_acc:.2f}%')
  print(f'\t F1. Score: {f1_res["f1"]:.5f}')

Epoch:  0
	Train Loss: 0.04275 | Train Acc: 87.78%
	 Val. Loss: 0.03199 |  Val. Acc: 91.50%
	 F1. Score: 0.91500
Epoch:  1
	Train Loss: 0.02618 | Train Acc: 92.78%
	 Val. Loss: 0.02896 |  Val. Acc: 92.70%
	 F1. Score: 0.92700
Epoch:  2
	Train Loss: 0.02087 | Train Acc: 94.26%
	 Val. Loss: 0.02997 |  Val. Acc: 92.70%
	 F1. Score: 0.92700


## Test the Model with sentence

Write a function that tokenizes a single sentence and returns the label predicted by the model.

In [None]:
# Classify a single sentence
def predict_sentiment(model, tokenizer, sentence):
  """Return the class index (a Python int) the model predicts for one sentence.

  Despite its name, the function returns whatever class index the model
  produces; in this notebook that is one of the four topic labels.

  The sentence is encoded by calling ``tokenizer`` directly, the same way
  ``tokenize_function`` encodes the training data. Special tokens, truncation
  and the attention mask therefore come from the tokenizer itself, not from
  hard-coded token ids or a notebook-level length constant.

  Args:
    model: sequence-classification model; must expose ``.device`` and return
      an object with ``.logits`` when called with the encoded inputs.
    tokenizer: callable tokenizer that accepts ``truncation`` and ``return_tensors``.
    sentence: the text to classify.
  """
  model.eval()
  encoded = tokenizer(sentence, truncation=True, return_tensors="pt")
  # Run on whatever device the model lives on instead of a global `device`
  inputs = {key: value.to(model.device) for key, value in encoded.items()}
  # Inference only: no gradients are needed
  with torch.no_grad():
    logits = model(**inputs).logits
  return torch.argmax(logits, dim=-1).item()

Try some sentences to get the prediction

In [None]:
# Classify a sample sentence with `model` (the fully fine-tuned model, not
# `model_2`), then print the label legend followed by the predicted index.
res = predict_sentiment(model, tokenizer, "I like playing badminton.")
print("Label 0: World")
print("Label 1: Sports")
print("Label 2: Business")
print("Label 3: Sci/Tech")
print("- "*30)
print("The prediction label is ", res)

Label 0: World
Label 1: Sports
Label 2: Business
Label 3: Sci/Tech
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
The prediction label is  1


In [None]:
# Second example, again classified with `model`; the printed index refers to the label legend above
res = predict_sentiment(model, tokenizer, "Smartphones can access the Internet.")
print("The prediction label is ", res)

The prediction label is  3
