Fine tune bert model for sentiment analysis

In [1]:
import sys
import os

# Add parent directory to Python path
sys.path.append(os.path.abspath(".."))

In [2]:
import torch
print(torch.cuda.is_available())

True


In [3]:
from utils.load_data import load_data

train_ds, test_ds = load_data()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2) # Fine tune model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
label_map = {
    0: "negative",  
    1: "positive"
}

In [6]:
def tokenize_function(dataset):
    return tokenizer(dataset["text"], padding="max_length", truncation=True, max_length=256)

In [20]:
tokenized_train_ds = train_ds.map(tokenize_function)
print(tokenized_train_ds)

Map: 100%|██████████| 25000/25000 [00:18<00:00, 1347.07 examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})





In [21]:
tokenized_test_ds = train_ds.map(tokenize_function)
print(tokenized_test_ds)

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})


In [13]:
from transformers import TrainingArguments, Trainer
from evaluate import load

In [14]:
training_args = TrainingArguments(
    output_dir="./results",          # Model checkpoints
    eval_strategy="epoch",           # Evaluate at the end of each epoch
    save_strategy="epoch",
    learning_rate=2e-5,              # Start with a small learning rate
    per_device_train_batch_size=8,   # Batch size per GPU
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,               # Regularization
    load_best_model_at_end=True,     # Automatically load the best checkpoint
    logging_dir="./logs",            # Directory for logs
    logging_steps=100,
    fp16=True                        # Enable mixed precision weight
)

In [15]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.57.1', '1.12.0')

In [16]:
metric = load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="micro")

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    processing_class=tokenizer,
    # data_collator=data_collator, # Efficient batching
    compute_metrics=compute_metrics # Custom metric
)

Freeze layer

In [43]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [44]:
# Freeze lower layer to perserve defualt knowledge of bert and unfreeze upper layer to fine tune for specifice task
for name, param in model.named_parameters():
    param.requires_grad = False

# Unfreeze the last 3 encoder layers and the classifier
for name, param in model.named_parameters():
    if 'encoder.layer.9.' in name or 'encoder.layer.10.' in name or 'encoder.layer.11.' in name:
        param.requires_grad = True
    if 'classifier' in name:
        param.requires_grad = True

In [45]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.2825,0.221815,0.92476
2,0.266,0.174361,0.94332
3,0.2066,0.147323,0.95868
4,0.1754,0.113159,0.97024
5,0.1748,0.092694,0.9778
6,0.0985,0.084391,0.98024


TrainOutput(global_step=18750, training_loss=0.21131870274861653, metrics={'train_runtime': 13261.8512, 'train_samples_per_second': 11.311, 'train_steps_per_second': 1.414, 'total_flos': 1.9733329152e+16, 'train_loss': 0.21131870274861653, 'epoch': 6.0})

In [46]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.08439058810472488, 'eval_f1': 0.98024, 'eval_runtime': 70.6767, 'eval_samples_per_second': 353.723, 'eval_steps_per_second': 44.215, 'epoch': 6.0}


In [None]:
save_path = "./output"
model.save_pretrained(save_path)

In [34]:
fine_tune_model = AutoModelForSequenceClassification.from_pretrained("./output")
print(fine_tune_model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Testing
test_text = "This movie is interesting."
encode_test_text = tokenizer(test_text, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
logits = fine_tune_model(**encode_test_text).logits
print(logits)

probabilities = torch.softmax(logits, dim=1)
print(probabilities)

predicted_index = torch.argmax(probabilities, dim=1)
print(predicted_index)

tensor([[ 3.7109, -3.0295]], grad_fn=<AddmmBackward0>)
tensor([[0.9988, 0.0012]], grad_fn=<SoftmaxBackward0>)
tensor([0])
