In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

False

# Fine-tune DistilBERT for Sentiment Analysis

In this notebook, we will fine-tune a pre-trained DistilBERT model to perform sentiment analysis on the IMDB dataset. We will:
- Prepare the dataset for training and evaluation
- Tokenize the dataset
- Fine-tune the DistilBERT model for sentiment analysis
- Evaluate the model on the test set
- Perform inference on custom sentences
- Compare the model with a non-fine-tuned baseline model

## Data Preparation
- Read the input data
- Take a shuffled 1,000-example subset from each of the dataset's predefined training and test splits

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import load_dataset

# Load the IMDb dataset through the `datasets` library; the 'train' and
# 'test' splits are indexed from it further down in this cell.
dataset = load_dataset('imdb')

# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the input
def tokenize(batch):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded examples have the same length.
    """
    return tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )

def _prepare_split(split_name):
    """Build the prototyping subset for one split of the loaded dataset.

    Steps: shuffle with a fixed seed, keep the first 1000 rows (a smaller
    subset for faster prototyping), tokenize using multiple processes, and
    expose the model columns as PyTorch tensors.
    """
    subset = dataset[split_name].shuffle(seed=42).select(range(1000))
    subset = subset.map(tokenize, batched=True, num_proc=4)
    subset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return subset


# Same pipeline for both splits
train_dataset = _prepare_split('train')
test_dataset = _prepare_split('test')

In [None]:
# Display the training subset's summary (feature names and row count).
train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
# num_labels=2 requests a two-class classification head, which matches the
# number of labels in the IMDb dataset (binary classification).
# As the load-time warning shows, the classifier / pre_classifier weights are
# newly initialized rather than loaded from the checkpoint, so the model has
# to be trained before its predictions are meaningful.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader

# Mini-batches of 8 examples; only the training data is reshuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

## Model Training

- The `input_ids` provide the tokenized numerical representation of the input text, which the model uses to generate embeddings for each token. These embeddings are then used for further processing within the model.
- The `attention_mask` tells the model which tokens are actual input data and which tokens are padding. 

In [None]:
from tqdm import tqdm  # Import tqdm for progress bars

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5)

# Training loop
def train(model, train_loader, optimizer, device):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Model called with the batch's 'input_ids' / 'attention_mask'
            plus ``labels``; its output must expose ``.loss``.
        train_loader: Iterable of dict batches containing 'input_ids',
            'attention_mask' and 'label' tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model_input_keys = ('input_ids', 'attention_mask')
    running_loss = 0
    model.train()
    for batch in tqdm(train_loader, desc="Training"):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        features = {
            name: tensor.to(device)
            for name, tensor in batch.items()
            if name in model_input_keys
        }
        targets = batch['label'].to(device)
        # The labels are passed in so the loss can be read off the model output.
        batch_loss = model(**features, labels=targets).loss
        batch_loss.backward()
        optimizer.step()    # Update the weights
        running_loss += batch_loss.item()
    mean_loss = running_loss / len(train_loader)
    return mean_loss

# Evaluation loop
def evaluate(model, test_loader, device):
    """Return the model's accuracy over ``test_loader`` as a percentage.

    Args:
        model: Model called with the batch's 'input_ids' / 'attention_mask';
            its output must expose ``.logits``.
        test_loader: Iterable of dict batches containing 'input_ids',
            'attention_mask' and 'label' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``100 * correct / total`` over all examples seen.
    """
    model_input_keys = ('input_ids', 'attention_mask')
    n_correct, n_seen = 0, 0
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            features = {
                name: tensor.to(device)
                for name, tensor in batch.items()
                if name in model_input_keys
            }
            targets = batch['label'].to(device)
            logits = model(**features).logits
            # Index of the largest logit along dim 1 is the predicted class.
            predicted = torch.max(logits, dim=1).indices
            n_seen += targets.size(0)
            n_correct += (predicted == targets).sum().item()
    return 100 * n_correct / n_seen
    

# Train and evaluate the model
# Each epoch makes one pass over train_loader via train(), then measures
# accuracy on test_loader via evaluate(); that test-subset accuracy is what
# gets printed under the "Validation Accuracy" label.
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Training Loss: {train_loss:.4f}")
    accuracy = evaluate(model, test_loader, device)
    print(f"Validation Accuracy: {accuracy:.2f}%")

# Save the model (currently disabled; uncomment to write the fine-tuned
# model and the tokenizer to ./distilbert_imdb)
# model.save_pretrained('./distilbert_imdb')
# tokenizer.save_pretrained('./distilbert_imdb')


Epoch 1/3


Training: 100%|██████████| 125/125 [07:29<00:00,  3.59s/it]


Training Loss: 0.4019


Evaluating: 100%|██████████| 125/125 [02:03<00:00,  1.01it/s]


Validation Accuracy: 86.30%
Epoch 2/3


Training: 100%|██████████| 125/125 [08:21<00:00,  4.01s/it]


Training Loss: 0.2038


Evaluating: 100%|██████████| 125/125 [02:01<00:00,  1.03it/s]


Validation Accuracy: 84.10%
Epoch 3/3


Training: 100%|██████████| 125/125 [09:18<00:00,  4.47s/it]


Training Loss: 0.1108


Evaluating: 100%|██████████| 125/125 [02:14<00:00,  1.07s/it]

Validation Accuracy: 85.40%





## Evaluation
Compare the performance of the model after fine-tuning with the performance of the same checkpoint before fine-tuning. The fine-tuned model should perform better: in the pre-trained checkpoint the classification head is newly initialized and has not been trained for sentiment classification, whereas fine-tuning trains it on labeled IMDb reviews.

In [None]:
# Evaluation function
def final_evaluate(model, test_dataset, device, batch_size=32):
    """Predict a class for every example in ``test_dataset``.

    The dataset is processed in mini-batches rather than in one forward pass
    over all examples, so memory use is bounded by ``batch_size`` instead of
    growing with the dataset size. The model is moved to ``device`` first, so
    a model that was never explicitly placed there (such as a freshly loaded
    baseline) does not fail with a device mismatch.

    Args:
        model: Model called with 'input_ids' and 'attention_mask' keyword
            arguments; its output must expose ``.logits``.
        test_dataset: Indexable dataset whose items are dicts holding
            'input_ids', 'attention_mask' and 'label' tensors.
        device: Device on which inference runs.
        batch_size: Number of examples per forward pass (default 32).

    Returns:
        Tuple ``(y_true, y_pred)`` of lists with the true and predicted
        labels, in dataset order.
    """
    # Local import keeps the function usable regardless of cell order.
    from torch.utils.data import DataLoader

    model.to(device)
    model.eval()
    y_true = []
    y_pred = []
    # No shuffling: the returned labels stay aligned with the dataset order.
    loader = DataLoader(test_dataset, batch_size=batch_size)
    with torch.no_grad():
        for batch in loader:
            inputs = {key: batch[key].to(device) for key in ['input_ids', 'attention_mask']}
            logits = model(**inputs).logits
            # Using argmax along the last dimension to get the predicted class
            predictions = torch.argmax(logits, dim=-1)
            y_true.extend(batch['label'].cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
    return y_true, y_pred

In [None]:
# Baseline for the comparison: the same checkpoint loaded fresh, with no
# fine-tuning applied. Its classification head is newly initialized, as the
# load-time warning reports.
pretrained_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Evaluate the pre-trained model
# final_evaluate returns the true and predicted labels for test_dataset.
y_true_pretrained, y_pred_pretrained = final_evaluate(pretrained_model, test_dataset, device)
accuracy_pretrained = accuracy_score(y_true_pretrained, y_pred_pretrained)
# The fourth value returned by precision_recall_fscore_support is discarded.
precision_pretrained, recall_pretrained, f1_pretrained, _ = precision_recall_fscore_support(y_true_pretrained, y_pred_pretrained, average='binary')
# Metrics are printed as fractions rounded to two decimals.
print(f"Pre-trained Model Accuracy: {accuracy_pretrained:.2f}")
print(f"Pre-trained Model Precision: {precision_pretrained:.2f}")
print(f"Pre-trained Model Recall: {recall_pretrained:.2f}")
print(f"Pre-trained Model F1 Score: {f1_pretrained:.2f}")

Pre-trained Model Accuracy: 0.44
Pre-trained Model Precision: 0.41
Pre-trained Model Recall: 0.37
Pre-trained Model F1 Score: 0.39


In [None]:
# Evaluate the fine-tuned model
# Same procedure and metrics as for the pre-trained baseline, so the two sets
# of numbers are directly comparable.
y_true_finetuned, y_pred_finetuned = final_evaluate(model, test_dataset, device)
accuracy_finetuned = accuracy_score(y_true_finetuned, y_pred_finetuned)
# The fourth value returned by precision_recall_fscore_support is discarded.
precision_finetuned, recall_finetuned, f1_finetuned, _ = precision_recall_fscore_support(y_true_finetuned, y_pred_finetuned, average='binary')
# Metrics are printed as fractions rounded to two decimals.
print(f"Fine-tuned Model Accuracy: {accuracy_finetuned:.2f}")
print(f"Fine-tuned Model Precision: {precision_finetuned:.2f}")
print(f"Fine-tuned Model Recall: {recall_finetuned:.2f}")
print(f"Fine-tuned Model F1 Score: {f1_finetuned:.2f}")

Fine-tuned Model Accuracy: 0.85
Fine-tuned Model Precision: 0.86
Fine-tuned Model Recall: 0.84
Fine-tuned Model F1 Score: 0.85
