In [1]:
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import BertTokenizer, BertModel, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from tqdm.notebook import tqdm

In [2]:
# Define a new model which adds an additional layer on top of BertForSequenceClassification
class BertForSequenceClassificationCustom(nn.Module):
    def __init__(self, num_labels=2):
        super(BertForSequenceClassificationCustom, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        # Adding an additional custom layer after BERT output
        self.classifier = nn.Sequential(
            nn.Linear(768, 512),  # 768 is the size of BERT's hidden representation, adjust if using a different model
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # We are interested in BERT's pooled output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Function to calculate accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Evaluation function
def evaluate_model(model, dataloader, device):
    model.eval()
    total_eval_accuracy = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    return total_eval_accuracy / len(dataloader)

# Function for prediction on example sentences
def predict_on_example(model, tokenizer, sentence1, sentence2, device):
    model.eval()  # Put the model in evaluation mode
    inputs = tokenizer(sentence1, sentence2, return_tensors="pt", max_length=128, truncation=True, padding='max_length')
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the correct device
    with torch.no_grad():
        logits = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    probabilities = torch.softmax(logits, dim=1)
    prediction = torch.argmax(probabilities, dim=1)
    return probabilities, prediction.item()

# Check if a GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and initialize custom model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassificationCustom(num_labels=2)
model.to(device)  # Move model to the device

# Load and preprocess the MRPC dataset
dataset = load_dataset('glue', 'mrpc')
texts = [(tokenizer(example['sentence1'], example['sentence2'], truncation=True, padding='max_length', max_length=128), example['label']) for example in dataset['train']]
input_ids = torch.tensor([t[0]['input_ids'] for t in texts])
attention_masks = torch.tensor([t[0]['attention_mask'] for t in texts])
labels = torch.tensor([t[1] for t in texts])

# Split the dataset into training and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=42, test_size=0.1)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=32)

# Evaluate the model before fine-tuning
pre_fine_tune_accuracy = evaluate_model(model, validation_dataloader, device)
print(f'Accuracy before fine-tuning: {pre_fine_tune_accuracy:.4f}')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  train_inputs = torch.tensor(train_inputs)
  validation_inputs = torch.tensor(validation_inputs)
  train_labels = torch.tensor(train_labels)
  validation_labels = torch.tensor(validation_labels)
  train_masks = torch.tensor(train_masks)
  validation_masks = torch.tensor(validation_masks)


Accuracy before fine-tuning: 0.6467


In [3]:
# Setup the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 5  # Increase number of epochs for more fine-tuning steps

# Total number of training steps is the number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value
    num_training_steps=total_steps
)

# Example sentences for prediction before fine-tuning
sentence1 = "The company reported better than expected results."
sentence2 = "The firm's results exceeded forecasts."
probabilities, prediction = predict_on_example(model, tokenizer, sentence1, sentence2, device)
print(f'Prediction before fine-tuning: {prediction}, Probabilities: {probabilities}')

# Fine-tune the model
model.train()
for epoch in tqdm(range(epochs)):
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        loss = nn.CrossEntropyLoss()(logits, b_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

# Example sentences for prediction after fine-tuning
probabilities, prediction = predict_on_example(model, tokenizer, sentence1, sentence2, device)
print(f'Prediction after fine-tuning: {prediction}, Probabilities: {probabilities}')

# Evaluate the model after fine-tuning
post_fine_tune_accuracy = evaluate_model(model, validation_dataloader, device)
print(f'Accuracy after fine-tuning: {post_fine_tune_accuracy:.4f}')

Prediction before fine-tuning: 1, Probabilities: tensor([[0.4777, 0.5223]], device='cuda:0')




  0%|          | 0/5 [00:00<?, ?it/s]

Prediction after fine-tuning: 0, Probabilities: tensor([[0.9665, 0.0335]], device='cuda:0')
Accuracy after fine-tuning: 0.8193


## Fine tuning by unfreezeing 3 layers

In [8]:
def unfreeze_and_train(model, train_dataloader, validation_dataloader, device, epochs=5):
    # First, freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last three layers
    for layer in [model.bert.encoder.layer[-1], model.bert.encoder.layer[-2], model.bert.encoder.layer[-3]]:
        for param in layer.parameters():
            param.requires_grad = True

    # Setup the optimizer for the unfrozen parameters
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5, eps=1e-8)

    # Recalculate the number of steps and prepare the scheduler again since we changed the training parameters
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Fine-tune the model
    model.train()
    for epoch in tqdm(range(epochs), desc="Epochs"):
        total_loss = 0
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            model.zero_grad()

            logits = model(b_input_ids, attention_mask=b_input_mask)
            loss = nn.CrossEntropyLoss()(logits, b_labels)
            
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.4f}")

    print("Finished fine-tuning.")

    # Evaluate the model after fine-tuning
    post_fine_tune_accuracy = evaluate_model(model, validation_dataloader, device)
    print(f'Accuracy after fine-tuning: {post_fine_tune_accuracy:.4f}')



In [9]:
# Assuming you have defined 'model', 'train_dataloader', 'validation_dataloader', and 'device' as per your notebook setup
epochs = 5  # You can adjust the number of epochs based on your requirement

# Call the function to unfreeze the last three layers and fine-tune the model
unfreeze_and_train(model, train_dataloader, validation_dataloader, device, epochs=epochs)


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Average training loss: 0.0926
Average training loss: 0.0790
Average training loss: 0.0694
Average training loss: 0.0662
Average training loss: 0.0608
Finished fine-tuning.
Accuracy after fine-tuning: 0.8141
