In [1]:
!pip install datasets
!pip install peft

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


In [2]:
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification, BertTokenizer, BertModel, BertConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PrefixTuningConfig, TaskType
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType

In [25]:
# Prefix-tuning setup for a sequence-classification task, using 10 virtual
# (learned) prefix tokens.
peft_config = PrefixTuningConfig(
    num_virtual_tokens=10,
    task_type=TaskType.SEQ_CLS,
)


In [26]:
# Load the GLUE SST-2 dataset and the tokenizer that matches the
# checkpoint used for the classifier further down.
dataset = load_dataset(path='glue', name='sst2')
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='google/bert_uncased_L-8_H-512_A-8',
)

In [27]:
# Build the base sequence classifier from the pretrained checkpoint and wrap
# it with the PEFT adapter described by `peft_config`, then report how many
# parameters remain trainable.
model = get_peft_model(
    model=AutoModelForSequenceClassification.from_pretrained(
        'google/bert_uncased_L-8_H-512_A-8'
    ),
    peft_config=peft_config,
)
model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-8_H-512_A-8 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 82,946 || all params: 41,457,156 || trainable%: 0.2000764355374498


In [28]:
# Tokenize and preprocess the dataset
def preprocess(example):
    """Tokenize a single dataset row into model-ready features.

    Args:
        example: mapping with a ``'sentence'`` string and a ``'label'`` value.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` (the first row of
        the tokenizer's batched ``'pt'`` output) and ``'labels'`` (a scalar
        ``torch.long`` tensor built from ``example['label']``).
    """
    # Every row is padded/truncated to exactly 128 tokens.
    encode_options = dict(
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    encoding = tokenizer.encode_plus(example['sentence'], **encode_options)

    # The tokenizer returns a batch dimension of size 1; strip it off.
    features = {
        field: encoding[field][0] for field in ('input_ids', 'attention_mask')
    }
    features['labels'] = torch.tensor(example['label'], dtype=torch.long)
    return features

# Run `preprocess` over every row (one example at a time, not in batches)
# and rebind `dataset` to the result.
dataset = dataset.map(function=preprocess, batched=False)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [29]:
def collate_fn(batch):
    """Merge a list of dataset rows into a single dict of batched tensors.

    Args:
        batch: list of rows, each providing ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` as
        ``(batch, longest_sequence)`` ``torch.long`` tensors (shorter rows are
        right-padded with the tokenizer's pad id / 0 respectively) and
        ``'labels'`` as a 1-D ``torch.long`` tensor.
    """
    def _padded(field, fill_value):
        # Convert each row's sequence to a tensor, then pad to a common length.
        sequences = [torch.tensor(row[field], dtype=torch.long) for row in batch]
        return pad_sequence(sequences, batch_first=True, padding_value=fill_value)

    label_values = [row['labels'] for row in batch]
    return {
        'input_ids': _padded('input_ids', tokenizer.pad_token_id),
        'attention_mask': _padded('attention_mask', 0),
        'labels': torch.tensor(label_values, dtype=torch.long),
    }

# Pull the three GLUE splits out of the dataset dictionary.
# NOTE(review): the GLUE test split is believed to ship without gold labels
# (label == -1) — confirm before computing any metric from `test_loader`.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Batches of 32 for every split; only the training loader is shuffled.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(val_dataset, collate_fn=collate_fn, batch_size=32)
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=32)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Optimisation setup.
# NOTE(review): lr=0.1 is very large for AdamW; the recorded first-epoch
# average loss (~1.05) is above ln(2), i.e. worse than chance for two
# classes — consider lowering it.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=0.1)

epochs = 5
# Total number of optimizer steps across all epochs; not consumed by any
# code visible in this notebook (presumably intended for an LR scheduler).
total_steps = epochs * len(train_loader)



ValueError: ignored

In [30]:
def count_trainable_params(model):
    """Return the number of scalar parameters in `model` that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

def count_all_params(model):
    """Return the total number of scalar parameters in `model`, frozen or not."""
    sizes = [param.numel() for param in model.parameters()]
    return sum(sizes)

# Report the overall and trainable parameter counts, one per line.
print(
    f"Total parameters: {count_all_params(model)}",
    f"Trainable parameters: {count_trainable_params(model)}",
    sep="\n",
)

Total parameters: 41457156
Trainable parameters: 82946


In [None]:
# Train for `epochs` passes over `train_loader`, measuring accuracy on the
# validation split after every pass.
for epoch in range(epochs):
    # Switch back to training mode at the start of *every* epoch. The
    # validation pass below calls model.eval(); without this reset, all epochs
    # after the first would train with dropout disabled.
    model.train()
    total_loss = 0
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{epochs}", position=0, leave=True, ncols=80) as progress_bar:
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are not passed to the model; the loss is computed
            # explicitly from the logits with `loss_fn`.
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix_str(f"Loss: {loss.item()}")
            progress_bar.update()

    # ---- Validation pass (no gradients, inference mode) ----
    model.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Predicted class = index of the largest logit per example.
            # `labels` is already 1-D, so no squeeze is needed.
            predicted = logits.argmax(dim=1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    # This number comes from `val_loader`, so label it as validation accuracy.
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")
    avg_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch + 1}/{epochs} - Average Loss: {avg_loss}")

Epoch 1/5: 100%|██| 2105/2105 [05:14<00:00,  6.69it/s, Loss: 1.1346075534820557]


Test Accuracy: 49.08%

Epoch 1/5 - Average Loss: 1.048710484643447


Epoch 2/5: 100%|██| 2105/2105 [05:04<00:00,  6.92it/s, Loss: 0.4252570569515228]


Test Accuracy: 82.45%

Epoch 2/5 - Average Loss: 0.6349461120745773


Epoch 3/5: 100%|██| 2105/2105 [05:06<00:00,  6.87it/s, Loss: 0.6822689175605774]


Test Accuracy: 82.22%

Epoch 3/5 - Average Loss: 0.41679327246553827


Epoch 4/5:  41%|█▏ | 855/2105 [02:04<03:00,  6.91it/s, Loss: 0.5106778740882874]

In [23]:
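# Evaluate the model on the test set.
# NOTE: the GLUE SST-2 test split is distributed without gold labels
# (label == -1), so accuracy is only meaningful on the validation split.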



Test Accuracy: 49.08%
