In [2]:
!pip install -q transformers datasets peft accelerate bitsandbytes sentencepiece


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, get_linear_schedule_with_warmup
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

from peft import LoraConfig, get_peft_model, PeftModel
from tqdm.auto import tqdm
import os


In [4]:
# Fetch the SST-2 configuration of the GLUE benchmark (train/validation/test splits).
dataset = load_dataset("glue", "sst2")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2's tokenizer defines no padding token by default, so the end-of-sequence
# token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    """Encode the ``sentence`` entries of ``batch`` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 64 tokens, so every
    encoded row has the same length.
    """
    encode_opts = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 64,
    }
    sentences = batch["sentence"]
    return tokenizer(sentences, **encode_opts)

# Tokenize every split in batches, then rename the target column from
# "label" to "labels".
dataset = dataset.map(tokenize, batched=True).rename_column("label", "labels")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [5]:

from torch.utils.data import Dataset

class TorchSST2(Dataset):
    """Map-style torch dataset over an indexable collection of tokenized rows.

    Each row must expose ``input_ids``, ``attention_mask`` and ``labels``;
    indexing returns those three entries converted to ``torch.long`` tensors.
    """

    def __init__(self, hf_dataset):
        # Keep a reference only; rows are converted lazily on access.
        self.data = hf_dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        fields = ("input_ids", "attention_mask", "labels")
        return {
            name: torch.tensor(row[name], dtype=torch.long)
            for name in fields
        }

# Only the train and validation splits are wrapped for use with DataLoader.
train_dataset = TorchSST2(dataset["train"])
val_dataset   = TorchSST2(dataset["validation"])


In [6]:
# Seed the global RNG so everything stochastic that runs after this cell
# (weight initialisation, dropout) repeats across Restart & Run All.
# Bit-exact repeatability on GPU is not guaranteed by seeding alone.
SEED = 42
torch.manual_seed(SEED)

batch_size = 16
# A dedicated, seeded generator ties the shuffle order to SEED even if other
# code draws from the global RNG before training starts.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
val_loader   = DataLoader(val_dataset, batch_size=batch_size)

# Prefer the GPU when one is visible; the bare expression displays the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

In [7]:
# Number of output classes for the classification head.
num_labels = 2

# Pretrained GPT-2 body with a sequence-classification head on top
# (the loader reports the head's `score.weight` as newly initialized).
base_model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=num_labels
)

# The tokenizer pads with the EOS token, so register the same id as the
# model's padding id.
base_model.config.pad_token_id = tokenizer.pad_token_id
# Last expression of the cell: moves the model and displays its module tree.
base_model.to(device)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [8]:
# LoRA adapters of rank 8 on the attention and MLP projection layers.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["c_attn", "c_fc", "c_proj"],
    # The targeted GPT-2 layers are Conv1D modules, which store their weight
    # as (fan_in, fan_out); state that explicitly instead of relying on PEFT
    # to detect it and emit a warning.
    fan_in_fan_out=True,
    task_type="SEQ_CLS",
)

# Wrap the base model with the adapters and report how many parameters
# remain trainable.
lora_model = get_peft_model(base_model, lora_config)
lora_model.print_trainable_parameters()


trainable params: 1,181,184 || all params: 125,622,528 || trainable%: 0.9403




In [9]:
num_epochs = 2
learning_rate = 2e-4

# Give the optimizer only the parameters that are actually trained; the frozen
# base weights never receive gradients, so listing them is pure overhead.
trainable_params = [p for p in lora_model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)

# Linear warmup over the first 10% of steps, then linear decay to zero.
num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps,
)

lora_model.train()
lora_train_loss_hist = []  # mean training loss per epoch

for epoch in range(num_epochs):
    epoch_loss = 0.0
    pbar = tqdm(train_loader, desc=f"LoRA Training Epoch {epoch+1}/{num_epochs}")

    for batch in pbar:
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["labels"].to(device)

        # Passing labels makes the model compute the classification loss itself.
        outputs = lora_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step

        # .item() synchronises with the device, so read the scalar once per step.
        step_loss = loss.item()
        epoch_loss += step_loss
        pbar.set_postfix({"loss": step_loss})

    avg_loss = epoch_loss / len(train_loader)
    lora_train_loss_hist.append(avg_loss)
    print(f"[LoRA] Epoch {epoch+1} avg loss: {avg_loss:.4f}")


LoRA Training Epoch 1/2:   0%|          | 0/4210 [00:00<?, ?it/s]

[LoRA] Epoch 1 avg loss: 0.3316


LoRA Training Epoch 2/2:   0%|          | 0/4210 [00:00<?, ?it/s]

[LoRA] Epoch 2 avg loss: 0.2182


In [10]:
def evaluate(model, loader, device):
    """Score ``model`` on every batch of ``loader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            tensors whose output exposes ``.logits``.
        loader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        ``(accuracy, weighted_f1)`` over all examples seen.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    preds, labels_all = [], []

    with torch.no_grad():
        # Progress bar so long evaluations give feedback.
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids      = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            ).logits

            pred = torch.argmax(logits, dim=-1)

            preds.extend(pred.cpu().tolist())
            # Labels are only compared on the host, so they are not sent to
            # the device and back.
            labels_all.extend(batch["labels"].tolist())

    acc = accuracy_score(labels_all, preds)
    f1  = f1_score(labels_all, preds, average="weighted")
    return acc, f1


Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

[LoRA] Val accuracy: 0.9163, F1: 0.9163


In [11]:
# Validation metrics of the in-memory fine-tuned model; kept in lora_acc/lora_f1
# for the comparison against the reloaded adapter.
lora_acc, lora_f1 = evaluate(lora_model, val_loader, device)
print(f"[LoRA] ACC = {lora_acc:.4f}, F1 = {lora_f1:.4f}")


Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

[LoRA] ACC = 0.9163, F1 = 0.9163


In [12]:
# Relative output directory for the trained adapter; created if missing.
save_dir = "gpt2_sst2_lora_adapter"
os.makedirs(save_dir, exist_ok=True)

# Called on the PEFT-wrapped model — presumably this writes only the adapter
# weights/config rather than the full GPT-2 checkpoint; confirm in the PEFT docs.
lora_model.save_pretrained(save_dir)
print("Saved to:", save_dir)


Saved to: gpt2_sst2_lora_adapter


In [13]:
# Build a fresh base model, configured the same way as the one used for
# training, to check that the saved adapter can be restored from disk.
base2 = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=num_labels
)
base2.config.pad_token_id = tokenizer.pad_token_id
base2.to(device)

# Attach the adapter stored in save_dir on top of the fresh base model.
# NOTE(review): the fresh base has a newly initialized classification head, yet
# the reloaded metrics match the trained ones, so the saved adapter presumably
# carries the trained head as well — confirm against the PEFT docs.
imported_lora = PeftModel.from_pretrained(base2, save_dir)
imported_lora.to(device)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B):

In [14]:
# Evaluate the reloaded adapter on the same validation loader and print it next
# to the metrics of the model that was trained in this session; matching
# numbers indicate the save/load round trip preserved the model.
imp_acc, imp_f1 = evaluate(imported_lora, val_loader, device)

print("\n--- Comparison ---")
print(f"LoRA Trained     : ACC={lora_acc:.4f}, F1={lora_f1:.4f}")
print(f"LoRA Imported    : ACC={imp_acc:.4f}, F1={imp_f1:.4f}")


Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]


--- Comparison ---
LoRA Trained     : ACC=0.9163, F1=0.9163
LoRA Imported    : ACC=0.9163, F1=0.9163
