In [1]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from data.q_and_a.train_and_eval import TrainAndEval
from data.q_and_a.eval_with_answers import EvalWithAnswers
from q_and_a.prompts import prompt
from data.q_and_a.prompted import Prompted
import torch.optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader

# First, load the data

We are going to load the data used for train or modify our classification task.

In [2]:
class Tokenized(Dataset):
    def __init__(self, tokenizer, dataset: Prompted, max_length=2000):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx: int):
        text, answer = self.dataset[idx]

        result = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",)
        labels = torch.tensor(answer, dtype=torch.long)

        return {
            "input_ids": result["input_ids"].squeeze(0),
            "attention_mask": result["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [3]:
MODEL_NAME = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer.pad_token = tokenizer.eos_token

train_dataset = TrainAndEval("../../data/pubmed_QA_train.json")
test_dataset = TrainAndEval("../../data/pubmed_QA_eval.json")
train_with_answers = EvalWithAnswers(train_dataset)
test_with_answers = EvalWithAnswers(test_dataset)
train_prompted= Prompted(train_with_answers, prompt)
test_prompted = Prompted(test_with_answers, prompt)
train_tokenized = Tokenized(tokenizer, train_prompted)
test_tokenized = Tokenized(tokenizer, test_prompted)

In [4]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_tokenized, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_tokenized, batch_size=8, shuffle=False)

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=4,
    torch_dtype =torch.float16,
    pad_token_id=tokenizer.pad_token_id,
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Lets fine tune

In [6]:
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

In [7]:
def train(model, dataloader, optimizer, device):
    model.train()
    loss_fn = CrossEntropyLoss()
    total_loss = 0
    correct = 0
    total = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    accuracy = correct / total
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy


In [8]:
def evaluate(model, dataloader, device):
    model.eval()
    loss_fn = CrossEntropyLoss()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy


In [9]:
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

train_loader = DataLoader(train_tokenized, batch_size=8, shuffle=True)
eval_loader = DataLoader(test_tokenized, batch_size=8)

optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(1, 4):
    print(f"\nEpoch {epoch}")
    train_loss, train_acc = train(model, train_loader, optimizer, device)
    eval_loss, eval_acc = evaluate(model, eval_loader, device)

    print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"Eval  Loss: {eval_loss:.4f}, Accuracy: {eval_acc:.4f}")



Epoch 1


Training:   0%|          | 0/2112 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacity of 21.98 GiB of which 244.44 MiB is free. Including non-PyTorch memory, this process has 21.73 GiB memory in use. Of the allocated memory 21.27 GiB is allocated by PyTorch, and 168.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)