In [1]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from data.q_and_a.train_and_eval import TrainAndEval
from data.q_and_a.eval_with_answers import EvalWithAnswers
from q_and_a.prompts import prompt
from data.q_and_a.prompted import Prompted
import torch.optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader

# First, load the data

We are going to load the data used for train or modify our classification task.

In [2]:
class Tokenized(Dataset):
    def __init__(self, tokenizer, dataset: Prompted, max_length=2000):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx: int):
        text, answer = self.dataset[idx]

        result = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",)
        labels = torch.tensor(answer, dtype=torch.long)

        return {
            "input_ids": result["input_ids"].squeeze(0),
            "attention_mask": result["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [3]:
MODEL_NAME = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer.pad_token = tokenizer.eos_token

train_dataset = TrainAndEval("../../data/pubmed_QA_train.json")
test_dataset = TrainAndEval("../../data/pubmed_QA_eval.json")
train_with_answers = EvalWithAnswers(train_dataset)
test_with_answers = EvalWithAnswers(test_dataset)
train_prompted= Prompted(train_with_answers, prompt)
test_prompted = Prompted(test_with_answers, prompt)
train_tokenized = Tokenized(tokenizer, train_prompted)
test_tokenized = Tokenized(tokenizer, test_prompted)

In [4]:
len(train_tokenized), len(test_tokenized)

(16890, 5000)

In [5]:
# per now use a subset
from torch.utils.data import Subset

train_tokenized = Subset(train_tokenized, range(0, 2000))
test_tokenized = Subset(test_tokenized, range(0, 200))

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=4,
    load_in_8bit=True,
    pad_token_id=tokenizer.pad_token_id,
)
model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128001)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear8bitLt(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)


In [7]:
for name, param in model.named_parameters():
    if "score" not in name:
        print(name)
        param.requires_grad = False
    else:
        print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self_attn.k_proj.weight
model.layers.2.self_attn.v_proj.weight
model.layers.2.self_attn.o_proj.weight
model.layers.2.mlp.gate_proj.weight
model.layers.2.mlp.up_proj.weight
model.layers.2.mlp.down_proj.weight
model.layers.2.inp

In [8]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,  # Rank of LoRA matrices (lower = less memory)
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Depends on model architecture
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,674,560 || trainable%: 0.0689


In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  arr = np.array(obj)
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss
1,1.2486,0.737786
2,0.3246,0.202101
3,0.1981,0.307587


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


KeyboardInterrupt: 

In [18]:
print("CUDA available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0))
print("Supports FP16:", torch.cuda.get_device_capability(0))

CUDA available: True
GPU Name: NVIDIA A10G
Supports FP16: (8, 6)
