In [1]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from data.q_and_a.train_and_eval import TrainAndEval
from data.q_and_a.eval_with_answers import EvalWithAnswers
from q_and_a.prompts import prompt
from data.q_and_a.prompted import Prompted
import torch.optim

In [2]:
from huggingface_hub import login
login("{}")


# First, load the data

We are going to load the data used for train or modify our classification task.

In [3]:
class Tokenized(Dataset):
    def __init__(self, tokenizer, dataset: Prompted, max_length=2000):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx: int):
        text, answer = self.dataset[idx]

        result = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",)
        labels = torch.tensor(answer, dtype=torch.long)

        return {
            "input_ids": result["input_ids"].squeeze(0),
            "attention_mask": result["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [4]:
MODEL_NAME = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer.pad_token = tokenizer.eos_token

train_dataset = TrainAndEval("../../data/pubmed_QA_train.json")
test_dataset = TrainAndEval("../../data/pubmed_QA_eval.json")
train_with_answers = EvalWithAnswers(train_dataset)
test_with_answers = EvalWithAnswers(test_dataset)
train_prompted= Prompted(train_with_answers, prompt)
test_prompted = Prompted(test_with_answers, prompt)
train_tokenized = Tokenized(tokenizer, train_prompted)
test_tokenized = Tokenized(tokenizer, test_prompted)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [5]:
len(train_tokenized), len(test_tokenized)

(16890, 5000)

In [6]:
# per now use a subset
from torch.utils.data import Subset

train_tokenized = Subset(train_tokenized, range(0, 2000))
test_tokenized = Subset(test_tokenized, range(0, 200))

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=4,
    load_in_8bit=True,
    pad_token_id=tokenizer.pad_token_id,
)
model

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128001)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear8bitLt(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)


In [8]:
for name, param in model.named_parameters():
    if "score" not in name:
        print(f"grad non required on:{name}")
        param.requires_grad = False
    else:
        print(f"requires grad: {name}")

grad non required on:model.embed_tokens.weight
grad non required on:model.layers.0.self_attn.q_proj.weight
grad non required on:model.layers.0.self_attn.k_proj.weight
grad non required on:model.layers.0.self_attn.v_proj.weight
grad non required on:model.layers.0.self_attn.o_proj.weight
grad non required on:model.layers.0.mlp.gate_proj.weight
grad non required on:model.layers.0.mlp.up_proj.weight
grad non required on:model.layers.0.mlp.down_proj.weight
grad non required on:model.layers.0.input_layernorm.weight
grad non required on:model.layers.0.post_attention_layernorm.weight
grad non required on:model.layers.1.self_attn.q_proj.weight
grad non required on:model.layers.1.self_attn.k_proj.weight
grad non required on:model.layers.1.self_attn.v_proj.weight
grad non required on:model.layers.1.self_attn.o_proj.weight
grad non required on:model.layers.1.mlp.gate_proj.weight
grad non required on:model.layers.1.mlp.up_proj.weight
grad non required on:model.layers.1.mlp.down_proj.weight
grad non

In [9]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,  # Rank of LoRA matrices (lower = less memory)
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Depends on model architecture
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.TOKEN_CLS
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,674,560 || trainable%: 0.0689


In [10]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,1.3692,1.189048
2,0.6726,0.304291
3,0.1976,0.24675
4,0.1628,0.233334
5,0.1636,0.175298
6,0.1493,0.100509
7,0.1544,0.098871
8,0.1051,0.193692
9,0.1184,0.189326
10,0.1191,0.099614


TrainOutput(global_step=3750, training_loss=0.24662663777669272, metrics={'train_runtime': 1954.5116, 'train_samples_per_second': 15.349, 'train_steps_per_second': 1.919, 'total_flos': 3.5064225792e+17, 'train_loss': 0.24662663777669272, 'epoch': 15.0})

In [11]:
print("CUDA available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0))
print("Supports FP16:", torch.cuda.get_device_capability(0))

CUDA available: True
GPU Name: NVIDIA H100 80GB HBM3
Supports FP16: (9, 0)


In [12]:
trainer.save_model("./last-checkpoint/trainer")
tokenizer.save_pretrained("./last-checkpoint/tokenizer")

('./last-checkpoint/tokenizer/tokenizer_config.json',
 './last-checkpoint/tokenizer/special_tokens_map.json',
 './last-checkpoint/tokenizer/tokenizer.json')

In [13]:
model.save_pretrained("./last-checkpoint/model")

In [14]:
import torch
from pathlib import Path

from transformers import AutoModelForSequenceClassification,AutoTokenizer

from data.q_and_a.train_and_eval import TrainAndEval
from data.q_and_a.eval_with_answers import EvalWithAnswers

from models_.building.llama_tokenizer import  load_tokenizer

from data.pubmed.from_json import FromJsonDataset
from data.pubmed.contents import ContentsDataset

from storage.faiss_ import FaissStorage

from rag.tokenization.llama import build_tokenizer_function
from rag.quering import build_querier
import os
from q_and_a.forward import build_enhanced_forwarder
from q_and_a.prompts import prompt
from q_and_a.picking.from_logits import build_from_logits
from q_and_a.eval import evaluate
from q_and_a.forward import build_forwarder

train = TrainAndEval("../../data/pubmed_QA_train.json")
evaluationData = TrainAndEval("../../data/pubmed_QA_eval.json")
evaluateWithAnswers = EvalWithAnswers(evaluationData)

augmented_data = FromJsonDataset(json_file="../../data/pubmed_500K.json")
augmented_data = ContentsDataset(augmented_data)

from huggingface_hub import notebook_login
notebook_login()

storage = FaissStorage(
    dimension=800,
)

storage.load("../../outputs/store/pubmed_500K.index")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
tokenizer_rag = load_tokenizer()
tokenizer_fn = build_tokenizer_function(tokenizer_rag)

querier = build_querier(storage, augmented_data, tokenizer_fn)
storage = FaissStorage(
    dimension=800,
)

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [17]:
model.eval()
forward = build_forwarder(
    model,
    tokenizer,
    querier,
    k_augmentations=1,
    prompt_builder=prompt,
    device=device,
)

forward_and_get_arg_max = lambda question, options: forward(
    question,
    options=options,
)

def pick_from_classifier(out):
    return torch.argmax(out.logits[0])

accuracy = evaluate(
    forward_fn=forward_and_get_arg_max,
    picker_fn=pick_from_classifier,
    eval_dataset=evaluateWithAnswers,
)

print(f"Accuracy: {accuracy:.2f}")

Right answer: 1, picked: 1
Accuracy at 0: 1.00
Right answer: 0, picked: 0
Right answer: 0, picked: 0
Right answer: 0, picked: 3
Right answer: 2, picked: 2
Right answer: 3, picked: 3
Right answer: 0, picked: 0
Right answer: 1, picked: 1
Right answer: 1, picked: 1
Right answer: 2, picked: 2
Right answer: 1, picked: 1
Accuracy at 100: 0.91
Right answer: 3, picked: 3
Right answer: 2, picked: 2
Right answer: 0, picked: 3
Right answer: 2, picked: 2
Right answer: 3, picked: 3
Right answer: 1, picked: 1
Right answer: 1, picked: 1
Right answer: 0, picked: 3
Right answer: 2, picked: 2
Right answer: 1, picked: 1
Accuracy at 200: 0.94
Right answer: 1, picked: 1
Right answer: 1, picked: 1
Right answer: 0, picked: 3
Right answer: 1, picked: 1
Right answer: 3, picked: 3
Right answer: 1, picked: 1
Right answer: 0, picked: 0
Right answer: 0, picked: 0
Right answer: 0, picked: 3
Right answer: 2, picked: 2
Accuracy at 300: 0.93
Right answer: 1, picked: 1
Right answer: 2, picked: 2
Right answer: 2, picked

KeyboardInterrupt: 

In [19]:
from q_and_a.predict import predict
from data.q_and_a.test_questions import TestQuestions

In [20]:
test_data = TestQuestions("../../data/pubmed_QA_test_questions.json")

In [21]:
responses = predict(
    forward_fn=forward_and_get_arg_max,
    picker_fn=pick_from_classifier,
    eval_dataset=test_data,
)

responses

Processed 0.1%
Processed 0.2%
Processed 0.3%
Processed 0.4%
Processed 0.5%
Processed 0.6%
Processed 0.7%
Processed 0.8%
Processed 0.9%


[(0, tensor(1, device='cuda:0')),
 (1, tensor(3, device='cuda:0')),
 (2, tensor(2, device='cuda:0')),
 (3, tensor(1, device='cuda:0')),
 (4, tensor(2, device='cuda:0')),
 (5, tensor(1, device='cuda:0')),
 (6, tensor(2, device='cuda:0')),
 (7, tensor(2, device='cuda:0')),
 (8, tensor(1, device='cuda:0')),
 (9, tensor(0, device='cuda:0')),
 (10, tensor(3, device='cuda:0')),
 (11, tensor(3, device='cuda:0')),
 (12, tensor(2, device='cuda:0')),
 (13, tensor(3, device='cuda:0')),
 (14, tensor(3, device='cuda:0')),
 (15, tensor(1, device='cuda:0')),
 (16, tensor(3, device='cuda:0')),
 (17, tensor(2, device='cuda:0')),
 (18, tensor(0, device='cuda:0')),
 (19, tensor(2, device='cuda:0')),
 (20, tensor(1, device='cuda:0')),
 (21, tensor(3, device='cuda:0')),
 (22, tensor(2, device='cuda:0')),
 (23, tensor(1, device='cuda:0')),
 (24, tensor(1, device='cuda:0')),
 (25, tensor(1, device='cuda:0')),
 (26, tensor(3, device='cuda:0')),
 (27, tensor(0, device='cuda:0')),
 (28, tensor(2, device='cuda:0

In [22]:
import pandas as pd

In [27]:
responses_with_ids = []

for i in range(len(responses)):
    responses_with_ids.append((test_data[i]["id"], responses[i][1].item()))

In [28]:
dataset = pd.DataFrame(responses_with_ids, columns=["ID", "answer"])

In [29]:
dataset.head()

Unnamed: 0,ID,answer
0,26,1
1,29,3
2,37,2
3,70,1
4,109,2


In [30]:
dataset.to_csv("predictions.csv", index=False)