In [1]:
!pip install transformers==4.51.3
!pip install sentence-transformers==4.1.0
!pip install einops
!pip install 'accelerate>=0.26.0'
!pip install bitsandbytes



In [2]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
#from dotenv import load_dotenv
#load_dotenv()

model_name = "meta-llama/Llama-3.2-1B"

# Check HF_TOKEN environment variable
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    print("Hugging Face token found in environment variable.")
else:
    token = input("Enter your Hugging Face token: ")
    login(token=token) # Token de Hugging Face

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

In [4]:
base_tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
)

# Configurar tokenizador
base_tokenizer.pad_token = base_tokenizer.eos_token
base_tokenizer.padding_side = "right"

In [5]:
import torch.nn as nn

def find_all_linear_names(model):
    cls = nn.Linear  # Use standard Linear layer
    linear_module_names = set()
    
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            linear_module_names.add(names[0] if len(names) == 1 else names[-1])
    
    if 'lm_head' in linear_module_names:  # Optionally exclude output head
        linear_module_names.remove('lm_head')
    
    return list(linear_module_names)

modules = find_all_linear_names(base_model)
modules

['gate_proj', 'k_proj', 'up_proj', 'v_proj', 'down_proj', 'o_proj', 'q_proj']

In [6]:
from datasets import DatasetDict,load_dataset
training_data = load_dataset("juan-carvajal/maia-pln-2025-training")

In [7]:
from peft import get_peft_model, LoraConfig, TaskType
import os
os.environ["WANDB_DISABLED"] = "true"

lora_config = LoraConfig(
    r=8,  # Rank of LoRA matrices (lower = less memory)
    lora_alpha=16,
    target_modules=modules,
    lora_dropout=0.3,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    modules_to_save=["score"],
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.05,
    logging_dir="./logs",
    fp16=True,
    save_total_limit=1,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data["train"].select(range(0, 500)),
    eval_dataset=training_data["eval"].select(range(0, 50)),
    tokenizer=base_tokenizer,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 5,636,096 || all params: 1,241,450,496 || trainable%: 0.4540


OutOfMemoryError: CUDA out of memory. Tried to allocate 126.00 MiB. GPU 0 has a total capacity of 5.65 GiB of which 80.88 MiB is free. Process 79484 has 3.75 GiB memory in use. Including non-PyTorch memory, this process has 1.79 GiB memory in use. Of the allocated memory 1.61 GiB is allocated by PyTorch, and 77.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
trainer.save_model("./trainer-combined-logits-full-hd/trainer")
base_tokenizer.save_pretrained("./trainer-combined-logits-full-hd/tokenizer")

('./trainer-combined-logits-full-hd/tokenizer/tokenizer_config.json',
 './trainer-combined-logits-full-hd/tokenizer/special_tokens_map.json',
 './trainer-combined-logits-full-hd/tokenizer/tokenizer.json')

In [9]:
from transformers import pipeline
import numpy as np

def generate_prompt(
    question: str,
    context: str,
    options: list[str]
) -> str:
    prompt = f"""You are an expert in multiple-choice questions. Your task is to select the best answer from the given options based on the provided context.
Context: {context}

Question: {question}

Options:
{options}

Between A, B, C and D the best option is the letter"""
    return prompt

options_str = ["A","B","C","D"]

def format_options(options: list):
    return '\n'.join(f"{options_str[i]}. {s}" for i, s in enumerate(options))




In [10]:
ds=load_dataset("juan-carvajal/maia-pln-2025-pubmed_QA_test_questions_contexts")

README.md:   0%|          | 0.00/654 [00:00<?, ?B/s]

data/train-00000-of-00001-3c3a879788dc56(…):   0%|          | 0.00/3.27M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
import torch
def format_context_as_response(context, k=5):
    formatted_data = []
    for c in context[:k]:
        formatted_data.append(f"Title: {c['title']}\nAbstract: {c['content']}")
    return "\n---------\n".join(formatted_data)
        

possible_answers = [" A", " B", " C", " D"," E"]
    
option_logits = [
        base_tokenizer.encode(option, add_special_tokens=False)[0]
        for option in possible_answers
]

print(option_logits)

base_model = model


def predict(examples):
    all_prompts = []
    options_list = examples["option"]
    for i, (context, options,question) in enumerate(zip(examples["contexts"], options_list,examples["question"])):
        context_text = format_context_as_response(context,1)
        formatted_options = format_options(options)
        prompt = generate_prompt(question, context_text, formatted_options)
        #print(prompt)
        all_prompts.append(prompt)
    #outputs = pipe(all_prompts)
    #print(all_prompts)
    tokens=base_tokenizer(all_prompts, return_tensors="pt",padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs=base_model(**tokens, do_sample=False,max_new_tokens=2,
        top_p=None,
        temperature=0)
    
    # Extract answers from model outputs
    answers = []
    print(outputs.logits.shape)
    last_token_logits = outputs.logits
    for i, sequence_logits in enumerate(last_token_logits):
        attention_mask = tokens['attention_mask'][i]
        last_token_pos = attention_mask.sum().item() - 1
        
        # Get logits for the last position
        last_token_logits = sequence_logits[last_token_pos]
        
        # Apply softmax to get probabilities
        probs = torch.softmax(last_token_logits, dim=-1).tolist()
        # Apply softmax to get probabilities
        #probs = torch.softmax(sequence_logits, dim=-1).tolist()

        # Calculate the probability for each option token
        scores = []
        for option_token_id in option_logits:
            score = probs[option_token_id]
            scores.append(score)

        # Choose the option with the highest probability
        chosen_option = scores.index(max(scores))
        print(f"Sample {i}: Scores = {scores}, Chosen = {chosen_option} ({possible_answers[chosen_option].strip()})")

        answers.append(chosen_option)
    
    # Add answers to the examples
    examples["answer"] = answers
    
    return examples

tmp = ds['train']#.select(range(100))
tmp = tmp.map(predict,batched=True,batch_size=50)

In [12]:
import pandas as pd
tmp.to_csv("results.csv")
df = pd.read_csv('results.csv')
df['ID'] = df['id']

df[['ID','answer']].to_csv('results.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]