In [1]:
import os, sys, math, json, random, gc, csv, time
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer
)
from transformers.trainer_callback import TrainerCallback, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
CUT_OFF_LEN = 1024
LOSS_WEIGHTS = [0.3, 0.3, 0.2, 0.1, 0.1]
EVAL_STEPS = 300
MAX_NEW_TOKENS = 256
TEMPERATURE = 0.2
TOP_P = 0.9
OUTPUT_DIR = "outputs/llama3.2-3b-pls-qlora"
METRICS_CSV = f"{OUTPUT_DIR}/training_metrics.csv"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Device explícito
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
if device == "cuda":
    print("CUDA:", torch.cuda.get_device_name(0))



Device: cuda
CUDA: NVIDIA GeForce RTX 3090


In [3]:
HERE = Path.cwd()
found = None
for p in [HERE] + list(HERE.parents):
    if (p / "metricas").is_dir():
        found = p
        break
if not found:
    raise RuntimeError("No se encontró la carpeta 'metricas' en ningún ancestro desde este notebook.")
if str(found) not in sys.path:
    sys.path.insert(0, str(found))

from metricas.metrics_client import getLoss


In [None]:
# Cargar el dataset

N_TRAIN = 1500  #  None para usar todo 
df = pd.read_csv("../../data/pls_abstract_pairs_with_metrics.csv")

train_df = df[df["split"] == "train"].reset_index(drop=True)
val_df   = df[df["split"] == "test"].reset_index(drop=True)  

if N_TRAIN is not None:
    train_df = train_df.sample(n=min(N_TRAIN, len(train_df)), random_state=42).reset_index(drop=True)

len(train_df), len(val_df)


(500, 218)

In [5]:
SYSTEM_PROMPT = (
    "You simplify clinical trial protocol text into a plain-language summary for the general public. "
    "Keep to 6–8th grade readability, avoid diagnoses and speculation, no hallucinations, "
    "and preserve key facts (objective, population, interventions, outcomes, timelines, safety)."
)
USER_PREFIX = "Using the following clinical trial protocol text as input, create a plain language summary.\n\n"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, token=os.getenv("HF_TOKEN"))
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def build_chat(src: str, tgt: str):
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PREFIX + str(src)},
        {"role": "assistant", "content": str(tgt)},
    ]

def encode_supervised(batch):
    chats = [build_chat(s, t) for s, t in zip(batch["source_text"], batch["target_text"])]
    texts = [tokenizer.apply_chat_template(c, tokenize=False, add_generation_prompt=False) for c in chats]
    out = tokenizer(texts, truncation=True, max_length=CUT_OFF_LEN, padding=False)
    return out

hf_train = Dataset.from_pandas(train_df[["source_text","target_text"]]).map(
    encode_supervised, batched=True, remove_columns=["source_text","target_text"]
)
hf_val = Dataset.from_pandas(val_df[["source_text","target_text"]]).map(
    encode_supervised, batched=True, remove_columns=["source_text","target_text"]
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


Map: 100%|██████████| 500/500 [00:00<00:00, 2534.00 examples/s]
Map: 100%|██████████| 218/218 [00:00<00:00, 2025.74 examples/s]


In [6]:
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, token=os.getenv("HF_TOKEN"))
tokenizer.padding_side = "right"


if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# FP16 en GPU; NO device_map="auto" para que no se vaya a CPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16
).to(device)

# Solo LoRA (sin prepare_model_for_kbit_training)
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)
model = get_peft_model(model, lora_cfg)
model.config.use_cache = False 
model.config.pad_token_id = tokenizer.pad_token_id
model.gradient_checkpointing_enable()
model.enable_input_require_grads() 
model.train()

# Sanity
print("Model device:", next(model.parameters()).device)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable/1e6:.2f}M / {total/1e6:.2f}M ({100*trainable/total:.2f}%)")


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]


Model device: cuda:0
Trainable: 24.31M / 3237.06M (0.75%)


In [7]:
@torch.no_grad()
def eval_composite_loss(model, tokenizer, val_df, loss_weights, sample_size=64):
    if len(val_df) == 0:
        return None
    sub = val_df.sample(n=min(sample_size, len(val_df)), random_state=42).reset_index(drop=True)
    prompts = [
        tokenizer.apply_chat_template(
            [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PREFIX + s}
            ],
            tokenize=False, add_generation_prompt=True
        )
        for s in sub["source_text"].tolist()
    ]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=CUT_OFF_LEN).to(device)
    gen = model.generate(
        **inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=True,
        temperature=TEMPERATURE, top_p=TOP_P
    )
    outs = tokenizer.batch_decode(gen[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    losses = getLoss(
        sub["source_text"].tolist(),
        sub["target_text"].tolist(),
        outs,
        weights=loss_weights
    )
    if isinstance(losses, list):
        return float(sum(losses)/len(losses)), losses
    return float(losses), [float(losses)]


In [None]:
class TrainValLogger(TrainerCallback):
    def __init__(self, csv_path=METRICS_CSV):
        self.csv_path = csv_path
        os.makedirs(os.path.dirname(self.csv_path), exist_ok=True)
        if not os.path.exists(self.csv_path):
            with open(self.csv_path, "w", newline="") as f:
                csv.writer(f).writerow(["step","train_loss","eval_loss","lr","timestamp"])
    def on_log(self, args, state, control, **kwargs):
        logs = kwargs.get("logs", {})
        with open(self.csv_path, "a", newline="") as f:
            csv.writer(f).writerow([
                state.global_step,
                logs.get("loss"),
                logs.get("eval_loss"),
                logs.get("learning_rate"),
                time.time()
            ])


In [None]:
# Ajusta tu TrainingArguments así:
args = TrainingArguments(
    output_dir="outputs/llama3.2-3b-pls-lora-fp16",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,   
    num_train_epochs=3,
    fp16=True,  
    learning_rate=1e-4,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    logging_steps=25,
    logging_first_step=True, 
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    optim="adamw_torch",
    metric_for_best_model="eval_loss",     
    report_to="none",
    disable_tqdm=False   
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    data_collator=data_collator,
)
trainer.add_callback(TrainValLogger(METRICS_CSV))
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=1e-4))


print("Num training steps (approx):", math.ceil(len(hf_train)/ (args.per_device_train_batch_size) / args.gradient_accumulation_steps) * int(args.num_train_epochs))

torch.cuda.empty_cache()
trainer.train()

Num training steps (approx): 16


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss


TrainOutput(global_step=16, training_loss=1.865557312965393, metrics={'train_runtime': 339.6342, 'train_samples_per_second': 1.472, 'train_steps_per_second': 0.047, 'total_flos': 8731495982407680.0, 'train_loss': 1.865557312965393, 'epoch': 1.0})

In [10]:
save_dir = Path(OUTPUT_DIR) / "final"
save_dir.mkdir(parents=True, exist_ok=True)
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("Guardado en:", save_dir)

print("Métricas registradas en:", METRICS_CSV)

Guardado en: outputs\llama3.2-3b-pls-qlora\final
Métricas registradas en: outputs/llama3.2-3b-pls-qlora/training_metrics.csv


In [11]:
SYSTEM_PROMPT = ("You simplify clinical trial protocol text into a plain-language summary for the general public. "
                 "Keep to 6–8th grade readability, avoid diagnoses and speculation, no hallucinations, "
                 "and preserve key facts (objective, population, interventions, outcomes, timelines, safety).")
USER_PREFIX = "Using the following clinical trial protocol text as input, create a plain language summary.\n\n"

def build_chat_infer(src: str):
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PREFIX + src},
    ]

demo_src = val_df.iloc[0]["source_text"]
print(demo_src, '\n\n')
print('\n\n', "TEXTO GENERADO:", '\n\n')
chat = build_chat_infer(demo_src)
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

model.eval()
inputs = tokenizer(prompt, return_tensors="pt").to(device)
gen = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=False,             
    temperature=None,
    top_p=None,
    no_repeat_ngram_size=6,      
    repetition_penalty=1.15,     
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(gen[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))

Background
Urinary schistosomiasis is caused by an intravascular infection with parasitic Schistosoma haematobium worms. The adult worms typically migrate to the venous plexus of the human bladder and excrete eggs which the infected person passes in their urine. Chronic infection can cause substantial morbidity and long‐term complications as the eggs become trapped in human tissues causing inflammation and fibrosis. We summarised evidence of drugs active against the infection. This is new edition of a review first published in 1997. 
Objectives
To evaluate the efficacy and safety of drugs for treating urinary schistosomiasis.
Search methods
We searched the Cochrane Infectious Diseases Group Specialized Register, MEDLINE, CENTRAL, EMBASE and LILACS and reference lists of articles up to 23 May 2014. 
Selection criteria
Randomized controlled trials (RCTs) of antischistosomal drugs and drug combinations compared to placebo, no intervention, or each other. 
Data collection and analysis
Two 

In [12]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

dfm = pd.read_csv(METRICS_CSV)

# Tipado y limpieza
for c in ("step","train_loss","eval_loss","lr"):
    if c in dfm.columns:
        dfm[c] = pd.to_numeric(dfm[c], errors="coerce")
dfm = dfm.rename(columns={"step": "step_num"})
dfm = dfm.dropna(subset=["train_loss","eval_loss","lr"], how="all")

# Ordenar y consolidar posibles duplicados por step
dfm = dfm.sort_values("step_num")
dfm = dfm.groupby("step_num", as_index=False).last()

def _series(df, col):
    s = df[["step_num", col]].dropna()
    return s["step_num"].to_numpy(dtype=float), s[col].to_numpy(dtype=float)

xs_tr, ys_tr = _series(dfm, "train_loss")
xs_ev, ys_ev = _series(dfm, "eval_loss")

plt.figure(figsize=(9,5))
if len(xs_tr): plt.plot(xs_tr, ys_tr, "-o", label="train_loss", lw=2, ms=4)
if len(xs_ev): plt.plot(xs_ev, ys_ev, "-o", label="eval_loss",  lw=2, ms=4)

# Eje Y ajustado sin “aire”
vals = np.concatenate([y for y in [ys_tr, ys_ev] if len(y)])
if len(vals):
    vmin, vmax = float(np.nanmin(vals)), float(np.nanmax(vals))
    pad = max(1e-4, 0.08*(vmax - vmin))
    plt.ylim(vmin - pad, vmax + pad)

plt.xlabel("step"); plt.ylabel("loss"); plt.title("Training vs Validation Loss")
plt.grid(alpha=0.3); plt.legend()
plt.show()





ValueError: need at least one array to concatenate

<Figure size 900x500 with 0 Axes>

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel

# base = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16).to(device)
# tok  = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# tok.pad_token = tok.eos_token

# lora = PeftModel.from_pretrained(base, (Path(OUTPUT_DIR)/"final").as_posix()).to(device)
# lora.eval()
# print("Adapters cargados OK.")
