In [None]:
import os
import numpy as np
import torch
from dataclasses import dataclass
from typing import Dict, List, Tuple

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    PreTrainedTokenizerBase,
    set_seed,
)
from peft import LoraConfig, get_peft_model
import evaluate

config = {
    "model": "Qwen/Qwen3-0.6B",   # or "meta-llama/Meta-Llama-3-8B"
    "output_dir": "./qwen3-0.6b-lora",
    "max_input_tokens": 384,
    "max_target_tokens": 128,
    "batch_size": 2,
    "grad_accum": 8,
    "learning_rate": 2e-5,
    "epochs": 4,
    "warmup_ratio": 0.03,
    "eval_steps": 400,
    "logging_steps": 400,
    "seed": 42,
    "load_4bit": True,
    "text_col": "text",
    "summary_col": "summary",
    "lora_r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "optim": "adamw_bnb_8bit"
}

# Prompt template
INSTR_PREFIX = (
    "You are a helpful assistant that writes concise, faithful summaries of legislative text.\n\n"
    "Task: Read the following bill text and write a clear, accurate summary.\n\n"
    "Bill Text:\n"
)
RESPONSE_PREFIX = "\n\nSummary:"


# ---------------------------
# Build samples & masking
# ---------------------------
def format_example(ex: Dict[str, str], text_col: str, sum_col: str) -> Tuple[str, str]:
    src = (ex.get(text_col) or "").strip()
    tgt = (ex.get(sum_col) or "").strip()
    prompt = f"{INSTR_PREFIX}{src}{RESPONSE_PREFIX} "
    return prompt, tgt

def tokenize_and_mask(example: Dict[str, str], tokenizer, cfg) -> Dict[str, List[int]]:
    prompt, target = format_example(example, cfg["text_col"], cfg["summary_col"])
    prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"][:cfg["max_input_tokens"]]
    target_ids = tokenizer(target, add_special_tokens=False)["input_ids"][:cfg["max_target_tokens"]]

    input_ids = prompt_ids + target_ids + [tokenizer.eos_token_id]
    input_ids = input_ids[:cfg["max_input_tokens"] + cfg["max_target_tokens"]]

    labels = [-100] * len(prompt_ids) + target_ids + [tokenizer.eos_token_id]
    labels = labels[:cfg["max_input_tokens"] + cfg["max_target_tokens"]]

    attention_mask = [1] * len(input_ids)
    return {"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask}


# ---------------------------
# Collator (pads labels with -100)
# ---------------------------
@dataclass
class DataCollatorForCausalLMWithMaskedLabels:
    tokenizer: PreTrainedTokenizerBase
    label_pad_token_id: int = -100
    pad_to_multiple_of: int = 8

    def __call__(self, features):
        max_len = max(len(f["input_ids"]) for f in features)
        if self.pad_to_multiple_of and max_len % self.pad_to_multiple_of:
            max_len = ((max_len // self.pad_to_multiple_of) + 1) * self.pad_to_multiple_of

        input_ids, attn, labels = [], [], []
        for f in features:
            pad = max_len - len(f["input_ids"])
            input_ids.append(f["input_ids"] + [self.tokenizer.pad_token_id] * pad)
            attn.append(f["attention_mask"] + [0] * pad)
            labels.append(f["labels"] + [self.label_pad_token_id] * pad)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attn, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }


# ---------------------------
# Metrics (ROUGE + BLEU)
# ---------------------------
rouge_metric = evaluate.load("rouge")
bleu_metric  = evaluate.load("bleu")

def _postprocess_text(preds: List[str], labels: List[str]) -> Tuple[List[str], List[str]]:
    preds  = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    preds  = ["\n".join(p.splitlines()) for p in preds]
    labels = ["\n".join(l.splitlines()) for l in labels]
    return preds, labels

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds  = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = _postprocess_text(decoded_preds, decoded_labels)

    rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    bleu = bleu_metric.compute(predictions=decoded_preds, references=[[r] for r in decoded_labels])

    return {
        "rouge1": round(rouge["rouge1"] * 100, 4),
        "rouge2": round(rouge["rouge2"] * 100, 4),
        "rougeL": round(rouge["rougeL"] * 100, 4),
        "rougeLsum": round(rouge["rougeLsum"] * 100, 4),
        "bleu": round(bleu["bleu"] * 100, 4),
    }

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
set_seed(config["seed"])
tokenizer = AutoTokenizer.from_pretrained(config["model"], use_fast=True, trust_remote_code=True, truncate=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [3]:
raw = load_dataset("billsum")
# val_size = 2000 if len(raw["train"]) > 40000 else max(1000, int(0.05 * len(raw["train"])))
val_size = 2000
split = raw["train"].train_test_split(test_size=val_size, seed=config["seed"])
train_ds_raw, val_ds_raw = split["train"], split["test"]

max_len = config["max_input_tokens"] + config["max_target_tokens"]
train_ds = train_ds_raw.map(lambda ex: tokenize_and_mask(ex, tokenizer, config),
                            remove_columns=train_ds_raw.column_names, desc="Tokenize train")
val_ds   = val_ds_raw.map(lambda ex: tokenize_and_mask(ex, tokenizer, config),
                          remove_columns=val_ds_raw.column_names, desc="Tokenize eval")

data_collator = DataCollatorForCausalLMWithMaskedLabels(tokenizer=tokenizer, pad_to_multiple_of=8)

### train

In [5]:
bnb_kwargs = {}
dtype = torch.bfloat16 if torch.cuda.is_available() else None
if config["load_4bit"]:
    bnb_kwargs = dict(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        device_map="auto",
    )

base_model = AutoModelForCausalLM.from_pretrained(
    config["model"],
    torch_dtype=dtype if not config["load_4bit"] else None,
    trust_remote_code=True,
    **bnb_kwargs,
)

lora_cfg = LoraConfig(
    r=config["lora_r"],
    lora_alpha=config["lora_alpha"],
    lora_dropout=config["lora_dropout"],
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","W_pack"],
)
model = get_peft_model(base_model, lora_cfg)

model.generation_config.update(
    max_new_tokens=config["max_target_tokens"],
    num_beams=1,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{}

In [None]:
# ---------------------------
# Training args
# ---------------------------
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    gradient_accumulation_steps=config["grad_accum"],
    learning_rate=config["learning_rate"],
    lr_scheduler_type="cosine",
    warmup_ratio=config["warmup_ratio"],
    logging_steps=config["logging_steps"],
    save_strategy="steps",
    save_steps=config["eval_steps"],
    eval_strategy="steps",
    # eval_steps=config["eval_steps"],

    fp16=False,
    report_to="none",
    load_best_model_at_end=True,
    # metric_for_best_model="rougeL",
    greater_is_better=True,
    seed=config["seed"],
    optim=config["optim"],
)


# ---------------------------
# Trainer
# ---------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

trainer.train()

# Save LoRA adapter only
adapter_dir = os.path.join(config["output_dir"], "lora_adapter")
os.makedirs(adapter_dir, exist_ok=True)
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(config["output_dir"])

print(f"\nDone. LoRA adapter saved to: {adapter_dir}")

### inference

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# paths
base_model_name = "Qwen/Qwen3-1.7B"   # or whatever base model you used
ckpt_path = "/home/khointn/summarization_adapter/nbs/qwen3-1.7b-lora-384ctx/checkpoint-600"

# reload tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True, trust_remote_code=True, truncate=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# reload base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.bfloat16)

# attach LoRA weights from checkpoint
model = PeftModel.from_pretrained(base_model, ckpt_path)

# (optional) merge for faster inference
try:
    model = model.merge_and_unload()
except Exception:
    pass

# put on device
device = "cuda" # if torch.cuda.is_available() else "cpu"
model.generation_config.update(
    max_new_tokens=config["max_target_tokens"],
    num_beams=1,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

model = model.to(device)
model.eval()

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  9.69it/s]


: 

In [5]:
import torch, gc
import numpy as np
import evaluate

# Reuse your tokenizer, model, and eval_ds (tokenized with input_ids, attention_mask, labels)
# Assumptions:
#   - eval_ds[i]["input_ids"] / ["attention_mask"] / ["labels"] are python lists of ints
#   - tokenizer has pad_token_id and eos_token_id set

rouge_metric = evaluate.load("rouge")
bleu_metric  = evaluate.load("bleu")

@torch.no_grad()
def streaming_eval(
    model,
    tokenizer,
    eval_ds,
    *,
    n_samples=256,              # limit for quick eval
    batch_size=1,               # keep tiny
    max_input_ctx=None,         # hard cap input ctx tokens (optional)
    max_new_tokens=128,         # cap generated length
    device=None,
    clear_cuda_every=8,         # free CUDA cache periodically
):
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    model.eval().to(device)

    # Process in small slices, no big lists
    end = min(n_samples, len(eval_ds))
    for start in range(0, end, batch_size):
        stop = min(start + batch_size, end)
        batch = eval_ds.select(range(start, stop))

        # 1) Truncate inputs (optional safety cap)
        input_ids = batch["input_ids"]
        attention = batch["attention_mask"]
        labels    = batch["labels"]

        if max_input_ctx is not None:
            input_ids = [ids[:max_input_ctx] for ids in input_ids]
            attention = [att[:max_input_ctx] for att in attention]
            # labels are not fed to generate, no need to truncate here

        # 2) Pad this batch
        max_len = max(len(x) for x in input_ids)
        pad_id = tokenizer.pad_token_id
        batch_input = [x + [pad_id]*(max_len - len(x)) for x in input_ids]
        batch_attn  = [x + [0]*(max_len - len(x))  for x in attention]

        batch_input = torch.tensor(batch_input, dtype=torch.long, device=device)
        batch_attn  = torch.tensor(batch_attn,  dtype=torch.long, device=device)

        # 3) Generate
        gen = model.generate(
            input_ids=batch_input,
            attention_mask=batch_attn,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

        # 4) Decode predictions (streaming: only this batch)
        decoded_preds = tokenizer.batch_decode(gen, skip_special_tokens=True)

        # 5) Prepare and decode labels for this batch (replace -100 with pad for decode)
        #    (We never feed labels to the model here, just for metrics.)
        max_lab = max(len(l) for l in labels)
        labels_padded = [
            [tok if tok != -100 else pad_id for tok in l] + [pad_id]*(max_lab - len(l))
            for l in labels
        ]
        decoded_refs = tokenizer.batch_decode(
            torch.tensor(labels_padded, dtype=torch.long), skip_special_tokens=True
        )

        # 6) Minimal post-processing for ROUGE-Lsum
        decoded_preds = [p.strip() for p in decoded_preds]
        decoded_refs  = [r.strip() for r in decoded_refs]
        decoded_preds = ["\n".join(p.splitlines()) for p in decoded_preds]
        decoded_refs  = ["\n".join(r.splitlines()) for r in decoded_refs]

        # 7) Stream into metrics (no big lists kept)
        rouge_metric.add_batch(predictions=decoded_preds, references=decoded_refs)
        bleu_metric.add_batch(predictions=decoded_preds, references=[[r] for r in decoded_refs])

        # 8) Free everything we can
        del batch_input, batch_attn, gen
        if torch.cuda.is_available() and (start // batch_size) % clear_cuda_every == 0:
            torch.cuda.empty_cache()
        gc.collect()

    # 9) Compute final metrics
    rouge = rouge_metric.compute(use_stemmer=True)
    bleu  = bleu_metric.compute()
    return {
        "rouge1": round(rouge["rouge1"] * 100, 4),
        "rouge2": round(rouge["rouge2"] * 100, 4),
        "rougeL": round(rouge["rougeL"] * 100, 4),
        "rougeLsum": round(rouge["rougeLsum"] * 100, 4),
        "bleu": round(bleu["bleu"] * 100, 4),
    }

# ---- Usage example (super light) ----
scores = streaming_eval(
    model,
    tokenizer,
    val_ds,
    n_samples=100,           # evaluate on a tiny slice
    batch_size=1,            # minimize peak RAM
    max_input_ctx=512,       # hard cap context if your eval_ds has longer
    max_new_tokens=128,      # short generations
)
print(scores)
# qwen 1.7-384ctx checkpoint 600

{'rouge1': np.float64(35.4827), 'rouge2': np.float64(35.2095), 'rougeL': np.float64(35.469), 'rougeLsum': np.float64(35.3707), 'bleu': 20.3816}


In [None]:
import torch, gc
import numpy as np
import evaluate

# Reuse your tokenizer, model, and eval_ds (tokenized with input_ids, attention_mask, labels)
# Assumptions:
#   - eval_ds[i]["input_ids"] / ["attention_mask"] / ["labels"] are python lists of ints
#   - tokenizer has pad_token_id and eos_token_id set

rouge_metric = evaluate.load("rouge")
bleu_metric  = evaluate.load("bleu")

@torch.no_grad()
def streaming_eval(
    model,
    tokenizer,
    eval_ds,
    *,
    n_samples=256,              # limit for quick eval
    batch_size=1,               # keep tiny
    max_input_ctx=None,         # hard cap input ctx tokens (optional)
    max_new_tokens=128,         # cap generated length
    device=None,
    clear_cuda_every=8,         # free CUDA cache periodically
):
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    model.eval().to(device)

    # Process in small slices, no big lists
    end = min(n_samples, len(eval_ds))
    for start in range(0, end, batch_size):
        stop = min(start + batch_size, end)
        batch = eval_ds.select(range(start, stop))

        # 1) Truncate inputs (optional safety cap)
        input_ids = batch["input_ids"]
        attention = batch["attention_mask"]
        labels    = batch["labels"]

        if max_input_ctx is not None:
            input_ids = [ids[:max_input_ctx] for ids in input_ids]
            attention = [att[:max_input_ctx] for att in attention]
            # labels are not fed to generate, no need to truncate here

        # 2) Pad this batch
        max_len = max(len(x) for x in input_ids)
        pad_id = tokenizer.pad_token_id
        batch_input = [x + [pad_id]*(max_len - len(x)) for x in input_ids]
        batch_attn  = [x + [0]*(max_len - len(x))  for x in attention]

        batch_input = torch.tensor(batch_input, dtype=torch.long, device=device)
        batch_attn  = torch.tensor(batch_attn,  dtype=torch.long, device=device)

        # 3) Generate
        gen = model.generate(
            input_ids=batch_input,
            attention_mask=batch_attn,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

        # 4) Decode predictions (streaming: only this batch)
        decoded_preds = tokenizer.batch_decode(gen, skip_special_tokens=True)

        # 5) Prepare and decode labels for this batch (replace -100 with pad for decode)
        #    (We never feed labels to the model here, just for metrics.)
        max_lab = max(len(l) for l in labels)
        labels_padded = [
            [tok if tok != -100 else pad_id for tok in l] + [pad_id]*(max_lab - len(l))
            for l in labels
        ]
        decoded_refs = tokenizer.batch_decode(
            torch.tensor(labels_padded, dtype=torch.long), skip_special_tokens=True
        )

        # 6) Minimal post-processing for ROUGE-Lsum
        decoded_preds = [p.strip() for p in decoded_preds]
        decoded_refs  = [r.strip() for r in decoded_refs]
        decoded_preds = ["\n".join(p.splitlines()) for p in decoded_preds]
        decoded_refs  = ["\n".join(r.splitlines()) for r in decoded_refs]

        # 7) Stream into metrics (no big lists kept)
        rouge_metric.add_batch(predictions=decoded_preds, references=decoded_refs)
        bleu_metric.add_batch(predictions=decoded_preds, references=[[r] for r in decoded_refs])

        # 8) Free everything we can
        del batch_input, batch_attn, gen
        if torch.cuda.is_available() and (start // batch_size) % clear_cuda_every == 0:
            torch.cuda.empty_cache()
        gc.collect()

    # 9) Compute final metrics
    rouge = rouge_metric.compute(use_stemmer=True)
    bleu  = bleu_metric.compute()
    return {
        "rouge1": round(rouge["rouge1"] * 100, 4),
        "rouge2": round(rouge["rouge2"] * 100, 4),
        "rougeL": round(rouge["rougeL"] * 100, 4),
        "rougeLsum": round(rouge["rougeLsum"] * 100, 4),
        "bleu": round(bleu["bleu"] * 100, 4),
    }

# ---- Usage example (super light) ----
scores = streaming_eval(
    model,
    tokenizer,
    val_ds,
    n_samples=100,           # evaluate on a tiny slice
    batch_size=1,            # minimize peak RAM
    max_input_ctx=512,       # hard cap context if your eval_ds has longer
    max_new_tokens=128,      # short generations
)
print(scores)
# qwen 1.7-384ctx checkpoint 200

{'rouge1': np.float64(34.2349), 'rouge2': np.float64(33.9942), 'rougeL': np.float64(34.2084), 'rougeLsum': np.float64(34.1441), 'bleu': 19.5966}


In [None]:
import torch, gc
import numpy as np
import evaluate

# Reuse your tokenizer, model, and eval_ds (tokenized with input_ids, attention_mask, labels)
# Assumptions:
#   - eval_ds[i]["input_ids"] / ["attention_mask"] / ["labels"] are python lists of ints
#   - tokenizer has pad_token_id and eos_token_id set

rouge_metric = evaluate.load("rouge")
bleu_metric  = evaluate.load("bleu")

@torch.no_grad()
def streaming_eval(
    model,
    tokenizer,
    eval_ds,
    *,
    n_samples=256,              # limit for quick eval
    batch_size=1,               # keep tiny
    max_input_ctx=None,         # hard cap input ctx tokens (optional)
    max_new_tokens=128,         # cap generated length
    device=None,
    clear_cuda_every=8,         # free CUDA cache periodically
):
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    model.eval().to(device)

    # Process in small slices, no big lists
    end = min(n_samples, len(eval_ds))
    for start in range(0, end, batch_size):
        stop = min(start + batch_size, end)
        batch = eval_ds.select(range(start, stop))

        # 1) Truncate inputs (optional safety cap)
        input_ids = batch["input_ids"]
        attention = batch["attention_mask"]
        labels    = batch["labels"]

        if max_input_ctx is not None:
            input_ids = [ids[:max_input_ctx] for ids in input_ids]
            attention = [att[:max_input_ctx] for att in attention]
            # labels are not fed to generate, no need to truncate here

        # 2) Pad this batch
        max_len = max(len(x) for x in input_ids)
        pad_id = tokenizer.pad_token_id
        batch_input = [x + [pad_id]*(max_len - len(x)) for x in input_ids]
        batch_attn  = [x + [0]*(max_len - len(x))  for x in attention]

        batch_input = torch.tensor(batch_input, dtype=torch.long, device=device)
        batch_attn  = torch.tensor(batch_attn,  dtype=torch.long, device=device)

        # 3) Generate
        gen = model.generate(
            input_ids=batch_input,
            attention_mask=batch_attn,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

        # 4) Decode predictions (streaming: only this batch)
        decoded_preds = tokenizer.batch_decode(gen, skip_special_tokens=True)

        # 5) Prepare and decode labels for this batch (replace -100 with pad for decode)
        #    (We never feed labels to the model here, just for metrics.)
        max_lab = max(len(l) for l in labels)
        labels_padded = [
            [tok if tok != -100 else pad_id for tok in l] + [pad_id]*(max_lab - len(l))
            for l in labels
        ]
        decoded_refs = tokenizer.batch_decode(
            torch.tensor(labels_padded, dtype=torch.long), skip_special_tokens=True
        )

        # 6) Minimal post-processing for ROUGE-Lsum
        decoded_preds = [p.strip() for p in decoded_preds]
        decoded_refs  = [r.strip() for r in decoded_refs]
        decoded_preds = ["\n".join(p.splitlines()) for p in decoded_preds]
        decoded_refs  = ["\n".join(r.splitlines()) for r in decoded_refs]

        # 7) Stream into metrics (no big lists kept)
        rouge_metric.add_batch(predictions=decoded_preds, references=decoded_refs)
        bleu_metric.add_batch(predictions=decoded_preds, references=[[r] for r in decoded_refs])

        # 8) Free everything we can
        del batch_input, batch_attn, gen
        if torch.cuda.is_available() and (start // batch_size) % clear_cuda_every == 0:
            torch.cuda.empty_cache()
        gc.collect()

    # 9) Compute final metrics
    rouge = rouge_metric.compute(use_stemmer=True)
    bleu  = bleu_metric.compute()
    return {
        "rouge1": round(rouge["rouge1"] * 100, 4),
        "rouge2": round(rouge["rouge2"] * 100, 4),
        "rougeL": round(rouge["rougeL"] * 100, 4),
        "rougeLsum": round(rouge["rougeLsum"] * 100, 4),
        "bleu": round(bleu["bleu"] * 100, 4),
    }

# ---- Usage example (super light) ----
scores = streaming_eval(
    model,
    tokenizer,
    val_ds,
    n_samples=100,           # evaluate on a tiny slice
    batch_size=1,            # minimize peak RAM
    max_input_ctx=512,       # hard cap context if your eval_ds has longer
    max_new_tokens=128,      # short generations
)
print(scores)

# qwen 0.6 checkpoint 2000

{'rouge1': np.float64(39.5134), 'rouge2': np.float64(39.2952), 'rougeL': np.float64(39.5022), 'rougeLsum': np.float64(39.3779), 'bleu': 22.8359}


In [8]:
# eval_args = TrainingArguments(
#     output_dir="./tmp-eval",
#     per_device_eval_batch_size=1,          # keep tiny
#     # eval_accumulation_steps=1,             # stream metrics to CPU to save GPU RAM
#     # dataloader_pin_memory=False,           # sometimes helps stability
#     fp16=False,                            # keep off unless you know bf16/amp works
#     bf16=False,
#     report_to="none",
# )

# trainer=Trainer(
#             model=model,
#             args=eval_args,
#             # train_dataset=train_ds,
#             eval_dataset=val_ds,
#             tokenizer=tokenizer,
#             data_collator=data_collator,
#             compute_metrics=compute_metrics)
# n_samples = 4
# subset = val_ds.select(range(min(n_samples, len(val_ds))))

# scores = trainer.evaluate(
#     subset,
#     metric_key_prefix="quick_eval",
# )

# print("Eval result:", scores)

## slm

In [1]:
import os
from typing import Dict, List, Tuple
import numpy as np
import torch
from dataclasses import dataclass

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    PreTrainedTokenizerBase,
)

import evaluate

# =======================
# Config
# =======================
MODEL_NAME = "Qwen/Qwen3-0.6B"  # or "Qwen/Qwen2.5-1.5B"
OUTPUT_DIR = "qwen3-0.6b"

TEXT_COLUMN = os.environ.get("TEXT_COLUMN", "text")
SUMMARY_COLUMN = os.environ.get("SUMMARY_COLUMN", "summary")

MAX_INPUT_TOKENS = int(os.environ.get("MAX_INPUT_TOKENS", 512))
MAX_TARGET_TOKENS = int(os.environ.get("MAX_TARGET_TOKENS", 128))
MAX_SEQ_LEN = MAX_INPUT_TOKENS + MAX_TARGET_TOKENS

BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 2))
GRAD_ACCUM = int(os.environ.get("GRAD_ACCUM", 8))
LR = float(os.environ.get("LR", 2e-5))
NUM_EPOCHS = float(os.environ.get("NUM_EPOCHS", 1))
WARMUP_RATIO = float(os.environ.get("WARMUP_RATIO", 0.03))
EVAL_STEPS = int(os.environ.get("EVAL_STEPS", 20))
LOGGING_STEPS = int(os.environ.get("LOGGING_STEPS", 20))

# Prompt format
INSTR_PREFIX = (
    "You are a helpful assistant that writes concise, faithful summaries of legislative text.\n\n"
    "Task: Read the following bill text and write a clear, accurate summary.\n\n"
    "Bill Text:\n"
)
RESPONSE_PREFIX = "\n\nSummary:"  # we will mask loss before this

# =======================
# Tokenizer & Model
# =======================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Many decoder-only tokenizers don't come with a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
)

# =======================
# Dataset
# =======================
raw = load_dataset("billsum", split="test")
# small validation slice
# if len(raw["train"]) > 40000:
#     val_size = 2000
# else:
#     val_size = max(1000, int(0.05 * len(raw["train"])))
split = raw.train_test_split(test_size=0.2, seed=42)
train_ds = split["train"]
val_ds = split["test"]

# =======================
# Build samples with masked labels
# =======================
response_template = RESPONSE_PREFIX + " "
response_template_ids = tokenizer(response_template, add_special_tokens=False).input_ids

def format_example(ex: Dict[str, str]) -> Tuple[str, str]:
    src = (ex.get(TEXT_COLUMN) or "").strip()
    tgt = (ex.get(SUMMARY_COLUMN) or "").strip()
    prompt = f"{INSTR_PREFIX}{src}{RESPONSE_PREFIX} "
    target = tgt
    return prompt, target

def tokenize_and_mask(example: Dict[str, str]) -> Dict[str, List[int]]:
    prompt, target = format_example(example)

    # Truncate prompt/target separately for better control
    prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"][:MAX_INPUT_TOKENS]
    target_ids = tokenizer(target, add_special_tokens=False)["input_ids"][:MAX_TARGET_TOKENS]

    input_ids = prompt_ids + target_ids + [tokenizer.eos_token_id]
    input_ids = input_ids[:MAX_SEQ_LEN]

    # Build labels: -100 over the prompt tokens, actual ids over the target
    labels = [-100] * len(prompt_ids) + target_ids + [tokenizer.eos_token_id]
    labels = labels[:MAX_SEQ_LEN]

    attention_mask = [1] * len(input_ids)
    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask,
    }

train_ds = train_ds.map(tokenize_and_mask, remove_columns=train_ds.column_names, desc="Tokenize train")
val_ds = val_ds.map(tokenize_and_mask, remove_columns=val_ds.column_names, desc="Tokenize eval")

# =======================
# Data collator (pads labels with -100)
# =======================
@dataclass
class DataCollatorForCausalLMWithMaskedLabels:
    tokenizer: PreTrainedTokenizerBase
    label_pad_token_id: int = -100
    pad_to_multiple_of: int = 8

    def __call__(self, features):
        # features: list of dicts with input_ids, labels, attention_mask
        batch_input_ids, batch_labels, batch_attention = [], [], []
        max_len = max(len(f["input_ids"]) for f in features)
        if self.pad_to_multiple_of:
            # round up to nearest multiple
            if max_len % self.pad_to_multiple_of != 0:
                max_len = ((max_len // self.pad_to_multiple_of) + 1) * self.pad_to_multiple_of

        for f in features:
            ids = f["input_ids"]
            attn = f["attention_mask"]
            lbl = f["labels"]

            pad_len = max_len - len(ids)
            batch_input_ids.append(ids + [self.tokenizer.pad_token_id] * pad_len)
            batch_attention.append(attn + [0] * pad_len)
            batch_labels.append(lbl + [self.label_pad_token_id] * pad_len)

        return {
            "input_ids": torch.tensor(batch_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(batch_attention, dtype=torch.long),
            "labels": torch.tensor(batch_labels, dtype=torch.long),
        }

data_collator = DataCollatorForCausalLMWithMaskedLabels(tokenizer=tokenizer, pad_to_multiple_of=8)

# =======================
# Metrics (ROUGE + BLEU)
# =======================
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

def _postprocess_text(preds: List[str], labels: List[str]) -> Tuple[List[str], List[str]]:
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    # ROUGE-Lsum expects sentence-per-line
    preds = ["\n".join(p.splitlines()) for p in preds]
    labels = ["\n".join(l.splitlines()) for l in labels]
    return preds, labels

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 in labels so we can decode
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = _postprocess_text(decoded_preds, decoded_labels)

    rouge = rouge_metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    bleu = bleu_metric.compute(
        predictions=decoded_preds, references=[[r] for r in decoded_labels]
    )

    return {
        "rouge1": round(rouge["rouge1"] * 100, 4),
        "rouge2": round(rouge["rouge2"] * 100, 4),
        "rougeL": round(rouge["rougeL"] * 100, 4),
        "rougeLsum": round(rouge["rougeLsum"] * 100, 4),
        "bleu": round(bleu["bleu"] * 100, 4),
    }

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Tokenize train: 100%|██████████| 2615/2615 [00:15<00:00, 172.72 examples/s]
Tokenize eval: 100%|██████████| 654/654 [00:03<00:00, 181.55 examples/s]


In [None]:
# =======================
# Training args
# =======================
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,

    logging_steps=LOGGING_STEPS,
    save_strategy="steps",
    eval_strategy="steps",
    save_steps=EVAL_STEPS,
    eval_steps=EVAL_STEPS,

    # Enable generation-based eval
    # predict_with_generate=True,
    # generation_max_length=MAX_TARGET_TOKENS,
    # generation_num_beams=1,

    fp16=False,  # set bf16=True if your hardware supports it (and torch.bfloat16 above)
    report_to="none",

    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
)

# =======================
# Trainer
# =======================
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Save final
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("\nDone. Model & tokenizer saved to:", OUTPUT_DIR)

## flan-t5

In [1]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
billsum = load_dataset("cnn_dailymail", "3.0.0", split="test")
billsum = billsum.train_test_split(test_size=0.2)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 9192
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 2298
    })
})

In [3]:
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map: 100%|██████████| 9192/9192 [00:09<00:00, 980.78 examples/s] 
Map: 100%|██████████| 2298/2298 [00:01<00:00, 1394.19 examples/s]


In [6]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [7]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result_rouge = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result_rouge["gen_len"] = np.mean(prediction_lens)

    result_bleu = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels],
    )

    # Combine
    return {
        "rouge1": round(result_rouge["rouge1"] * 100, 4),
        "rouge2": round(result_rouge["rouge2"] * 100, 4),
        "rougeL": round(result_rouge["rougeL"] * 100, 4),
        "rougeLsum": round(result_rouge["rougeLsum"] * 100, 4),
        "bleu": round(result_bleu["bleu"] * 100, 4),
    }

Downloading builder script: 5.94kB [00:00, 8.83MB/s]
Downloading extra modules: 3.34kB [00:00, 7.24MB/s]


In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="flan-t5",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [7]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [8]:
from transformers import pipeline

trained_checkpoint = "/home/khointn/summarization_adapter/nbs/my_awesome_billsum_model/checkpoint-620"
summarizer = pipeline("summarization", model=trained_checkpoint)
summarizer(text)

Device set to use cuda:0
Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]