# English→Spanish LoRA/QLoRA Fine-Tuning

Fine-tune a causal language model on `loresiensis/corpus-en-es` with parameter-efficient LoRA adapters. Set `config["use_qlora"]` to toggle QLoRA, which loads the base weights in 4-bit precision so that larger checkpoints can be trained with less memory.

## Notebook Outline
- Install dependencies (transformers, datasets, peft, bitsandbytes, accelerate, evaluate, sacrebleu).
- Configure the base checkpoint, dataset, and LoRA/QLoRA hyperparameters.
- Load and tokenize the English→Spanish corpus with an instruction-style prompt.
- Prepare a LoRA-wrapped model (optionally quantized with QLoRA).
- Train with `Trainer`, track SacreBLEU during evaluation, save the LoRA adapter, and optionally export merged weights.

In [None]:
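# Uncomment to install the dependencies (inside a notebook, use `%pip` so packages go to the kernel's environment):
# %pip install -q transformers datasets peft bitsandbytes accelerate evaluate sacrebleu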

In [None]:
import os
from typing import Dict, List

import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
import evaluate

In [None]:
# Central run configuration; later cells read every setting from this dict.
config = dict(
    # Model, data, and output locations
    base_model="HuggingFaceTB/SmolLM-135M",
    dataset_name="loresiensis/corpus-en-es",
    output_dir="outputs/smol-lora",
    # Load the base model in 4-bit (QLoRA) when True
    use_qlora=True,
    # Sub-sampling limits and dataset column names
    max_train_samples=4000,
    max_eval_samples=500,
    source_lang_key="EN",
    target_lang_key="ES",
    # Tokenization
    max_length=256,
    # Optimization
    learning_rate=2e-4,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.03,
    weight_decay=0.01,
    # Logging / evaluation / checkpoint cadence (in optimizer steps)
    logging_steps=10,
    eval_steps=100,
    save_steps=400,
)

# Prefer the GPU whenever PyTorch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Make sure the output directory exists before anything is written to it.
os.makedirs(config["output_dir"], exist_ok=True)

In [None]:
# Load the tokenizer with right-side padding; fall back to EOS as the pad
# token when the checkpoint does not define one.
tokenizer = AutoTokenizer.from_pretrained(config["base_model"], padding_side="right")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

raw_datasets: DatasetDict = load_dataset(config["dataset_name"])

# Keep only the first N rows of each split when a (truthy) limit is configured.
for split_name, limit_key in (("train", "max_train_samples"), ("test", "max_eval_samples")):
    limit = config[limit_key]
    if limit:
        split = raw_datasets[split_name]
        raw_datasets[split_name] = split.select(range(min(len(split), limit)))

# Instruction-style prompt; `{src}` is replaced with the English sentence.
prompt_template = (
    "Translate the following English sentence into natural Spanish.\n"
    "English: {src}\n"
    "Spanish:"
)

In [None]:
def build_prompt(text: str) -> str:
    """Return the translation prompt for *text*, with surrounding whitespace removed."""
    cleaned = text.strip()
    return prompt_template.format(src=cleaned)

def tokenize_function(example: Dict[str, str]) -> Dict[str, List[int]]:
    """Tokenize one translation pair for causal-LM fine-tuning.

    The prompt and the target sentence are joined into a single sequence that
    ends with the tokenizer's EOS token, so the model learns where a
    translation stops. Prompt positions are set to -100 in ``labels`` so the
    loss only covers the target (and EOS) tokens.

    Args:
        example: A dataset row holding the source and target sentences under
            the keys named by ``config["source_lang_key"]`` and
            ``config["target_lang_key"]``.

    Returns:
        The tokenizer output for the full sequence with an added ``labels``
        list of the same length as ``input_ids``.
    """
    source_text = example[config["source_lang_key"]]
    target_text = example[config["target_lang_key"]]
    prompt = build_prompt(source_text)

    # Append EOS as text so that truncation drops it for over-long pairs
    # (a truncated target should not be followed by an end-of-sequence token).
    eos_text = tokenizer.eos_token or ""
    full_text = f"{prompt} {target_text.strip()}{eos_text}"

    tokenized = tokenizer(full_text, truncation=True, max_length=config["max_length"])
    labels = list(tokenized["input_ids"])
    prompt_ids = tokenizer(prompt, truncation=True, max_length=config["max_length"])["input_ids"]

    # Clamp the mask length: slice assignment with a longer list would
    # otherwise grow ``labels`` beyond ``input_ids``.
    masked = min(len(prompt_ids), len(labels))
    labels[:masked] = [-100] * masked
    tokenized["labels"] = labels
    return tokenized

# Tokenize every split row by row and drop the raw text columns, so only the
# fields produced by `tokenize_function` remain.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    remove_columns=original_columns,
    desc="Tokenizing dataset",
)

In [None]:
# Pad each batch dynamically; label padding uses -100 so padded positions
# are ignored by the loss.
collator_options = dict(
    padding=True,
    return_tensors="pt",
    label_pad_token_id=-100,
)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, **collator_options)

In [None]:
metric = evaluate.load("sacrebleu")


def compute_metrics(eval_pred):
    """Compute SacreBLEU over the target portion of each evaluation sequence.

    Accepts either raw logits of shape (batch, seq, vocab) or already
    arg-maxed token ids of shape (batch, seq) as predictions. These are
    teacher-forced next-token predictions, not free-running generations, so
    the score is a proxy for translation quality.

    Args:
        eval_pred: A ``(predictions, labels)`` pair as passed by ``Trainer``.
            ``predictions`` may be wrapped in a tuple, in which case the first
            element is used.

    Returns:
        A dict with a single ``"sacrebleu"`` entry holding the corpus score.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Raw logits must be reduced to token ids before they can be decoded.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # A causal LM's output at position i predicts token i + 1, so drop the
    # last prediction and the first label to line the two up.
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]

    # Score only target positions: prompt/padding labels are -100, and
    # predictions can also carry negative padding values.
    pad_id = tokenizer.pad_token_id
    scored = (labels != -100) & (predictions >= 0)
    predictions = np.where(scored, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    references = [
        [text.strip()]
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]
    bleu = metric.compute(predictions=decoded_preds, references=references)
    return {"sacrebleu": bleu["score"]}

In [None]:
def init_model():
    """Load the base checkpoint and wrap it with LoRA adapters.

    When ``config["use_qlora"]`` is true the base model is loaded in 4-bit
    NF4 with double quantization and prepared for k-bit training; otherwise
    it is loaded in full/half precision and moved to ``device``.

    bfloat16 is used only when the GPU reports support for it; otherwise the
    QLoRA compute dtype falls back to float16 and the non-quantized load to
    float32.

    LoRA hyperparameters can be overridden through the optional config keys
    ``lora_r``, ``lora_alpha``, ``lora_dropout`` and ``lora_target_modules``;
    the defaults match the previously hard-coded values.

    Returns:
        The PEFT-wrapped causal language model.
    """
    # CUDA being available does not imply bf16 support (older GPUs lack it).
    bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    model_kwargs = {}
    if config["use_qlora"]:
        compute_dtype = torch.bfloat16 if bf16_ok else torch.float16
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
        )
        model_kwargs["quantization_config"] = quant_config
        model_kwargs["device_map"] = "auto"
    else:
        model_kwargs["torch_dtype"] = torch.bfloat16 if bf16_ok else torch.float32

    model = AutoModelForCausalLM.from_pretrained(config["base_model"], **model_kwargs)
    if config["use_qlora"]:
        model = prepare_model_for_kbit_training(model)
    else:
        # Quantized models are placed by ``device_map``; others are moved here.
        model = model.to(device)

    peft_config = LoraConfig(
        r=config.get("lora_r", 8),
        lora_alpha=config.get("lora_alpha", 16),
        target_modules=config.get(
            "lora_target_modules", ["q_proj", "k_proj", "v_proj", "o_proj"]
        ),
        lora_dropout=config.get("lora_dropout", 0.05),
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, peft_config)
    return model

In [None]:
# Build the LoRA-wrapped model and print its trainable-parameter summary.
model = init_model()
model.print_trainable_parameters()

In [None]:
# Choose mixed precision from what the hardware supports:
# - bf16 only on GPUs that report bf16 support,
# - fp16 on other CUDA GPUs,
# - neither on CPU (enabling fp16 there is rejected by TrainingArguments).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_train_epochs"],
    per_device_train_batch_size=config["per_device_train_batch_size"],
    per_device_eval_batch_size=config["per_device_eval_batch_size"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    learning_rate=config["learning_rate"],
    warmup_ratio=config["warmup_ratio"],
    weight_decay=config["weight_decay"],
    logging_steps=config["logging_steps"],
    # Evaluate and checkpoint on a fixed step cadence.
    eval_strategy="steps",
    eval_steps=config["eval_steps"],
    save_strategy="steps",
    save_steps=config["save_steps"],
    save_total_limit=3,
    report_to="none",
    bf16=use_bf16,
    fp16=use_fp16,
)

In [None]:
def _argmax_logits(logits, labels):
    """Reduce evaluation logits to token ids before they are accumulated.

    Without this hook the evaluation loop keeps the full-vocabulary logits for
    every evaluated token in memory; keeping only the arg-max ids shrinks that
    by a factor of the vocabulary size.
    """
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    return logits.argmax(dim=-1)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_argmax_logits,
)

In [None]:
# Run fine-tuning; the bare expression on the last line displays the result in the notebook.
train_result = trainer.train()
train_result

In [None]:
# Evaluate on the eval split passed to the Trainer and display the resulting metrics.
metrics = trainer.evaluate()
metrics

In [None]:
# Save the trained adapter and the tokenizer side by side under the output directory.
adapter_dir = os.path.join(config["output_dir"], "lora-adapter")
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
# Display the path where the artifacts were written.
adapter_dir

In [None]:
# Optional: merge adapters into the base model for export
def merge_and_save(base_model_dir: str, adapter_dir: str, merged_dir: str, dtype=torch.float16):
    """Merge a saved LoRA adapter into its base model and write the result.

    Args:
        base_model_dir: Name or path of the base checkpoint to load.
        adapter_dir: Directory containing the saved adapter.
        merged_dir: Destination directory for the merged model and tokenizer.
        dtype: Torch dtype used when loading the base model.

    Note:
        The tokenizer written to ``merged_dir`` is the notebook-level
        ``tokenizer`` object, not one loaded from ``adapter_dir``.
    """
    merged_model = PeftModel.from_pretrained(
        AutoModelForCausalLM.from_pretrained(base_model_dir, torch_dtype=dtype),
        adapter_dir,
    ).merge_and_unload()
    # Write the model first, then the tokenizer, into the same directory.
    for artifact in (merged_model, tokenizer):
        artifact.save_pretrained(merged_dir)

# merge_and_save(config["base_model"], adapter_dir, os.path.join(config["output_dir"], "merged"))