# English→Spanish LoRA/QLoRA Fine-Tuning

Fine-tune a causal language model on `loresiensis/corpus-en-es` with parameter-efficient LoRA adapters. Set `use_qlora` in the config cell to switch to QLoRA, which loads the base model in 4-bit so that larger checkpoints fit in memory during training.

## Notebook Outline
- Install dependencies (transformers, datasets, peft, bitsandbytes, accelerate, evaluate, sacrebleu).
- Configure the base checkpoint, dataset, and LoRA/QLoRA hyperparameters.
- Load and tokenize the English→Spanish corpus with an instruction-style prompt.
- Prepare a LoRA-wrapped model (optionally quantized with QLoRA).
- Train with `Trainer`, track SacreBLEU, and persist adapters/merged weights.

In [None]:
# pip install transformers datasets peft bitsandbytes accelerate evaluate sacrebleu

In [2]:
import os
from typing import Dict, List

import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from transformers import DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
import evaluate

In [3]:
# Central experiment settings; the remaining cells read their values from here.
config = dict(
    # Checkpoint, corpus, and where artifacts are written
    base_model="HuggingFaceTB/SmolLM-135M",
    dataset_name="loresiensis/corpus-en-es",
    output_dir="outputs/smol-lora",
    use_qlora=True,
    # Dataset size caps and column names of the parallel corpus
    max_train_samples=100_000,
    max_eval_samples=500,
    source_lang_key="EN",
    target_lang_key="ES",
    max_length=256,
    # Optimisation schedule
    learning_rate=2e-4,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.03,
    weight_decay=0.01,
    # Logging / evaluation / checkpoint cadence (in optimizer steps)
    logging_steps=10,
    eval_steps=100,
    save_steps=400,
)

# Make sure the output directory exists before anything tries to write to it.
os.makedirs(config["output_dir"], exist_ok=True)

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Load the tokenizer that matches the base checkpoint; padding goes on the
# right so real tokens always start at position 0.
tokenizer = AutoTokenizer.from_pretrained(
    config["base_model"],
    padding_side="right",
)

# Checkpoints without a dedicated pad token reuse EOS for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
from typing import Any
def preprocess_batch(batch: Dict[str, List[Any]]) -> Dict[str, Any]:
    """Tokenize a batch of English/Spanish pairs into causal-LM features.

    Each example is rendered as an instruction prompt followed by the Spanish
    target and the EOS token, then tokenized to exactly ``config["max_length"]``
    tokens (truncated or padded).

    Args:
        batch: Column-oriented batch holding the source and target sentences
            under the keys named by ``config["source_lang_key"]`` and
            ``config["target_lang_key"]`` (defaulting to ``"EN"`` / ``"ES"``).

    Returns:
        The tokenizer output extended with:
        ``labels`` - a copy of ``input_ids`` where prompt tokens and padding
        positions are set to -100, so only the Spanish target (and its EOS)
        contributes to the loss;
        ``length`` - the length of each ``input_ids`` row.
    """
    en_texts = batch[config.get("source_lang_key", "EN")]
    es_texts = batch[config.get("target_lang_key", "ES")]

    prompts = [
        "Translate the following English text into Spanish.\n\n"
        f"English: {en}\n"
        "Spanish:"
        for en in en_texts
    ]

    full_texts = [
        p + " " + es + tokenizer.eos_token
        for p, es in zip(prompts, es_texts)
    ]

    tokenized = tokenizer(
        full_texts,
        max_length=config["max_length"],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )

    # Tokenize every prompt in one call (no EOS, no target) to learn how many
    # leading tokens of each full sequence belong to the prompt.
    # NOTE(review): prompts are tokenized without special tokens; if the
    # tokenizer prepends a BOS token to the full text, the prompt mask would be
    # one token short - confirm for the chosen checkpoint.
    prompt_lens = [
        len(ids)
        for ids in tokenizer(prompts, add_special_tokens=False)["input_ids"]
    ]

    # Build labels: ignore (-100) the prompt and every padding position.
    # Padding must be detected through the attention mask, not the token id,
    # because the pad token may be the EOS token, which has to stay supervised
    # where it terminates the target.
    labels = [
        [
            token_id if (pos >= prompt_len and attended) else -100
            for pos, (token_id, attended) in enumerate(zip(input_ids, attention_mask))
        ]
        for input_ids, attention_mask, prompt_len in zip(
            tokenized["input_ids"], tokenized["attention_mask"], prompt_lens
        )
    ]

    tokenized["labels"] = labels
    tokenized['length'] = [len(ids) for ids in tokenized['input_ids']]
    return tokenized

In [None]:
raw_datasets = load_dataset(config["dataset_name"])

# Hold out 5% of the training split for evaluation (seeded for reproducibility).
split = raw_datasets["train"].train_test_split(test_size=0.05, seed=42)


def _cap_rows(dataset, limit):
    """Return at most ``limit`` leading rows of ``dataset``; no-op if ``limit`` is falsy."""
    if limit and len(dataset) > limit:
        return dataset.select(range(limit))
    return dataset


def _tokenize_split(dataset):
    """Apply ``preprocess_batch`` and keep only the columns it produces."""
    return dataset.map(
        preprocess_batch,
        batched=True,
        remove_columns=dataset.column_names,
    )


# Honor the size caps declared in the config before paying for tokenization.
train_dataset = _tokenize_split(
    _cap_rows(split["train"], config.get("max_train_samples"))
)
eval_dataset = _tokenize_split(
    _cap_rows(split["test"], config.get("max_eval_samples"))
)

In [5]:
from datasets import load_dataset
from datasets import Dataset

# Pull the English-Spanish pairs of OPUS-100 and convert each split to pandas.
ds = load_dataset("Helsinki-NLP/opus-100", "en-es")
train_df, test_df, val_df = (
    ds[split_name].to_pandas()
    for split_name in ("train", "test", "validation")
)

def preprocess_text(train_df, max_size=-1):
    """Flatten, clean, de-duplicate and optionally subsample a translation frame.

    Args:
        train_df: DataFrame with a ``translation`` column whose items are
            mappings with ``'en'`` and ``'es'`` entries, e.g.
            ``{'en': "It was the asbestos in here, ...", 'es': 'Fueron los asbestos aquí. ...'}``.
            The frame passed in is not modified.
        max_size: When positive, keep a seeded random sample of at most this
            many rows; otherwise keep every row.

    Returns:
        A ``Dataset`` with the columns ``EN`` and ``ES``.
    """
    translations = train_df["translation"]
    # assign() builds a new frame, so the caller's DataFrame stays untouched.
    cleaned = train_df.assign(
        EN=translations.apply(lambda pair: pair["en"]),
        ES=translations.apply(lambda pair: pair["es"]),
    ).drop(columns=["translation"])

    # Drop rows missing either side; a null target cannot be used for training.
    cleaned = cleaned.dropna(subset=["EN", "ES"])
    # Normalize surrounding whitespace so near-identical sentences de-duplicate.
    cleaned = cleaned.assign(
        EN=cleaned["EN"].str.strip(),
        ES=cleaned["ES"].str.strip(),
    )

    # Count after the null drop so the message reports duplicates only.
    before = len(cleaned)
    # Remove duplicate rows based on the 'EN' column (keep the first occurrence).
    cleaned = cleaned.drop_duplicates(subset="EN", keep="first").reset_index(drop=True)
    after = len(cleaned)
    print(f"Removed {before - after} duplicates. New shape: {cleaned.shape}")

    if max_size > 0:
        # Clamp the request: DataFrame.sample raises when asked for more rows
        # than exist. Reset the index so the shuffled row labels are not
        # carried into the dataset as an extra column.
        sample_size = min(max_size, len(cleaned))
        cleaned = cleaned.sample(sample_size, random_state=42).reset_index(drop=True)

    return Dataset.from_pandas(cleaned, preserve_index=False)

# Clean every split; only the training split is subsampled to the configured cap.
train_dataset = preprocess_text(train_df, config['max_train_samples'])
eval_dataset = preprocess_text(val_df)
test_dataset = preprocess_text(test_df)

# Bundle the three splits under their conventional names.
final_ds = DatasetDict(
    {
        'train': train_dataset,
        'validation': eval_dataset,
        'test': test_dataset,
    }
)

# Persist the cleaned corpus to a local directory.
final_ds.save_to_disk("en-es-dataset")

Removed 122441 duplicates. New shape: (877559, 2)
Removed 5 duplicates. New shape: (1995, 2)
Removed 3 duplicates. New shape: (1997, 2)


Saving the dataset (1/1 shards): 100%|██████████| 100000/100000 [00:00<00:00, 2721806.62 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1995/1995 [00:00<00:00, 681348.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1997/1997 [00:00<00:00, 795217.42 examples/s]


In [None]:
# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from `input_ids`
# on every batch, which throws away the prompt mask computed during
# preprocessing and, because the pad token is the EOS token here, also masks
# the real EOS. DataCollatorForSeq2Seq keeps the dataset's own `labels`.
_label_preserving_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100,
)


def data_collator(features):
    """Collate features into a batch while keeping the precomputed labels.

    Args:
        features: List of per-example feature dicts, as passed by ``Trainer``.

    Returns:
        The padded batch. When it contains ``labels``, every position with
        ``attention_mask == 0`` is set to -100 so padding never contributes
        to the loss.
    """
    batch = _label_preserving_collator(features)
    if "labels" in batch and "attention_mask" in batch:
        batch["labels"] = batch["labels"].masked_fill(
            batch["attention_mask"] == 0, -100
        )
    return batch

In [None]:
metric = evaluate.load("sacrebleu")


def compute_metrics(eval_pred):
    """Compute SacreBLEU between teacher-forced predictions and the references.

    Note that the predictions are the per-position argmax under teacher
    forcing, not free-running generations, so the score is only a proxy for
    translation quality.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is either the raw
            ``(batch, seq_len, vocab_size)`` scores or already-reduced
            ``(batch, seq_len)`` token ids; a tuple of outputs is accepted and
            its first element is used. ``labels`` is ``(batch, seq_len)`` with
            -100 at ignored positions.

    Returns:
        ``{"sacrebleu": <score>}``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Accept both raw logits and pre-argmaxed ids.
    pred_ids = logits.argmax(axis=-1) if logits.ndim == 3 else logits

    # A causal LM's output at position t scores the token at position t + 1,
    # so align predictions[:, :-1] with labels[:, 1:].
    pred_ids = pred_ids[:, :-1]
    labels = labels[:, 1:]

    # Only score supervised positions: blank out everything the loss ignores
    # (prompt / padding) in both predictions and references.
    supervised = labels != -100
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(supervised, pred_ids, pad_id)
    labels = np.where(supervised, labels, pad_id)

    pred_strs = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_strs = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # SacreBLEU expects a list of reference lists (one list per prediction).
    references = [[ref] for ref in label_strs]
    bleu = metric.compute(predictions=pred_strs, references=references)
    return {"sacrebleu": bleu["score"]}

In [None]:
def init_model():
    """Load the base checkpoint and wrap it with LoRA adapters.

    With ``config["use_qlora"]`` set, the base model is loaded in 4-bit NF4
    with double quantization and prepared for k-bit training; otherwise it is
    loaded in bf16 (when the GPU supports it) or fp32 and moved to ``device``.

    LoRA hyperparameters may be overridden through the optional config keys
    ``lora_r``, ``lora_alpha``, ``lora_dropout`` and ``lora_target_modules``;
    the defaults are 8, 16, 0.05 and the four attention projections.

    Returns:
        The PEFT-wrapped model.
    """
    use_qlora = config["use_qlora"]
    # bf16 needs hardware support; merely having CUDA is not enough.
    bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    model_kwargs = {}
    if use_qlora:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16 if bf16_ok else torch.float16,
        )
        model_kwargs["device_map"] = "auto"
    else:
        model_kwargs["torch_dtype"] = torch.bfloat16 if bf16_ok else torch.float32

    model = AutoModelForCausalLM.from_pretrained(config["base_model"], **model_kwargs)
    if use_qlora:
        model = prepare_model_for_kbit_training(model)
    else:
        # Without device_map the weights are not placed automatically.
        model = model.to(device)

    peft_config = LoraConfig(
        r=config.get("lora_r", 8),
        lora_alpha=config.get("lora_alpha", 16),
        target_modules=config.get(
            "lora_target_modules", ["q_proj", "k_proj", "v_proj", "o_proj"]
        ),
        lora_dropout=config.get("lora_dropout", 0.05),
        bias="none",
        task_type="CAUSAL_LM",
    )
    return get_peft_model(model, peft_config)

In [None]:
# Build the LoRA-wrapped model (4-bit quantized when config["use_qlora"] is set).
model = init_model()
# Print how many of the model's parameters are trainable.
model.print_trainable_parameters()

In [None]:
# Pick the mixed-precision mode from what the hardware actually supports:
# bf16 only on GPUs that implement it, fp16 on other GPUs, and neither on CPU
# (requesting fp16 without a GPU makes TrainingArguments raise).
_cuda_available = torch.cuda.is_available()
_use_bf16 = _cuda_available and torch.cuda.is_bf16_supported()
_use_fp16 = _cuda_available and not _use_bf16

training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_train_epochs"],
    per_device_train_batch_size=config["per_device_train_batch_size"],
    per_device_eval_batch_size=config["per_device_eval_batch_size"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    learning_rate=config["learning_rate"],
    warmup_ratio=config["warmup_ratio"],
    weight_decay=config["weight_decay"],
    logging_steps=config["logging_steps"],
    # Evaluate and checkpoint on a step schedule; keep only the 3 newest checkpoints.
    eval_strategy="steps",
    eval_steps=config["eval_steps"],
    save_strategy="steps",
    save_steps=config["save_steps"],
    save_total_limit=3,
    report_to="none",
    bf16=_use_bf16,
    fp16=_use_fp16,
)

In [None]:

# Re-establish the runtime basics so this evaluation cell can run on its own.
os.makedirs(config["output_dir"], exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
adapter_dir = os.path.join(config["output_dir"], "lora-adapter")

# Reload the base checkpoint, then attach the LoRA adapter stored in `adapter_dir`.
model = AutoModelForCausalLM.from_pretrained(
    config["base_model"],
    torch_dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_dir)

# Fresh Trainer around the reloaded model, wired up with the BLEU metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run one evaluation pass and show the resulting metrics.
metrics = trainer.evaluate()
print(metrics)

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Baseline for comparison: an off-the-shelf multilingual checkpoint.
model_id = "utter-project/EuroLLM-1.7B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    use_safetensors=True,
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# Zero-shot translation prompt for the baseline model.
text = "Translate the following to Spanish.\n\nEnglish: My name is EuroLLM. \n\nSpanish:"

# Keep the inputs on the same device as the model weights.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=125,
    # Pass the pad id explicitly so generate() does not have to guess it (and
    # warn "Setting `pad_token_id` to `eos_token_id`").
    pad_token_id=tokenizer.pad_token_id,
)
# Prints the prompt followed by the continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Translate the following to Spanish.

English: My name is EuroLLM. 

Spanish: Mi nombre es EuroLLM.

English: I am a student. 

Spanish: Soy estudiante.

English: I am a student. 

Spanish: Soy estudiante.

English: I am a student. 

Spanish: Soy estudiante.

English: I am a student. 

Spanish: Soy estudiante.

English: I am a student. 

Spanish: Soy estudiante.

English: I am a student. 

Spanish: Soy estudiante.

English: I am a student. 

Spanish: Soy estudiante
