# Englishâ†’Spanish LoRA/QLoRA Fine-Tuning

Fine-tune a causal language model on `loresiensis/corpus-en-es` with parameter-efficient adapters. Toggle QLoRA via the config to enable 4-bit training on larger checkpoints.

## Notebook Outline
- Install dependencies (transformers, datasets, peft, bitsandbytes, accelerate, evaluate, sacrebleu).
- Configure the base checkpoint, dataset, and LoRA/QLoRA hyperparameters.
- Load and tokenize the Englishâ†’Spanish corpus with an instruction-style prompt.
- Prepare a LoRA-wrapped model (optionally quantized with QLoRA).
- Train with `Trainer`, track SacreBLEU, and persist adapters/merged weights.

In [1]:
# pip install transformers datasets peft bitsandbytes accelerate evaluate sacrebleu

In [2]:
# This command configures the notebook to use the virtual environment's kernel.
# It's typically run once from the terminal, not within the notebook itself.
# !python -m ipykernel install --user --name=.venv --display-name "Python (.venv)"

In [3]:
import os
from typing import Dict, List

import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from transformers import DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
config = {
    "base_model": "HuggingFaceTB/SmolLM-135M",
    "dataset_name": "loresiensis/corpus-en-es",
    "output_dir": "outputs/smol-lora",
    "use_qlora": False,
    "max_train_samples": 100_000,
    "max_eval_samples": 2500,
    "source_lang_key": "EN",
    "target_lang_key": "ES",
    "max_length": 512,
    "learning_rate": 2e-4,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "logging_steps": 1000,
    "eval_steps": 5000,
    "save_steps": 5000
}

device = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(config["output_dir"], exist_ok=True)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(config["base_model"], padding_side="right")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# raw_datasets: DatasetDict = load_dataset(config["dataset_name"])

# if config["max_train_samples"]:
#     raw_datasets["train"] = raw_datasets["train"].select(range(min(len(raw_datasets["train"]), config["max_train_samples"])))
# if config["max_eval_samples"]:
#     raw_datasets["test"] = raw_datasets["test"].select(range(min(len(raw_datasets["test"]), config["max_eval_samples"])))

# prompt_template = "Translate the following English sentence into natural Spanish.\nEnglish: {src}\nSpanish:"

In [6]:
from typing import Any
def preprocess_batch(batch: Dict[str, List[Any]]) -> Dict[str, Any]:
    en_batch = batch['EN']
    es_batch = batch['ES']
    # translations = batch["translation"]  # each item is {"en": ..., "es": ...}
    en_texts = en_batch
    es_texts = es_batch

    prompts = [
        "Translate the following English text into Spanish.\n\n"
        f"English: {en}\n"
        "Spanish:"
        for en in en_texts
    ]

    full_texts = [
        p + " " + es + tokenizer.eos_token
        for p, es in zip(prompts, es_texts)
    ]

    tokenized = tokenizer(
        full_texts,
        max_length=config["max_length"],
        padding="max_length",
        truncation=True,
    )

    # Create labels and mask out the prompt part, so only Spanish tokens incur loss
    labels = []
    for full_text, prompt, es in zip(full_texts, prompts, es_texts):
        # full sequence
        input_ids = tokenized["input_ids"][len(labels)]
        # get tokenized prompt (no eos, no target)
        prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
        # initialize labels as a copy
        label_ids = input_ids.copy()

        # mask out prompt tokens (set to -100 so they are ignored by loss)
        prompt_len = len(prompt_ids)
        label_ids[:prompt_len] = [-100] * min(prompt_len, len(label_ids))

        labels.append(label_ids)

    tokenized["labels"] = labels
    tokenized['length'] = [len(ids) for ids in tokenized['input_ids']]
    return tokenized

In [7]:
# raw_datasets = load_dataset(config["dataset_name"])

# # raw_datasets has only "train" split; create a small validation split
# split = raw_datasets["train"].train_test_split(test_size=0.05, seed=42)
# train_dataset = split["train"]
# eval_dataset = split["test"]

# print(train_dataset[0])
# train_dataset = train_dataset.map(
#     preprocess_batch,
#     batched=True,
#     remove_columns=train_dataset.column_names,
# )

# eval_dataset = eval_dataset.map(
#     preprocess_batch,
#     batched=True,
#     remove_columns=eval_dataset.column_names,
# )

In [8]:
from datasets import load_dataset
from datasets import Dataset

ds = load_dataset("Helsinki-NLP/opus-100", "en-es")
train_df = ds["train"].to_pandas()
# test_df = ds["test"].to_pandas()
val_df = ds["validation"].to_pandas()

def preprocess_text(train_df, max_size=-1):
    # {'en': "It was the asbestos in here, that's what did it!", 'es': 'Fueron los asbestos aquÃ­. Â¡Eso es lo que ocurriÃ³!'}
    train_df['EN'] = train_df['translation'].apply(lambda x: x['en'])
    train_df['ES'] = train_df['translation'].apply(lambda x: x['es'])
    # remove the original 'translation' column
    train_df = train_df.drop(columns=['translation'])
    # remove duplicate rows based on the 'en' column (keep the first occurrence)
    before = len(train_df)
    # remove nulls
    train_df = train_df.dropna(subset=['EN'])
    # apply strip
    train_df["EN"] = train_df["EN"].str.strip()
    train_df = train_df.drop_duplicates(subset="EN", keep="first").reset_index(drop=True)
    after = len(train_df)
    print(f"Removed {before - after} duplicates. New shape: {train_df.shape}")
    if max_size > 0:
        train_df = train_df.sample(max_size, random_state=42) 
    # to datasets
    
    train_ds = Dataset.from_pandas(train_df)
    return train_ds

train_dataset = preprocess_text(train_df, config['max_train_samples'])
eval_dataset = preprocess_text(val_df)
# test_df = preprocess_text(test_df)
print(train_dataset[0])
train_dataset = train_dataset.map(
    preprocess_batch,
    batched=True,
    remove_columns=train_dataset.column_names,
)

eval_dataset = eval_dataset.map(
    preprocess_batch,
    batched=True,
    remove_columns=eval_dataset.column_names,
)

Removed 122441 duplicates. New shape: (877559, 2)
Removed 5 duplicates. New shape: (1995, 2)
{'EN': 'Oral hypoglycaemic agents (OHA), monoamine oxidase inhibitors (MAOI), non-selective beta- blocking agents, angiotensin converting enzyme (ACE) inhibitors, salicylates, alcohol, anabolic steroids and sulphonamides.', 'ES': 'Hipoglucemiantes orales (HO), inhibidores de la monoamino oxidasa (IMAO), betabloqueantes no selectivos, inhibidores de la enzima conversora de la angiotensina (IECA), salicilatos, alcohol, esteroides anabolizantes y sulfonamidas.', '__index_level_0__': 41764}


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100000/100000 [00:29<00:00, 3348.56 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1995/1995 [00:00<00:00, 3402.20 examples/s]


In [9]:
# import matplotlib.pyplot as plt

# plt.hist(train_dataset['length'], bins=30)
# plt.title('Distribution of tokenized sequence lengths')
# plt.xlabel('Length')
# plt.ylabel('Count')
# plt.show()

In [10]:
# # def build_prompt(text: str) -> str:
# #     return prompt_template.format(src=text.strip())

# def tokenize_function(example: Dict[str, str]) -> Dict[str, List[int]]:
#     source_text = example[config["source_lang_key"]]
#     target_text = example[config["target_lang_key"]]
#     prompt = build_prompt(source_text)
#     full_text = f"{prompt} {target_text.strip()}"

#     tokenized = tokenizer(full_text, truncation=True, max_length=config["max_length"])
#     labels = tokenized["input_ids"].copy()
#     prompt_ids = tokenizer(prompt, truncation=True, max_length=config["max_length"])["input_ids"]
#     labels[: len(prompt_ids)] = [-100] * len(prompt_ids)
#     tokenized["labels"] = labels
#     return tokenized

# tokenized_datasets = raw_datasets.map(
#     tokenize_function,
#     remove_columns=raw_datasets["train"].column_names,
#     desc="Tokenizing dataset",
# )

In [11]:
# data_collator = DataCollatorForSeq2Seq(
#     tokenizer=tokenizer,
#     padding=True,
#     return_tensors="pt",
#     label_pad_token_id=-100,
# )

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [12]:
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    logits, labels = eval_pred              # logits: (batch, seq_len, vocab_size)
    pred_ids = np.argmax(logits, axis=-1)   # -> (batch, seq_len)

    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    pred_strs = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_strs = tokenizer.batch_decode(labels, skip_special_tokens=True)

    references = [[ref] for ref in label_strs]
    bleu = metric.compute(predictions=pred_strs, references=references)
    return {"sacrebleu": bleu["score"]}

In [13]:
def init_model():
    model_kwargs = {}
    if config["use_qlora"]:
        compute_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
        )
        model_kwargs["quantization_config"] = quant_config
        model_kwargs["device_map"] = "auto"
    else:
        model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    model = AutoModelForCausalLM.from_pretrained(config["base_model"], **model_kwargs)
    if config["use_qlora"]:
        model = prepare_model_for_kbit_training(model)
    else:
        model = model.to(device)
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, peft_config)
    return model

In [14]:
model = init_model()
model.print_trainable_parameters()

`torch_dtype` is deprecated! Use `dtype` instead!


trainable params: 1,843,200 || all params: 136,358,208 || trainable%: 1.3517


In [15]:
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_train_epochs"],
    per_device_train_batch_size=config["per_device_train_batch_size"],
    per_device_eval_batch_size=config["per_device_eval_batch_size"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    learning_rate=config["learning_rate"],
    warmup_ratio=config["warmup_ratio"],
    weight_decay=config["weight_decay"],
    logging_steps=config["logging_steps"],
    eval_strategy="steps",
    eval_steps=config["eval_steps"],
    save_strategy="steps",
    save_steps=config["save_steps"],
    save_total_limit=3,
    report_to="none",
    bf16=torch.cuda.is_available(),
    fp16=not torch.cuda.is_available(),
)
# training_args = TrainingArguments(
#     output_dir=config["output_dir"],
#     overwrite_output_dir=True,
#     num_train_epochs=config["num_train_epochs"],
#     per_device_train_batch_size=config["per_device_train_batch_size"],
#     per_device_eval_batch_size=config["per_device_eval_batch_size"],
#     gradient_accumulation_steps=config["gradient_accumulation_steps"],
#     learning_rate=config["learning_rate"],
#     warmup_ratio=config["warmup_ratio"],
#     lr_scheduler_type="cosine",
#     logging_steps=50,
#     eval_strategy="steps",
#     eval_steps=10,
#     save_strategy="steps",
#     save_steps=500,
#     save_total_limit=3,
#         # ðŸ”‘ The important bits:
#     bf16=torch.cuda.is_available(),        # training in bf16
#     bf16_full_eval=torch.cuda.is_available(),  # eval ALSO in bf16

#     report_to="none",
#     load_best_model_at_end=True,
#     seed=42,
# )

In [16]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
train_result = trainer.train()
train_result

Step,Training Loss,Validation Loss
5000,2.048,2.031664
10000,1.9573,1.96582


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [18]:
metrics = trainer.evaluate()
metrics

{'eval_loss': 1.8940869569778442,
 'eval_runtime': 23.4725,
 'eval_samples_per_second': 84.993,
 'eval_steps_per_second': 21.259,
 'epoch': 5.0}

In [21]:
adapter_dir = os.path.join(config["output_dir"], "lora-adapter")
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
adapter_dir

'outputs/smol-lora/lora-adapter'

In [22]:
# Optional: merge adapters into the base model for export
def merge_and_save(base_model_dir: str, adapter_dir: str, merged_dir: str, dtype=torch.float16):
    base_model = AutoModelForCausalLM.from_pretrained(base_model_dir, torch_dtype=dtype)
    peft_model = PeftModel.from_pretrained(base_model, adapter_dir)
    merged_model = peft_model.merge_and_unload()
    merged_model.save_pretrained(merged_dir)
    tokenizer.save_pretrained(merged_dir)

merge_and_save(config["base_model"], adapter_dir, os.path.join(config["output_dir"], "merged"))