In [1]:
# pip install transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM-135M"
device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

In [2]:
# Basic greedy generation helper tailored for prompting workflows.
def generate(prompt, max_new_tokens=50, return_full_text=False, **generate_kwargs):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    generate_kwargs = {"do_sample": False, **generate_kwargs}
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, **generate_kwargs)

    if return_full_text:
        return tokenizer.decode(output_ids[0], skip_special_tokens=True)

    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = output_ids[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

In [3]:
generate("Once upon a time")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


', there was a curious little girl named Lily. She loved exploring her backyard and playing with her friends. One day, she noticed that her favorite toy car was missing. She asked her mom, “Mommy, where did my toy car go?”'

In [4]:
from datasets import load_dataset

ds = load_dataset("loresiensis/corpus-en-es")

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['EN', 'ES'],
        num_rows: 9439
    })
    test: Dataset({
        features: ['EN', 'ES'],
        num_rows: 1049
    })
})

In [6]:
# pip install evaluate sacrebleu
import evaluate
from tqdm.auto import tqdm

sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")


def prompt_translate_en_to_es(text, max_new_tokens=120):
    prompt = (
        "Translate the English sentence into Spanish:"
        f"English: {text.strip()}\n"
        "Spanish:"
    )
    return generate(prompt, max_new_tokens=max_new_tokens, do_sample=False).strip()


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [7]:
def evaluate_translation_split(split, max_samples=100):
    if max_samples is not None:
        subset_size = min(len(split), max_samples)
        eval_split = split.select(range(subset_size))
    else:
        subset_size = len(split)
        eval_split = split

    predictions = []
    references = []

    for example in tqdm(eval_split, desc="Translating EN→ES"):
        prediction = prompt_translate_en_to_es(example["EN"])
        predictions.append(prediction)
        references.append([example["ES"]])

    sacrebleu_score = sacrebleu.compute(predictions=predictions, references=references)["score"]
    chrf_score = chrf.compute(predictions=predictions, references=references)["score"]

    return {
        "num_eval_samples": subset_size,
        "sacrebleu": sacrebleu_score,
        "chrf": chrf_score,
    }


In [12]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [13]:
def evaluate_pipeline_translation(split, src="ES", tgt="EN", max_samples=100, batch_size=16):
    """
    Evaluate the existing `pipe` translation pipeline on a dataset split.
    Assumes `pipe`, `sacrebleu`, and `chrf` are already available in the notebook.
    By default evaluates ES -> EN (src="ES", tgt="EN"). Adjust src/tgt as needed.
    """
    if max_samples is not None:
        subset_size = min(len(split), max_samples)
    else:
        subset_size = len(split)

    # Get lists of source texts and references
    src_texts = split[src][:subset_size]
    tgt_texts = split[tgt][:subset_size]

    predictions = []
    # run pipeline in batches for efficiency
    for i in tqdm(range(0, subset_size, batch_size), desc="Translating with pipeline"):
        batch_texts = src_texts[i : i + batch_size]
        outputs = pipe(batch_texts)  # pipeline returns list of dicts with 'translation_text'
        batch_preds = [out["translation_text"].strip() for out in outputs]
        predictions.extend(batch_preds)

    # prepare references in the format expected by the metrics: list[list[str]]
    references = [[r] for r in tgt_texts]

    sacrebleu_score = sacrebleu.compute(predictions=predictions, references=references)["score"]
    chrf_score = chrf.compute(predictions=predictions, references=references)["score"]

    return {
        "num_eval_samples": subset_size,
        "sacrebleu": sacrebleu_score,
        "chrf": chrf_score,
        "example_predictions": predictions[:5],  # a few examples for quick inspection
    }

# Example usage on the test split (adjust max_samples/batch_size as desired)
pipeline_test_metrics = evaluate_pipeline_translation(ds["test"], src="EN", tgt="ES", max_samples=100, batch_size=32)
pipeline_test_metrics

Translating with pipeline:   0%|          | 0/4 [00:00<?, ?it/s]

{'num_eval_samples': 100,
 'sacrebleu': 44.271848996984225,
 'chrf': 66.89272505410729,
 'example_predictions': ['Los ciudadanos han perdido la confianza en los sistemas nacionales y europeos de seguridad alimentaria después de los escándalos y los miedos con la carne de vacuno, E.coli, la listeria, la salmonela, las dioxinas, los huevos, las aves de corral, la leche y las hormonas.',
  'Me limitaré a añadir a las propuestas de la Comisión de Asuntos Jurídicos, junto con la señora Niebler –la ponente alternativa que realmente ha hecho un excelente trabajo, como quedó claro en su intervención de ayer también– una enmienda que me parece especialmente importante, que pretende aclarar la definición de operador, que, tal como está redactada en la actualidad, no permite excluir a los bancos y otras instituciones financieras implicadas de la responsabilidad por los daños medioambientales causados por los operadores financieros.',
  'Algunos de estos derechos ya son "de aplicación legal", es d

In [8]:
test_eval_metrics = evaluate_translation_split(ds["test"], max_samples=100)
test_eval_metrics

Translating EN→ES:   0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for o

{'num_eval_samples': 100,
 'sacrebleu': 0.47824330900929374,
 'chrf': 19.887279314986475}