In [2]:
!pip install -q transformers datasets evaluate sacrebleu


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from transformers import pipeline
import requests
import textwrap

BOOK_URL = "https://www.gutenberg.org/files/349/349-0.txt"
MODEL_NAME = "Helsinki-NLP/opus-mt-en-es"  # English -> Spanish


In [4]:
response = requests.get(BOOK_URL)
raw_text = response.text
print(raw_text[:1000])  # quick preview


﻿The Project Gutenberg eBook of The Harvester, by Gene Stratton-Porter

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not located in the United States, you
will have to check the laws of the country where you are located before
using this eBook.

Title: The Harvester

Author: Gene Stratton-Porter

Release Date: October, 1995 [eBook #349]
[Most recently updated: March 17, 2023]

Language: English

Produced by: Charles Keller and David Widger

*** START OF THE PROJECT GUTENBERG EBOOK THE HARVESTER ***




THE HARVESTER

By Gene Stratton-Porter


Author Of A Girl Of The Limberlost, Freckles, Etc.



                        THIS PORTION
               OF THE LIFE OF A MAN OF TO-DAY
       

In [5]:
start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK"

start_idx = raw_text.find(start_marker)
end_idx = raw_text.find(end_marker)

if start_idx != -1 and end_idx != -1:
    text = raw_text[start_idx + len(start_marker):end_idx]
else:
    # fallback: just drop first/last few thousand chars if markers change
    text = raw_text

text = text.strip()
print(text[:1000])


﻿The Project Gutenberg eBook of The Harvester, by Gene Stratton-Porter

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not located in the United States, you
will have to check the laws of the country where you are located before
using this eBook.

Title: The Harvester

Author: Gene Stratton-Porter

Release Date: October, 1995 [eBook #349]
[Most recently updated: March 17, 2023]

Language: English

Produced by: Charles Keller and David Widger

*** START OF THE PROJECT GUTENBERG EBOOK THE HARVESTER ***




THE HARVESTER

By Gene Stratton-Porter


Author Of A Girl Of The Limberlost, Freckles, Etc.



                        THIS PORTION
               OF THE LIFE OF A MAN OF TO-DAY
       

In [6]:
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
print(len(paragraphs))
print(paragraphs[0][:500])


1
﻿The Project Gutenberg eBook of The Harvester, by Gene Stratton-Porter

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not located in the United States, you
will have to check the laws of the country where you are located bef


In [7]:
translator = pipeline(
    task="translation_en_to_es",
    model=MODEL_NAME
    # you can later add: device=0 if you turn on GPU in Colab
)

# quick smoke test
sample = paragraphs[0][:400]
print("EN:", sample)
print()
translated = translator(sample, max_length=256)[0]["translation_text"]
print("ES:", translated)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cpu


EN: ﻿The Project Gutenberg eBook of The Harvester, by Gene Stratton-Porter

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not loc

ES: El libro electrónico del proyecto Gutenberg de la cosechadora, por Gene Stratton-Porter Este libro electrónico es para el uso de cualquier persona en cualquier parte de los Estados Unidos y la mayoría de otras partes del mundo sin costo y con casi ninguna restricción. Usted puede copiarlo, regalarlo o reutilizarlo bajo los términos de la licencia del proyecto Gutenberg incluido con este libro electrónico o en línea en www.gutenberg.org.


In [8]:
test_set = [
    {
        "english": "Two households, both alike in dignity, in fair Verona where we lay our scene.",
        "spanish_ref": "Dos familias, ambas iguales en dignidad, en la hermosa Verona donde situamos nuestra escena."
    },
    {
        "english": "From ancient grudge break to new mutiny, where civil blood makes civil hands unclean.",
        "spanish_ref": "De un antiguo rencor nace una nueva revuelta, donde la sangre de ciudadanos mancha manos ciudadanas."
    },
    {
        "english": "A pair of star-crossed lovers take their life.",
        "spanish_ref": "Una pareja de amantes malditos por las estrellas se quita la vida."
    },
    {
        "english": "O Romeo, Romeo, wherefore art thou Romeo?",
        "spanish_ref": "¡Oh Romeo, Romeo! ¿Por qué eres tú Romeo?"
    },
    {
        "english": "That which we call a rose by any other name would smell as sweet.",
        "spanish_ref": "Lo que llamamos rosa, con cualquier otro nombre, olería igual de dulce."
    },
    {
        "english": "My only love sprung from my only hate.",
        "spanish_ref": "Mi único amor ha nacido de mi único odio."
    },
    {
        "english": "Good night, good night! Parting is such sweet sorrow.",
        "spanish_ref": "¡Buenas noches, buenas noches! Separarnos es una pena tan dulce."
    },
    {
        "english": "These violent delights have violent ends.",
        "spanish_ref": "Estos placeres violentos tienen finales violentos."
    },
    {
        "english": "Thus with a kiss I die.",
        "spanish_ref": "Así, con un beso, muero."
    },
    {
        "english": "For never was a story of more woe than this of Juliet and her Romeo.",
        "spanish_ref": "Nunca hubo historia de mayor desgracia que la de Julieta y su Romeo."
    }
]
def run_translation_eval(translator, test_set):
    for sample in test_set:
        pred = translator(
            sample["english"],
            max_length=256
        )[0]["translation_text"]
        sample["pred"] = pred
    return test_set
test_default = run_translation_eval(translator, [dict(s) for s in test_set])


In [9]:
import sacrebleu

def compute_bleu(test_data):
    preds = [s["pred"] for s in test_data]
    refs = [[s["spanish_ref"] for s in test_data]]  # list-of-lists
    bleu = sacrebleu.corpus_bleu(preds, refs)
    print("BLEU score:", bleu.score)
    return bleu.score

test_default = run_translation_eval(translator, [dict(s) for s in test_set])
bleu_default = compute_bleu(test_default)


BLEU score: 32.139800288020595


In [10]:
# 1) Default
translator_default = pipeline("translation_en_to_es", model=MODEL_NAME)
test_default = run_translation_eval(translator_default, [dict(s) for s in test_set])
bleu_default = compute_bleu(test_default)

# 2) Beam search (4 beams)
translator_beam4 = pipeline("translation_en_to_es", model=MODEL_NAME, num_beams=4)
test_beam4 = run_translation_eval(translator_beam4, [dict(s) for s in test_set])
bleu_beam4 = compute_bleu(test_beam4)

# 3) Shorter max_length (128)
translator_short = pipeline("translation_en_to_es", model=MODEL_NAME, max_length=128)
test_short = run_translation_eval(translator_short, [dict(s) for s in test_set])
bleu_short = compute_bleu(test_short)

print("BLEU_default:", bleu_default)
print("BLEU_beam4:", bleu_beam4)
print("BLEU_short:", bleu_short)


Device set to use cpu


BLEU score: 32.139800288020595


Device set to use cpu


BLEU score: 32.139800288020595


Device set to use cpu


BLEU score: 32.139800288020595
BLEU_default: 32.139800288020595
BLEU_beam4: 32.139800288020595
BLEU_short: 32.139800288020595


In [11]:
def print_examples(test_data, n=5):
    for i, s in enumerate(test_data[:n], start=1):
        print(f"=== Example {i} ===")
        print("EN :", s["english"])
        print("REF:", s["spanish_ref"])
        print("PRED:", s["pred"])
        print()

print_examples(test_beam4, n=5)


=== Example 1 ===
EN : Two households, both alike in dignity, in fair Verona where we lay our scene.
REF: Dos familias, ambas iguales en dignidad, en la hermosa Verona donde situamos nuestra escena.
PRED: Dos hogares, ambos iguales en dignidad, en la bella Verona donde ponemos nuestra escena.

=== Example 2 ===
EN : From ancient grudge break to new mutiny, where civil blood makes civil hands unclean.
REF: De un antiguo rencor nace una nueva revuelta, donde la sangre de ciudadanos mancha manos ciudadanas.
PRED: Desde el rencor antiguo hasta el motín nuevo, donde la sangre civil contamina las manos civiles.

=== Example 3 ===
EN : A pair of star-crossed lovers take their life.
REF: Una pareja de amantes malditos por las estrellas se quita la vida.
PRED: Un par de amantes cruzados por estrellas se quitan la vida.

=== Example 4 ===
EN : O Romeo, Romeo, wherefore art thou Romeo?
REF: ¡Oh Romeo, Romeo! ¿Por qué eres tú Romeo?
PRED: Romeo, Romeo, ¿por qué eres Romeo?

=== Example 5 ===
EN : 