<a href="https://colab.research.google.com/github/judeavery/CS4372Assignment4/blob/main/Report/CS4372Assignment4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets evaluate sacrebleu


In [None]:
from transformers import pipeline
import requests
import textwrap

# Plain-text book to download from Project Gutenberg (file id 349 in the URL).
# NOTE(review): the evaluation sentences later in this notebook quote Romeo and
# Juliet -- confirm that ebook 349 is the intended source text.
BOOK_URL = "https://www.gutenberg.org/files/349/349-0.txt"
# Hugging Face model identifier handed to pipeline() when the translator is built.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-es"  # English -> Spanish


In [None]:
# Download the book. requests applies no timeout by default, so set one to
# avoid hanging the notebook on a stalled connection.
response = requests.get(BOOK_URL, timeout=30)
# Fail loudly on 4xx/5xx instead of treating an error page as the book text.
response.raise_for_status()
raw_text = response.text
print(raw_text[:1000])  # quick preview


In [None]:
import re

# Project Gutenberg surrounds the book with "*** START OF ... ***" and
# "*** END OF ... ***" banner lines; keep only what lies between them.
# The banner wording may read "THE" or "THIS" depending on the file, so both
# spellings are accepted. The remainder of the START line (title and closing
# asterisks) is consumed as well so it does not leak into the book text.
start_marker = r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK[^\r\n]*"
end_marker = r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK"

start_match = re.search(start_marker, raw_text)
# Only look for the END banner after the START banner.
end_match = (
    re.compile(end_marker).search(raw_text, start_match.end())
    if start_match
    else None
)

start_idx = start_match.end() if start_match else -1
end_idx = end_match.start() if end_match else -1

if start_idx != -1 and end_idx != -1:
    text = raw_text[start_idx:end_idx]
else:
    # Fallback: banners not found, so keep the whole download (licence
    # boilerplate included) rather than guessing where the book starts.
    print("Warning: Project Gutenberg START/END markers not found; using the full text.")
    text = raw_text

text = text.strip()
print(text[:1000])


In [None]:
import re

# Paragraphs are separated by one or more blank lines. Splitting on a regex
# rather than the literal "\n\n" also handles CRLF line endings ("\r\n\r\n")
# and blank lines containing stray whitespace; with a literal "\n\n" a
# CRLF-encoded text would come back as one giant "paragraph".
paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
print(len(paragraphs))
if paragraphs:  # an empty text would make paragraphs[0] raise IndexError
    print(paragraphs[0][:500])


In [None]:
# Build the English -> Spanish translation pipeline once; later cells reuse it.
# (Add device=0 here as well if a GPU runtime is enabled in Colab.)
translator = pipeline("translation_en_to_es", model=MODEL_NAME)

# Smoke test: translate the first 400 characters of the first paragraph.
sample = paragraphs[0][:400]
smoke_output = translator(sample, max_length=256)
translated = smoke_output[0]["translation_text"]
print("EN:", sample, end="\n\n")
print("ES:", translated)


In [None]:
# Hand-written evaluation pairs: (English source, Spanish reference translation).
_TEST_PAIRS = (
    (
        "Two households, both alike in dignity, in fair Verona where we lay our scene.",
        "Dos familias, ambas iguales en dignidad, en la hermosa Verona donde situamos nuestra escena.",
    ),
    (
        "From ancient grudge break to new mutiny, where civil blood makes civil hands unclean.",
        "De un antiguo rencor nace una nueva revuelta, donde la sangre de ciudadanos mancha manos ciudadanas.",
    ),
    (
        "A pair of star-crossed lovers take their life.",
        "Una pareja de amantes malditos por las estrellas se quita la vida.",
    ),
    (
        "O Romeo, Romeo, wherefore art thou Romeo?",
        "¡Oh Romeo, Romeo! ¿Por qué eres tú Romeo?",
    ),
    (
        "That which we call a rose by any other name would smell as sweet.",
        "Lo que llamamos rosa, con cualquier otro nombre, olería igual de dulce.",
    ),
    (
        "My only love sprung from my only hate.",
        "Mi único amor ha nacido de mi único odio.",
    ),
    (
        "Good night, good night! Parting is such sweet sorrow.",
        "¡Buenas noches, buenas noches! Separarnos es una pena tan dulce.",
    ),
    (
        "These violent delights have violent ends.",
        "Estos placeres violentos tienen finales violentos.",
    ),
    (
        "Thus with a kiss I die.",
        "Así, con un beso, muero.",
    ),
    (
        "For never was a story of more woe than this of Juliet and her Romeo.",
        "Nunca hubo historia de mayor desgracia que la de Julieta y su Romeo.",
    ),
)

# Each item is a dict with the source under "english" and the reference under
# "spanish_ref".
test_set = [
    {"english": source, "spanish_ref": reference}
    for source, reference in _TEST_PAIRS
]
def run_translation_eval(translator, test_set, max_length=256):
    """Translate every item of *test_set* and store the result under "pred".

    Args:
        translator: Callable invoked as ``translator(text, **kwargs)`` that
            returns a sequence whose first element is a mapping with a
            "translation_text" entry.
        test_set: Iterable of dicts, each holding the source sentence under
            "english". The dicts are updated in place, so pass copies if the
            originals must stay untouched.
        max_length: Value forwarded to every translator call as
            ``max_length``. Pass ``None`` to forward nothing, so that a
            length limit configured on the translator itself is not
            overridden by this call-time argument.

    Returns:
        The same *test_set* object, with "pred" set on each item.
    """
    call_kwargs = {} if max_length is None else {"max_length": max_length}
    for sample in test_set:
        outputs = translator(sample["english"], **call_kwargs)
        sample["pred"] = outputs[0]["translation_text"]
    return test_set
# Translate shallow copies of the test items so test_set itself gains no "pred" keys.
test_default = run_translation_eval(translator, list(map(dict, test_set)))


In [None]:
import sacrebleu

def compute_bleu(test_data):
    """Score the predictions in *test_data* with sacrebleu's corpus BLEU.

    Each item must provide the system output under "pred" and the reference
    under "spanish_ref". The score is printed and also returned.
    """
    hypotheses = []
    for item in test_data:
        hypotheses.append(item["pred"])

    reference_stream = []
    for item in test_data:
        reference_stream.append(item["spanish_ref"])

    # corpus_bleu takes a list of reference streams; there is exactly one here.
    result = sacrebleu.corpus_bleu(hypotheses, [reference_stream])
    print("BLEU score:", result.score)
    return result.score

# Baseline: translate fresh shallow copies of the test items, then score them.
test_default = run_translation_eval(
    translator,
    [{**item} for item in test_set],
)
bleu_default = compute_bleu(test_default)


In [None]:
# Compare generation settings; every configuration is scored on its own
# shallow copy of the same test items.

# 1) Default
translator_default = pipeline("translation_en_to_es", model=MODEL_NAME)
test_default = run_translation_eval(translator_default, [dict(s) for s in test_set])
bleu_default = compute_bleu(test_default)

# 2) Beam search (4 beams)
translator_beam4 = pipeline("translation_en_to_es", model=MODEL_NAME, num_beams=4)
test_beam4 = run_translation_eval(translator_beam4, [dict(s) for s in test_set])
bleu_beam4 = compute_bleu(test_beam4)

# 3) Shorter max_length (128)
translator_short = pipeline("translation_en_to_es", model=MODEL_NAME, max_length=128)


def _translate_short(text, **call_kwargs):
    """Call translator_short with max_length pinned to 128."""
    # run_translation_eval sends max_length=256 with every call, and call-time
    # generation arguments take precedence over those given to pipeline().
    # Without this override the "short" run would use 256 and be
    # indistinguishable from the default run.
    call_kwargs["max_length"] = 128
    return translator_short(text, **call_kwargs)


test_short = run_translation_eval(_translate_short, [dict(s) for s in test_set])
bleu_short = compute_bleu(test_short)

print("BLEU_default:", bleu_default)
print("BLEU_beam4:", bleu_beam4)
print("BLEU_short:", bleu_short)


In [None]:
def print_examples(test_data, n=5):
    """Print the first *n* items of *test_data* for side-by-side inspection.

    For each item a numbered header is printed, followed by the English
    source, the Spanish reference and the model prediction, then a blank line.
    """
    labelled_keys = (
        ("EN :", "english"),
        ("REF:", "spanish_ref"),
        ("PRED:", "pred"),
    )
    for number, entry in enumerate(test_data[:n], 1):
        print(f"=== Example {number} ===")
        for label, key in labelled_keys:
            print(label, entry[key])
        print()

# Show the first five beam-search translations next to their references.
print_examples(n=5, test_data=test_beam4)
