In [12]:
from sacrebleu.metrics import BLEU
from bert_score import BERTScorer
from nltk.translate.meteor_score import meteor_score
from nltk.translate.chrf_score import sentence_chrf

In [None]:
import nltk
nltk.download('wordnet')

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import boto3
from botocore.exceptions import ClientError

def translate(text, target_language="Spanish"):
    """
    Translate text using AWS Bedrock Converse API
    
    Args:
        text (str): Text to translate
        target_language (str): Target language for translation
    
    Returns:
        str: Translated text
    """
    # Initialize the Bedrock Runtime client
    client = boto3.client("bedrock-runtime")
    
    # Set the model ID
    model_id = "openai.gpt-oss-20b-1:0"
    
    # Set up messages for translation
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "text": f"Translate the following text to {target_language}:\n\n{text}"
                }
            ]
        }
    ]
    
    system = [
        {
            "text": "You are a professional translator. Provide only the translated text without any additional explanation or commentary."
        }
    ]
    
    try:
        # Send the message to the model
        response = client.converse(
            modelId=model_id,
            messages=messages,
            system=system,
            inferenceConfig={
                "maxTokens": 500, 
                "temperature": 0.3,  # Lower temperature for more consistent translations
                "topP": 0.9
            },
        )
        
        # Extract the translated text
        translated_text = ""
        for content_block in response["output"]["message"]["content"]:
            if "text" in content_block:
                translated_text += content_block["text"]
        
        return translated_text.strip()
        
    except (ClientError, Exception) as e:
        print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
        return None

In [None]:
def evaluate_translation(references, translations):
    reference_text = references[0] 
    translated_text = translations[0]

    print(f"Reference: {reference_text}")
    print(f"Translation: {translated_text}")
    print()
    # BLEU Score - expects list of sentences, but for single paragraph just use as-is
    bleu_score = BLEU().corpus_score([translated_text], [[reference_text]]).score
    print(f"BLEU Score: {bleu_score}")

    # BERTScore - works directly with strings
    scorer = BERTScorer(model_type='bert-base-uncased', lang='en')
    P, R, F1 = scorer.score([translated_text], [reference_text])
    bert_f1 = F1.mean().item()
    print(f"BERTScore F1: {bert_f1}")

    # METEOR Score - needs word tokens (split by spaces)
    translated_tokens = translated_text.split()  # ['The', 'cat', 'is', 'on', 'the', 'rug.']
    reference_tokens = [reference_text.split()]  # [['The', 'cat', 'is', 'on', 'the', 'mat.']]
    meteor_result = meteor_score(reference_tokens, translated_tokens)
    print(f"METEOR Score: {meteor_result}")

    # ChrF Score - works directly with strings (character n-grams)
    chrf_result = sentence_chrf(
        reference=reference_text,
        hypothesis=translated_text,
        min_len=1,
        max_len=6, 
        beta=3.0
    )
    print(f"ChrF Score: {chrf_result}")

    return {
        "BLEU": bleu_score,
        "BERTScore_F1": bert_f1,
        "METEOR": meteor_result,
        "ChrF": chrf_result
    }

In [5]:
# Load a real translation dataset for evaluation
from datasets import load_dataset

def load_translation_data(language_pair="de", dataset_name="wmt14", split="test", max_samples=10):
    """
    Load translation dataset for evaluation
    
    Args:
        language_pair (str): Target language (de, fr, es, etc.)
        dataset_name (str): Dataset to use (wmt14, opus_books, etc.)
        split (str): Dataset split (test, validation)
        max_samples (int): Number of samples to load
    
    Returns:
        tuple: (source_texts, reference_translations)
    """
    try:
        # Load WMT14 EN->DE as example
        if dataset_name == "wmt14":
            dataset = load_dataset("wmt14", f"de-en", split=split)
        elif dataset_name == "opus_books":
            dataset = load_dataset("opus_books", f"en-{language_pair}", split="train")
        
        # Extract English and target language texts
        source_texts = []
        reference_translations = []
        
        for i, example in enumerate(dataset):
            if i >= max_samples:
                break
                
            if dataset_name == "wmt14":
                source_texts.append(example['translation']['en'])
                reference_translations.append(example['translation']['de'])
            elif dataset_name == "opus_books":
                source_texts.append(example['translation']['en'])
                reference_translations.append(example['translation'][language_pair])
        
        return source_texts, reference_translations
        
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return [], []

# Example usage - load sample data
print("Loading translation dataset...")
english_texts, german_references = load_translation_data(language_pair="de", max_samples=3)

for i, (en, de) in enumerate(zip(english_texts, german_references)):
    print(f"\nSample {i+1}:")
    print(f"English: {en}")
    print(f"German Reference: {de}")
    print("-" * 50)

Loading translation dataset...

Sample 1:
English: Gutach: Increased safety for pedestrians
German Reference: Gutach: Noch mehr Sicherheit für Fußgänger
--------------------------------------------------

Sample 2:
English: They are not even 100 metres apart: On Tuesday, the new B 33 pedestrian lights in Dorfparkplatz in Gutach became operational - within view of the existing Town Hall traffic lights.
German Reference: Sie stehen keine 100 Meter voneinander entfernt: Am Dienstag ist in Gutach die neue B 33-Fußgängerampel am Dorfparkplatz in Betrieb genommen worden - in Sichtweite der älteren Rathausampel.
--------------------------------------------------

Sample 3:
English: Two sets of lights so close to one another: intentional or just a silly error?
German Reference: Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?
--------------------------------------------------


In [8]:
# run translation to get preds
preds = [translate(text, "German") for text in english_texts]

In [9]:
preds

['Gutach: Erhöhte Sicherheit für Fußgänger',
 'Sie sind nicht einmal 100\u202fMeter voneinander entfernt: Am Dienstag wurden die neuen B\u202f33‑Fußgängerampeln im Dorfparkplatz in Gutach in Betrieb genommen – im Blickfeld der bestehenden Rathausampeln.',
 'Zwei Lichtgruppen so nah beieinander: absichtlich oder einfach ein dummer Fehler?']

In [14]:
# run eval on preds vs refs
evaluate_translation(german_references, preds)

Reference: Gutach: Noch mehr Sicherheit für Fußgänger
Translation: Gutach: Erhöhte Sicherheit für Fußgänger

BLEU Score: 32.159351091190125
BERTScore F1: 0.9155959486961365
METEOR Score: 0.635593220338983
ChrF Score: 0.7125176153510376


{'BLEU': 32.159351091190125,
 'BERTScore_F1': 0.9155959486961365,
 'METEOR': 0.635593220338983,
 'ChrF': 0.7125176153510376}