# Automated Translation of Spanish HTML Web Pages to English Using Hugging Face MarianMT, M2M100, and BeautifulSoup

## Introduction
- Fetch an HTML page from a given URL.
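- Translate the page from Spanish (es) to English (en) using three model configurations:
  - MarianMT (`Helsinki-NLP/opus-mt-es-en`)
  - Seq2Seq (LSTM Approximation) — this entry loads the same MarianMT checkpoint as a stand-in; no LSTM model is actually used
  - GNMT Approximation (M2M100, `facebook/m2m100_418M`)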
- Save the translated pages locally.

We use the Hugging Face `transformers` library and `BeautifulSoup` for HTML parsing.

##  Install Necessary Packages

In [None]:
!pip install transformers sentencepiece torch beautifulsoup4 requests

##  Import Libraries

In [None]:
from bs4 import BeautifulSoup
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
)
import requests
import os
import time

##  Define the Models

In [None]:
# Hugging Face checkpoint identifiers for the two underlying models.
_MARIAN_CHECKPOINT = "Helsinki-NLP/opus-mt-es-en"
_M2M100_CHECKPOINT = "facebook/m2m100_418M"

# The "MarianMT" and "Seq2Seq (LSTM Approx)" entries use the same checkpoint,
# so it is loaded once and shared instead of being held in memory twice.
_marian_tokenizer = MarianTokenizer.from_pretrained(_MARIAN_CHECKPOINT)
_marian_model = MarianMTModel.from_pretrained(_MARIAN_CHECKPOINT)

# Maps a display name to the tokenizer/model pair and the language codes
# passed to the translation functions.
models = {
    "MarianMT": {
        "tokenizer": _marian_tokenizer,
        "model": _marian_model,
        "source_lang": "es",
        "target_lang": "en",
    },
    "Seq2Seq (LSTM Approx)": {
        "tokenizer": _marian_tokenizer,
        "model": _marian_model,
        "source_lang": "es",
        "target_lang": "en",
    },
    "GNMT (Approx - M2M100)": {
        "tokenizer": M2M100Tokenizer.from_pretrained(_M2M100_CHECKPOINT),
        "model": M2M100ForConditionalGeneration.from_pretrained(_M2M100_CHECKPOINT),
        "source_lang": "es",
        "target_lang": "en",
    },
}

## Define Translation Functions

In [None]:
def translate_text(text, tokenizer, model, source_lang, target_lang):
    """Translate a single piece of text with a MarianMT or M2M100 model.

    Args:
        text: The string to translate. Falsy or whitespace-only values are
            returned unchanged.
        tokenizer: The tokenizer that belongs to ``model``.
        model: A ``MarianMTModel`` or ``M2M100ForConditionalGeneration``.
        source_lang: Source language code, assigned to ``tokenizer.src_lang``.
        target_lang: Target language code; used by the M2M100 branch to pick
            the forced beginning-of-sequence token.

    Returns:
        The translated text with the input's leading and trailing whitespace
        re-attached, or ``text`` unchanged when it is empty/blank or the model
        type is not one of the two supported classes.
    """
    if not text or not text.strip():
        return text

    # Keep the surrounding whitespace aside: the decoded translation is not
    # guaranteed to retain it, and in HTML it is what separates this text from
    # neighbouring inline elements.
    core = text.strip()
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]

    if isinstance(model, MarianMTModel):
        tokenizer.src_lang = source_lang
        inputs = tokenizer(core, return_tensors="pt", padding=True, truncation=True)
        output_ids = model.generate(**inputs)
    elif isinstance(model, M2M100ForConditionalGeneration):
        tokenizer.src_lang = source_lang
        # truncation=True mirrors the Marian branch so over-long text is cut
        # to the tokenizer's maximum length instead of being passed through.
        inputs = tokenizer(core, return_tensors="pt", truncation=True)
        output_ids = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.get_lang_id(target_lang),
        )
    else:
        # Unsupported model type: leave the text untouched.
        return text

    translated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return leading + translated + trailing

def translate_html(html, tokenizer, model, source_lang, target_lang):
    """Translate the visible text nodes of an HTML document.

    The document is parsed with BeautifulSoup's ``html.parser``; each text
    node is passed to ``translate_text`` and replaced in place, so the tag
    structure and attributes are kept.

    Args:
        html: The HTML markup to translate.
        tokenizer: Tokenizer forwarded to ``translate_text``.
        model: Model forwarded to ``translate_text``.
        source_lang: Source language code forwarded to ``translate_text``.
        target_lang: Target language code forwarded to ``translate_text``.

    Returns:
        The translated document serialized back to a string.
    """
    # Local import: only BeautifulSoup is imported from bs4 at module level.
    from bs4.element import PreformattedString

    soup = BeautifulSoup(html, "html.parser")
    # find_all returns a list, so replacing nodes inside the loop is safe.
    for element in soup.find_all(string=True):
        # Comments, doctypes, CDATA sections, declarations and processing
        # instructions are PreformattedString subclasses. Replacing them with
        # a plain string would turn e.g. <!DOCTYPE html> into visible text.
        if isinstance(element, PreformattedString):
            continue
        if element.parent.name in ["script", "style"]:
            continue
        # Whitespace-only nodes carry nothing to translate.
        if not element.strip():
            continue
        translated = translate_text(element, tokenizer, model, source_lang, target_lang)
        element.replace_with(translated)
    return str(soup)

##  Fetch and Translate the Webpage

In [None]:
if __name__ == "__main__":
    # Prompt for the page, download it once, then translate it with every
    # configured model and write one output file per model.
    url = input("Enter a URL to translate: ").strip()

    # A timeout prevents an unresponsive server from hanging the run, and
    # raise_for_status stops us from translating an HTTP error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Respect a charset declared by the server; only when none is declared,
    # fall back to the encoding detected from the body. Forcing UTF-8
    # unconditionally garbles pages served in other encodings.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding
    html_content = response.text

    for model_name, components in models.items():
        print(f"\n=== Translating using {model_name} ===")
        # perf_counter is a monotonic clock intended for measuring durations.
        start_time = time.perf_counter()

        english_html = translate_html(
            html_content,
            components["tokenizer"],
            components["model"],
            components["source_lang"],
            components["target_lang"],
        )

        elapsed = time.perf_counter() - start_time

        # Build a file name from the model's display name: spaces become
        # underscores and parentheses are dropped.
        filename = f"translated_page_en_{model_name.replace(' ', '_').replace('(', '').replace(')', '')}.html"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(english_html)

        print(f"{model_name} translation complete! Saved to {os.path.abspath(filename)}")
        print(f"Time taken: {elapsed:.2f} seconds")

##  Conclusion
- Applied different transformer models to real-world HTML translation.
- Compared time taken for translations.
- Saved outputs for inspection.
