<a href="https://colab.research.google.com/github/jhryals/el-roi-intelligence-triage-system/blob/main/processing/language_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# ==============================================================
# 📂 Google Drive Integration for EL ROI
# ==============================================================

from google.colab import drive
import os

# Project locations inside the mounted Google Drive
PROJECT_PATH = "/content/drive/MyDrive/el-roi"
DATA_PATH = os.path.join(PROJECT_PATH, "data")

# The Drive must be mounted before anything under /content/drive is touched
drive.mount('/content/drive')

# makedirs also creates PROJECT_PATH when it is missing; exist_ok keeps re-runs harmless
os.makedirs(DATA_PATH, exist_ok=True)

# Report the resolved locations, one per line
print(
    f"‚úÖ Project directory set to: {PROJECT_PATH}",
    f"‚úÖ Data directory set to: {DATA_PATH}",
    sep="\n",
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Project directory set to: /content/drive/MyDrive/el-roi
‚úÖ Data directory set to: /content/drive/MyDrive/el-roi/data


In [16]:
# ==============================================================
# 📦 MODULE SETUP: Install Required Packages
# ==============================================================
# This cell must be run before any other code in the notebook.
# Colab resets its environment on new sessions, so packages
# installed here will only persist for the current runtime.

!pip install langdetect googletrans==4.0.0-rc1 google-cloud-translate==3.11.3



In [17]:
# --------------------------------------------------------------
# MODULE 2: Language Detection + Translation (Configurable)
# --------------------------------------------------------------

import os
import pandas as pd
from langdetect import detect, DetectorFactory

# Ensure langdetect is deterministic: pin the detector seed once, before any
# call to detect(), so repeated runs on the same text give the same code.
DetectorFactory.seed = 0

# --------------------------------------------------------------
# CONFIGURATION
# --------------------------------------------------------------
# Translation backend selected for process_language_pipeline:
#   "free"   -> unofficial googletrans client
#   "gcloud" -> official Google Cloud Translation API (needs the two settings below)
# Any other value leaves summaries untranslated.
TRANSLATION_MODE = "free"   # Options: "free", "gcloud"
GCLOUD_PROJECT_ID = "your-google-cloud-project-id"  # Required for gcloud mode
GCLOUD_CREDENTIALS_PATH = "/path/to/your/service-account.json"  # Required for gcloud mode

# If using Google Cloud, set credentials
if TRANSLATION_MODE == "gcloud":
    # Fail here with a clear message rather than later, when the client is
    # created, with a harder-to-read authentication error.
    if not os.path.isfile(GCLOUD_CREDENTIALS_PATH):
        raise FileNotFoundError(
            f"Service-account key not found at {GCLOUD_CREDENTIALS_PATH!r}. "
            "Set GCLOUD_CREDENTIALS_PATH to a valid JSON key file before using gcloud mode."
        )
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCLOUD_CREDENTIALS_PATH

# --------------------------------------------------------------
# FREE MODE: googletrans
# --------------------------------------------------------------
if TRANSLATION_MODE == "free":
    from googletrans import Translator
    free_translator = Translator()

    def translate_text_free(text, target_language="en"):
        """Translate text using free googletrans (unofficial).

        Args:
            text: Text to translate. Non-string values (for example None or
                NaN from a missing DataFrame cell) and blank strings are
                returned unchanged without calling the translator.
            target_language: Language code passed to googletrans as ``dest``.
                Defaults to "en".

        Returns:
            The translated string, or ``text`` unchanged when it is not a
            non-blank string or when the translation call raises.
        """
        # The type check must come first: calling .strip() on None/NaN would
        # raise outside the try block and abort the whole run.
        if not isinstance(text, str) or not text.strip():
            return text
        try:
            return free_translator.translate(text, dest=target_language).text
        except Exception as e:
            # Best effort: report the failure and fall back to the original text.
            print(f"‚ö†Ô∏è Free translation failed: {e}")
            return text

# --------------------------------------------------------------
# GCLOUD MODE (demo): Google Cloud Translate
# --------------------------------------------------------------
if TRANSLATION_MODE == "gcloud":
    from google.cloud import translate

    gcloud_client = translate.TranslationServiceClient()

    def translate_text_gcloud(text, target_language="en"):
        """Translate text using official Google Cloud Translation API.

        Args:
            text: Text to translate. Non-string values (for example None or
                NaN from a missing DataFrame cell) and blank strings are
                returned unchanged without calling the API.
            target_language: Value sent as ``target_language_code``.
                Defaults to "en".

        Returns:
            The ``translated_text`` of the first translation in the response,
            or ``text`` unchanged when it is not a non-blank string or when
            the request raises.
        """
        # The type check must come first: calling .strip() on None/NaN would
        # raise outside the try block and abort the whole run.
        if not isinstance(text, str) or not text.strip():
            return text
        try:
            response = gcloud_client.translate_text(
                request={
                    "parent": f"projects/{GCLOUD_PROJECT_ID}/locations/global",
                    "contents": [text],
                    "mime_type": "text/plain",
                    "target_language_code": target_language
                }
            )
            # One input string was sent, so only the first translation is read.
            return response.translations[0].translated_text
        except Exception as e:
            # Best effort: report the failure and fall back to the original text.
            print(f"‚ö†Ô∏è Google Cloud translation failed: {e}")
            return text

# --------------------------------------------------------------
# LANGUAGE DETECTION
# --------------------------------------------------------------
def detect_language(text):
    """Detect the language code (ISO 639-1) for the given text.

    Args:
        text: Text to classify.

    Returns:
        The code returned by langdetect's ``detect``, or "unknown" when
        ``text`` is not a non-blank string or when detection raises.
    """
    # Nothing to classify: skip the detector for missing or blank input.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Narrower than a bare except, so KeyboardInterrupt and SystemExit
        # still propagate; any detector failure is reported as "unknown".
        return "unknown"

# --------------------------------------------------------------
# MAIN PROCESSING PIPELINE
# --------------------------------------------------------------
def process_language_pipeline(df):
    """
    Detects language and translates article summaries into English.
    Adds 'lang' and 'translated_summary' columns to DataFrame.

    Args:
        df: DataFrame of articles. The text is read from its "summary"
            column; if that column is absent every row is treated as
            having no summary.

    Returns:
        The same DataFrame object, modified in place, with two new columns:
        'lang' (detected code, or "unknown") and 'translated_summary'.
        Rows whose summary is missing, empty, or not a string get
        "unknown" and "".
    """
    if "summary" in df.columns:
        summaries = df["summary"].tolist()
    else:
        summaries = [""] * len(df)

    langs = []
    translations = []

    for summary_text in summaries:
        # Missing cells arrive as None or NaN. NaN is truthy, so a plain
        # truthiness test would let it through to the translator.
        if not isinstance(summary_text, str) or not summary_text:
            langs.append("unknown")
            translations.append("")
            continue

        # Detect language
        langs.append(detect_language(summary_text))

        # Translate based on mode
        if TRANSLATION_MODE == "free":
            translated_text = translate_text_free(summary_text, "en")
        elif TRANSLATION_MODE == "gcloud":
            translated_text = translate_text_gcloud(summary_text, "en")
        else:
            translated_text = summary_text  # No translation

        translations.append(translated_text)

    df["lang"] = langs
    df["translated_summary"] = translations
    return df

import os

# Ensure the project data directory exists. This must be DATA_PATH: a bare
# "data" would create a stray folder relative to the current working directory.
os.makedirs(DATA_PATH, exist_ok=True)

# Load raw articles from ingestion step
raw_articles_path = os.path.join(DATA_PATH, "raw_articles.jsonl")
if not os.path.exists(raw_articles_path):
    raise FileNotFoundError(f"‚ùå Could not find {raw_articles_path}. Please run rss_ingestion.ipynb first.")

# One JSON object per line (JSON Lines)
df_articles = pd.read_json(raw_articles_path, lines=True)

# Process language detection + translation.
# The pipeline adds its columns in place and returns the same object, so
# df_translated and df_articles refer to the same DataFrame afterwards.
df_translated = process_language_pipeline(df_articles)

# Save processed dataset for downstream modules.
# force_ascii=False writes non-ASCII characters as-is instead of \u escapes.
translated_articles_path = os.path.join(DATA_PATH, "articles_translated.jsonl")
df_translated.to_json(translated_articles_path, orient="records", lines=True, force_ascii=False)

print("‚úÖ Language detection + translation complete.")
print(f"‚úÖ Saved translated articles to {translated_articles_path}")
#df_translated.head()



‚úÖ Language detection + translation complete.
‚úÖ Saved translated articles to /content/drive/MyDrive/el-roi/data/articles_translated.jsonl


Unnamed: 0,source,title,link,published,summary,lang,translated_summary
0,El Pa√≠s,Trump apura el plazo que dio a Putin para dete...,https://elpais.com/internacional/2025-08-06/el...,1754498000000.0,El enviado de la Casa Blanca se ve con Putin e...,es,The White House envoy is seen with Putin in Mo...
1,El Pa√≠s,Estados Unidos dobla los aranceles a la India ...,https://elpais.com/internacional/2025-08-06/ee...,1754491000000.0,Las exportaciones indias estar√°n sujetas a un ...,es,Indian exports will be subject to a final tax ...
2,El Pa√≠s,La detenci√≥n de Bolsonaro tensa las negociacio...,https://elpais.com/america/2025-08-06/la-deten...,1754452000000.0,El Gobierno de Lula se plantea incluir la expo...,es,The Lula Government considers to include the e...
3,El Pa√≠s,La sumisi√≥n de Bruselas a Trump da alas a la e...,https://elpais.com/internacional/2025-08-03/la...,1754192000000.0,Los aliados del estadounidense en Europa logra...,es,American allies in Europe manage to strengthen...
4,El Pa√≠s,Trump ‚Äòhackea‚Äô el sistema econ√≥mico internacional,https://elpais.com/internacional/2025-08-01/tr...,1754066000000.0,La subida de barreras al comercio m√°s intensa ...,es,The rise of barriers to the most intense trade...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

