# XLSum Translation with Google Translate

In [1]:
from dotenv import load_dotenv
import requests
import os
import pandas as pd
import time

# load_dotenv() (python-dotenv) populates the process environment before the
# key is looked up; google_key is None when GOOGLE_KEY is not set.
load_dotenv()
google_key = os.environ.get("GOOGLE_KEY")

In [2]:
# Sanity check: report whether a non-empty key is present without echoing the
# secret itself.
print("Google Key loaded:", bool(google_key))

Google Key loaded: True


In [3]:
def google_translate(source_texts: list, key: str) -> list:
    """Translate a batch of texts with the Google Cloud Translation v2 REST API.

    The target language is fixed to ``"tl"`` (the ISO 639-1 code for Tagalog)
    and the payload is sent as plain text (``"format": "text"``).

    Args:
        source_texts: The strings to translate in a single request.
        key: Google API key, sent as the ``key`` query parameter.

    Returns:
        The ``translatedText`` values from the response, in response order.
        An empty input yields an empty list without contacting the API.

    Raises:
        RuntimeError: The API answered with an error that is not a rate limit.
        Exception: The request was still rate limited after all retries, the
            response body was not JSON, or a 200 response did not have the
            expected ``data.translations`` structure (the underlying error is
            re-raised in the latter two cases).
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not source_texts:
        return []

    url = "https://translation.googleapis.com/language/translate/v2"
    payload = {
        "q": source_texts,
        "target": "tl",
        "format": "text",
    }
    params = {"key": key}

    max_retries = 20
    for attempt in range(max_retries):
        response = requests.post(url, params=params, json=payload)

        try:
            response_json = response.json()
        except Exception:
            print("Non-JSON response:")
            print(response.text[:500])
            raise

        if response.status_code == 200:
            try:
                return [
                    item["translatedText"]
                    for item in response_json["data"]["translations"]
                ]
            except Exception as e:
                print("Unexpected response structure:", e)
                print(response_json)
                raise

        # Throttling shows up either as HTTP 429 or as a 403 whose body
        # mentions a rate limit (e.g. "User Rate Limit Exceeded").
        rate_limited = response.status_code == 429 or (
            response.status_code == 403
            and "rate limit" in str(response_json).lower()
        )
        if not rate_limited:
            print("Unexpected error:", response_json)
            # A bare `raise` here has no active exception to re-raise, so
            # raise an explicit error that carries the API's answer.
            raise RuntimeError(
                f"Google Translate request failed with HTTP "
                f"{response.status_code}: {response_json}"
            )

        # No point sleeping after the last attempt; fall through to the
        # "max retries" error below.
        if attempt == max_retries - 1:
            break

        # Linear backoff: 90s, 150s, 210s, ...
        wait_time = 90 + (60 * attempt)
        print(f"Rate limit exceeded. Retrying in {wait_time} seconds... (attempt {attempt+1}/{max_retries})")
        time.sleep(wait_time)

    raise Exception("Max retries exceeded for rate limit.")

In [4]:
# Load the cleaned XLSum dataset; the following cells read its 'text' and
# 'summary' columns.
df_xlsum = pd.read_csv(filepath_or_buffer="../../datasets/cleaned/cleaned_xlsum.csv")

In [5]:
# Extract the two columns to translate as plain Python lists.
texts, summaries = (df_xlsum[column].tolist() for column in ("text", "summary"))

In [6]:
batch_size = 5  # Like Bing

# Translate every article text first, then every summary, batch_size items per
# request. Results are appended as each batch returns, so whatever was already
# translated stays available if a later request fails.
texts_translated = []
summaries_translated = []
for source, translated in (
    (texts, texts_translated),
    (summaries, summaries_translated),
):
    for start in range(0, len(source), batch_size):
        # No sleep between batches: backoff is handled inside google_translate.
        translated.extend(
            google_translate(source[start:start + batch_size], google_key)
        )

In [7]:
# Overwrite the source columns with their translations. Plain lists are
# assigned positionally and pandas raises ValueError when the length does not
# match the frame, whereas wrapping them in pd.Series aligns on index labels
# and silently fills any gap with NaN.
df_xlsum['text'] = texts_translated
df_xlsum['summary'] = summaries_translated

In [8]:
# Make sure the destination directory exists before writing, so a finished
# translation run is not lost to an OSError on a missing folder.
output_path = '../../datasets/translated/google/google_translated_xlsum.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_xlsum.to_csv(output_path, index=False)