In [15]:
# Gemini API keys, one per translation job (referenced by name via the
# "api_key_var" entries of FILE_SPECS).
#
# Each key is read from the environment variable of the same name when it is
# set, so real credentials never have to be typed into (or committed with) the
# notebook. The masked literal is only a fallback placeholder that keeps the
# names defined when nothing is exported.
import os  # imported here because this cell runs before the main import cell

API_1 = os.environ.get("API_1", "***************************************")
API_2 = os.environ.get("API_2", "***************************************")
API_3 = os.environ.get("API_3", "***************************************")
API_4 = os.environ.get("API_4", "***************************************")

In [16]:
import os
import re
import time
from pathlib import Path
from typing import List, Dict

import pandas as pd
from tqdm import tqdm
import google.generativeai as genai

In [17]:
# --- Locations ---------------------------------------------------------------
# Root of the input dataset and the folder that receives the translated CSVs.
INPUT_DIR = Path("/kaggle/input/new-snli/new-snli/")
OUTPUT_DIR = Path("./outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Translation settings ------------------------------------------------------
# The model can be overridden through the GEMINI_MODEL_NAME environment variable.
MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "models/gemini-2.5-flash-lite")
BATCH_SIZE = 15    # sentences sent per request
MAX_RETRIES = 6    # attempts per batch before giving up
BASE_BACKOFF = 8   # seconds; first wait after a rate-limit style error

# --- Jobs ------------------------------------------------------------------------
# One entry per output file. Each row below is:
#   (output file base name, CSV path relative to INPUT_DIR,
#    column holding the English text, name of the variable holding the API key)
# The API key variables (API_1..API_4) are defined in an earlier cell.
FILE_SPECS: Dict[str, Dict[str, str]] = {
    file_base: {"csv_path": csv_path, "column": column, "api_key_var": api_key_var}
    for file_base, csv_path, column, api_key_var in (
        ("dev_df1", "dev/sentence1_translations.csv", "sentence1", "API_1"),
        ("dev_df2", "dev/sentence2_translations.csv", "sentence2", "API_2"),
        ("test_df1", "test/sentence1_translations.csv", "sentence1", "API_3"),
        ("test_df2", "test/sentence2_translations.csv", "sentence2", "API_4"),
    )
}

In [18]:
def _get_api_key(var_name: str) -> str:
    """Return the API key stored under ``var_name``.

    Lookup order:
      1. a module-level (notebook) variable called ``var_name``, used only when
         it holds a non-blank string;
      2. the environment variable of the same name.

    Args:
        var_name: Name of the variable / environment variable to read.

    Returns:
        The key, or an empty string when neither source provides one.
    """
    candidate = globals().get(var_name)
    # A variable that exists but is empty, blank, or not a string must not
    # shadow a valid key exported through the environment.
    if isinstance(candidate, str) and candidate.strip():
        return candidate
    return os.environ.get(var_name, "")

def make_prompt(batch: List[str]) -> str:
    """Build the translation prompt for one batch of English sentences.

    The prompt consists of two instruction lines, a blank line, and then the
    sentences numbered from 1, one per line.

    Args:
        batch: English sentences to translate.

    Returns:
        The full prompt text.
    """
    count = len(batch)
    instructions = (
        f"Translate the following {count} English sentences into Nigerian Pidgin.\n"
        f"Return ONLY the translations, one per line, numbered 1..{count}.\n\n"
    )
    numbered_lines = []
    for position, sentence in enumerate(batch, start=1):
        numbered_lines.append(f"{position}. {sentence}")
    return instructions + "\n".join(numbered_lines)

# Strips ONE leading list marker from a line: "3. ", "3) ", "3: ", "3- ",
# "- " or "* ". After a number the delimiter is mandatory and must be followed
# by whitespace (or end the line), so text that merely starts with a number
# ("2 men ...", "3-year-old ...", "1.5 ...") is left untouched.
_leading_num_re = re.compile(r"^\s*(?:\d+[\.\)\-:](?=\s|$)\s*|[\-\*]\s*)")

# Matches a line of the form "<number><delimiter> <text>" and captures the
# number and the (possibly absent) text.
_numbered_line_re = re.compile(r"^\s*(\d+)[\.\)\-:](?:\s+(.*))?$")


def parse_translations(text: str, expected_n: int) -> List[str]:
    """Turn the model's reply into exactly ``expected_n`` translations.

    Two strategies are used, in this order:

    1. Numbered: when the reply's lines carry item numbers (every expected
       number is present, or more than half of the non-empty lines are
       numbered), each translation is placed in the slot named by its number.
       Preamble lines, out-of-range numbers and repeated numbers are ignored,
       and a numbered line with no text leaves only its own slot as
       "[MISSING LINE]" instead of shifting every later row.
    2. Positional: otherwise list markers are stripped and the non-empty lines
       are taken in order, truncated or padded with "[MISSING LINE]".

    Args:
        text: Raw text returned by the model (may be empty or None).
        expected_n: Number of sentences that were sent.

    Returns:
        A list of length ``expected_n`` (empty when ``expected_n`` <= 0).
        Every entry is "[TRANSLATION FAILED]" when no usable line was found.
    """
    if expected_n <= 0:
        return []
    failed = ["[TRANSLATION FAILED]"] * expected_n
    if not text:
        return failed

    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    # --- Strategy 1: align by the numbers the model wrote ---------------------
    slots = {}        # item number -> translation text
    numbers_seen = set()
    for ln in lines:
        match = _numbered_line_re.match(ln)
        if match is None:
            continue
        number = int(match.group(1))
        if not 1 <= number <= expected_n or number in numbers_seen:
            continue  # out of range or a repeat: the first occurrence wins
        numbers_seen.add(number)
        body = (match.group(2) or "").strip()
        if body:
            slots[number] = body

    numbering_is_reliable = (
        len(numbers_seen) == expected_n or 2 * len(numbers_seen) > len(lines)
    )
    if numbering_is_reliable:
        if not slots:
            return failed
        return [slots.get(i, "[MISSING LINE]") for i in range(1, expected_n + 1)]

    # --- Strategy 2: positional fallback for unnumbered replies ---------------
    cleaned = [_leading_num_re.sub("", ln).strip() for ln in lines]
    cleaned = [ln for ln in cleaned if ln]
    if not cleaned:
        return failed
    if len(cleaned) >= expected_n:
        return cleaned[:expected_n]
    return cleaned + ["[MISSING LINE]"] * (expected_n - len(cleaned))

def translate_batch(model, batch: List[str]) -> List[str]:
    """Translate one batch of sentences, retrying on errors.

    The prompt is sent with ``model.generate_content`` up to MAX_RETRIES times.
    After an error that looks like rate limiting or quota exhaustion the wait
    starts at BASE_BACKOFF seconds and doubles each time; after any other error
    the wait is 5 seconds. No wait happens after the final attempt.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute (process_one passes a ``genai.GenerativeModel``).
        batch: English sentences to translate.

    Returns:
        One string per input sentence. When every attempt raised, each entry
        is "[TRANSLATION FAILED]" and the last error is reported.
    """
    prompt = make_prompt(batch)
    backoff = BASE_BACKOFF
    last_error = None
    rate_limit_tokens = ("429", "rate", "quota", "resource has been exhausted")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = model.generate_content(prompt)
            text = getattr(resp, "text", "") or ""
            return parse_translations(text, expected_n=len(batch))
        except Exception as e:
            last_error = e
            if attempt == MAX_RETRIES:
                break  # nothing left to retry, so waiting would be wasted time
            # Compare in lower case so "Rate limit" / "Quota" are recognised too.
            msg = str(e).lower()
            if any(tok in msg for tok in rate_limit_tokens):
                time.sleep(backoff)
                backoff *= 2
            else:
                time.sleep(5)

    # Surface the failure instead of silently writing placeholders; tqdm.write
    # keeps the message from breaking an active progress bar.
    tqdm.write(
        f"Batch of {len(batch)} sentences failed after {MAX_RETRIES} attempts: {last_error!r}"
    )
    return ["[TRANSLATION FAILED]"] * len(batch)

def pick_english_series(df: pd.DataFrame, preferred: str) -> pd.Series:
    """Select the column holding the English sentences and return it as text.

    Column lookup order: ``preferred``, ``preferred`` in lower case, then
    "text", "english", "sentence". If none exists, the first column of the
    frame is used as a last resort.

    Missing cells become empty strings. Casting with ``astype(str)`` alone can
    turn them into the literal text "nan", which a later ``fillna("")`` no
    longer recognises as missing and which would be sent for translation.

    Args:
        df: Frame loaded from one of the input CSVs.
        preferred: Expected column name ('sentence1' or 'sentence2').

    Returns:
        A Series of strings with the same index as ``df``.

    Raises:
        IndexError: If no candidate column exists and ``df`` has no columns.
    """
    candidates = [preferred, preferred.lower(), "text", "english", "sentence"]
    chosen = next((c for c in candidates if c in df.columns), None)
    raw = df[chosen] if chosen is not None else df.iloc[:, 0]
    # Stringify first, then blank every position that was missing in the
    # original data.
    return raw.astype(str).mask(raw.isna(), "")

In [19]:
def process_one(file_base: str, spec: Dict[str, str]):
    """Translate one input CSV and write ``<file_base>.csv`` into OUTPUT_DIR.

    Steps: resolve the CSV path and API key from ``spec``, configure Gemini,
    read the English column, translate it in batches of BATCH_SIZE, and save a
    CSV with the columns ``no``, ``english`` and ``pidgin``.

    The job is skipped, with a printed message, when the CSV is missing, the
    API key is empty, or the column holds no non-blank text.

    Args:
        file_base: Output file name without extension.
        spec: Job description with the keys "csv_path", "column" and
            "api_key_var".
    """
    source_csv = INPUT_DIR / spec["csv_path"]
    wanted_column = spec["column"]
    key = _get_api_key(spec["api_key_var"])

    # Preconditions: bail out early rather than failing mid-run.
    if not source_csv.exists():
        print(f"❌ Missing CSV: {source_csv}")
        return
    if not key:
        print(f"❌ Missing API key for {file_base} ({spec['api_key_var']}).")
        return

    # Gemini client for this job's key.
    genai.configure(api_key=key)
    gemini = genai.GenerativeModel(MODEL_NAME)

    # English sentences, in file order.
    frame = pd.read_csv(source_csv)
    sentences = pick_english_series(frame, preferred=wanted_column).fillna("").tolist()

    if all(not sentence.strip() for sentence in sentences):
        print(f"⚠️ No non-empty lines found in {source_csv}. Skipping.")
        return

    # Fixed-size chunks; the final chunk may be shorter.
    chunk_starts = range(0, len(sentences), BATCH_SIZE)
    chunks = [sentences[start:start + BATCH_SIZE] for start in chunk_starts]

    translated: List[str] = []
    for chunk in tqdm(chunks, desc=f"Translating {file_base}", unit="batch"):
        translated += translate_batch(gemini, chunk)

    # Output schema: 1-based row number, source sentence, translation.
    result = pd.DataFrame(
        {
            "no": range(1, len(sentences) + 1),
            "english": sentences,
            "pidgin": translated,
        }
    )

    destination = OUTPUT_DIR / f"{file_base}.csv"
    result.to_csv(destination, index=False)
    print(f"✅ Wrote {destination.resolve()}")

def main():
    """Run every translation job declared in FILE_SPECS, in declaration order."""
    for job_name in FILE_SPECS:
        process_one(job_name, FILE_SPECS[job_name])


# Start the run only when executed directly (e.g. as a notebook cell or
# script), not when this code is imported.
if __name__ == "__main__":
    main()

Translating dev_df1: 100%|██████████| 56/56 [03:21<00:00,  3.61s/batch]


✅ Wrote /kaggle/working/outputs/dev_df1.csv


Translating dev_df2: 100%|██████████| 166/166 [10:12<00:00,  3.69s/batch]


✅ Wrote /kaggle/working/outputs/dev_df2.csv


Translating test_df1: 100%|██████████| 112/112 [07:11<00:00,  3.85s/batch]


✅ Wrote /kaggle/working/outputs/test_df1.csv


Translating test_df2: 100%|██████████| 334/334 [20:11<00:00,  3.63s/batch] 

✅ Wrote /kaggle/working/outputs/test_df2.csv



