**//IMPORTS**

In [1]:
import os, gzip, requests, time
import pandas as pd
import pathlib

from datetime import datetime, timezone
from dotenv import load_dotenv         
from pathlib import Path
from opensubtitlescom import OpenSubtitles
from xmlrpc.client import ServerProxy, Error as XMLRPCError

**//CONFIGS**

In [2]:
# Pull OPENSUBTITLES_* settings from a local .env file into the environment.
load_dotenv()

API_BASE = "https://api.opensubtitles.com/api/v1"
API_KEY = os.getenv("OPENSUBTITLES_API_KEY")
USERNAME = os.getenv("OPENSUBTITLES_USER", "")
PASSWORD = os.getenv("OPENSUBTITLES_PASS", "")
USER_AGENT = "MySubtitleApp/1.0"

# Every request below sends the key in the "Api-Key" header, so flag a missing
# key here instead of letting it surface later as an opaque HTTP error.
if not API_KEY:
    print("Warning: OPENSUBTITLES_API_KEY is not set; API requests will likely be rejected.")

# Headers shared by every REST call. The User-Agent reuses USER_AGENT so the
# value is defined in exactly one place.
BASE_HEADERS = {
    "Api-Key":     API_KEY,
    "User-Agent":  USER_AGENT,                   # <- descriptive!
    "Accept":      "application/json",           # <- important
    "Content-Type":"application/json",
}
AUTH_HEADERS = dict(BASE_HEADERS)       # will gain 'Authorization' below

# Client from the `opensubtitlescom` package, kept as a fallback path.
# NOTE(review): this was labelled "XML-RPC client" — confirm which protocol the
# package actually speaks.
ost = OpenSubtitles(user_agent=USER_AGENT, api_key=API_KEY)
# optional login for higher quotas
if USERNAME and PASSWORD:
    try:
        ost.login(USERNAME, PASSWORD)
    except Exception as exc:
        # Best effort: stay logged out, but report why instead of hiding it.
        print("opensubtitlescom login failed; continuing without it:", exc)

# YEARS AND LANGUAGES

YEARS = range(2023, 2024)              # half-open range: covers 2023 only
LANGS = {"pt-br", "pt-pt"}
# Per-language mapping of imdb_id -> title, filled by the collection cell.
by_lang = {lang: {} for lang in LANGS}

# Timeout, in seconds, passed to the `requests` calls.
TIMEOUT = 15



In [3]:
# Destination folders for the downloaded subtitle files. They are Path objects
# because the save_all_* helpers call `.glob()` and the `/` operator on them,
# which a plain string does not support.
OUTPUT_DIR_BR = Path("../data/raw/test_br_subs")
OUTPUT_DIR_PT = Path("../data/raw/test_pt_subs")
for _subs_dir in (OUTPUT_DIR_BR, OUTPUT_DIR_PT):
    # Create missing parents too; a no-op when the folder already exists.
    _subs_dir.mkdir(parents=True, exist_ok=True)

**//MOVIES IN OPENSUBTITLES IN PT AND BR**

In [4]:
# Page through the subtitle search once per (language, year) and record every
# movie seen in by_lang[lang] as imdb_id -> title.
for lang in LANGS:
    for year in YEARS:
        page = 1
        while True:
            q = {"languages": lang, "year": year, "type": "movie", "page": page}
            r = requests.get(f"{API_BASE}/subtitles", headers=BASE_HEADERS, params=q, timeout=TIMEOUT)
            r.raise_for_status()
            payload = r.json()          # parse the body once and reuse it
            data = payload.get("data") or []
            if not data:
                break
            for sub in data:
                feat = sub["attributes"].get("feature_details") or {}
                imdb_id = feat.get("imdb_id")
                if imdb_id is None:
                    # Without an id the entry cannot be matched across languages.
                    continue
                by_lang[lang][imdb_id] = feat.get("title") or ""
            total_pages = payload.get("total_pages")
            if total_pages is not None and page >= total_pages:
                break                   # last page reached: skip the extra sleep
            page += 1
            time.sleep(1)               # stay under 40 req/10 s

# Keep only the movies that have subtitles in both variants; titles come from
# the pt-br listing.
both_langs = set(by_lang["pt-br"]) & set(by_lang["pt-pt"])
MOVIES = [(imdb, by_lang["pt-br"][imdb]) for imdb in both_langs]

print(f"{len(MOVIES)} movies with pt-BR *and* pt-PT subtitles (2023-):")
for imdb, title in sorted(MOVIES, key=lambda x: x[1].lower()):
    print(imdb, title)


672 movies with pt-BR *and* pt-PT subtitles (2023-):
24852002 10 Days of a Bad Man
27729024 100 Years of Warner Bros.
24082438 20 Days in Mariupol
14773940 24 Hours with Gaspar
18083578 57 Seconds
12261776 65
18079362 80 for Brady
27861034 85 South: Ghetto Legends
16496386 97 Minutes
15282148 A Beautiful Life
16731908 A Brighter Tomorrow
20414642 A Day and a Half
13022120 A Deadly Invitation
21051906 A Family Affair
14153080 A Good Person
22687790 A Haunting in Venice
29231347 A Heidelberg Holiday
4225012 A Little White Lie
21940010 A Million Miles Away
12427158 A Thousand and One
20115096 A Tourist's Guide to Love
27647417 Aaron Carter: The Little Prince of Pop
27445004 Abang Adik
13231544 About Dry Grasses
8373206 About My Father
26440619 Afire
15334488 After Everything
14746834 After the Bite
16419074 Air
27197387 AKA
20424172 Alibi.com 2
21192142 All of Us Strangers
14748950 All to Play For
27921620 All-Time High
23561236 American Fiction
28865980 American Symphony
27791865 Amy Sch

In [5]:
# MOVIES = pd.read_csv("tmdb_movies_2024_with_imdb.csv")
# MOVIES = MOVIES[MOVIES["imdb_id"].notna()]
# MOVIES = MOVIES.to_dict(orient="records")

# MOVIES = [
#     {"imdb_id": "tt14513804", "title": "Captain America: Brave New World"},
#     {"imdb_id": "tt34463310", "title": "Detective Chinatown 1900"},
# ]

**//FUNCTIONS** Functions to be put in the extract folder

In [6]:
def login() -> str | None:
    """Log in to the REST API and build the Authorization header value.

    Returns:
        ``'Bearer <token>'`` when the login succeeds. ``None`` when USERNAME or
        PASSWORD is empty (no request is made) or when the API answers 401
        (the API's message is printed).

    Raises:
        requests.HTTPError: for any other unsuccessful status code.
    """
    have_credentials = bool(USERNAME) and bool(PASSWORD)
    if not have_credentials:
        # No credentials configured: the caller stays in key-only mode.
        return None

    credentials = {"username": USERNAME, "password": PASSWORD}
    response = requests.post(
        f"{API_BASE}/login",
        headers=BASE_HEADERS,
        json=credentials,
        timeout=TIMEOUT,
    )

    if response.status_code != 401:
        response.raise_for_status()
        token = response.json()['token']
        return f"Bearer {token}"

    # Rejected credentials are reported, not raised.
    print("Login failed:", response.json().get("message"))
    return None

# Attach the bearer token to the authenticated header set only when login()
# actually produced one; otherwise AUTH_HEADERS stays equal to BASE_HEADERS.
if bearer := login():
    AUTH_HEADERS.update({"Authorization": bearer})

def fetch_subtitles_for_br(imdb_id: str):
    """Fetch the top pt-BR subtitle for a title and return its text.

    Searches ``{API_BASE}/subtitles`` for ``imdb_id`` restricted to ``pt-br``
    and ordered by downloads, asks ``{API_BASE}/download`` for a link to the
    first file of the first hit, downloads it and decodes it as UTF-8.

    Args:
        imdb_id: Identifier sent as the ``imdb_id`` search parameter.

    Returns:
        The subtitle text (undecodable bytes are replaced), or ``None`` when
        the search has no hits, the top hit lists no files, or the download
        endpoint answers 406.

    Raises:
        requests.HTTPError: on any other unsuccessful HTTP status, including a
            failed download of the subtitle file itself.
    """
    # 1) Search
    resp = requests.get(
        f"{API_BASE}/subtitles",
        headers=AUTH_HEADERS,          # sends Api-Key & User-Agent
        params={
            "imdb_id":   imdb_id,
            "languages": "pt-br",
            "order_by":  "downloads",
        },
        timeout=TIMEOUT
    )
    resp.raise_for_status()
    hit = next(iter(resp.json().get("data") or []), None)
    if not hit:
        return None            # nothing in that language

    files = hit["attributes"].get("files") or []
    if not files:
        return None            # the hit has no downloadable file
    file_id = files[0]["file_id"]

    # 2) Download
    dl = requests.post(f"{API_BASE}/download",
                       headers=AUTH_HEADERS,
                       json={"file_id": file_id},
                       timeout=TIMEOUT)

    if dl.status_code == 406:          # show the API explanation
        print("Download refused:", dl.json())
        return None
    dl.raise_for_status()

    url = dl.json()["link"]
    file_resp = requests.get(url, timeout=TIMEOUT)
    # Fail loudly rather than storing an HTTP error page as subtitle text.
    file_resp.raise_for_status()
    blob = file_resp.content
    # Detect gzip by its magic bytes; a URL-suffix check misses links that
    # carry a query string after the file name.
    if blob[:2] == b"\x1f\x8b":
        blob = gzip.decompress(blob)

    return blob.decode("utf-8", errors="replace")

def save_all_br(movies):
    """Download and store one pt-BR subtitle file per movie.

    Each subtitle is written to ``OUTPUT_DIR_BR/<imdb_id>.srt``. Movies whose
    file already exists are skipped without calling the API.

    Args:
        movies: Iterable of movies. Each item is either a dict with
            ``imdb_id`` and ``title`` keys or an ``(imdb_id, title)`` pair,
            which is the shape of the entries in ``MOVIES``.
    """
    out_dir = Path(OUTPUT_DIR_BR)      # also accepts a plain string path
    for movie in movies:
        if isinstance(movie, dict):
            imdb_id, title = movie["imdb_id"], movie["title"]
        else:
            imdb_id, title = movie

        out = out_dir / f"{imdb_id}.srt"
        # Test for the target file itself: an id never equals one of the Path
        # objects yielded by glob(), so a membership check could not match.
        if out.exists():
            print(f"✓ Already have {imdb_id} ({title})")
            continue
        srt = fetch_subtitles_for_br(imdb_id)
        if not srt:
            print(f"✗ No PT-BR subs for {imdb_id} ({title})")
            continue
        out.write_text(srt, encoding="utf-8")
        print("✓ Saved:", out)

In [None]:
def fetch_subtitles_for_pt(imdb_id: str):
    """Fetch the top pt-PT subtitle for a title and return its text.

    Searches ``{API_BASE}/subtitles`` for ``imdb_id`` restricted to ``pt-pt``
    and ordered by downloads, asks ``{API_BASE}/download`` for a link to the
    first file of the first hit, downloads it and decodes it as UTF-8.

    Args:
        imdb_id: Identifier sent as the ``imdb_id`` search parameter.

    Returns:
        The subtitle text (undecodable bytes are replaced), or ``None`` when
        the search has no hits, the top hit lists no files, or the download
        endpoint answers 406.

    Raises:
        requests.HTTPError: on any other unsuccessful HTTP status, including a
            failed download of the subtitle file itself.
    """
    # 1) Search
    resp = requests.get(
        f"{API_BASE}/subtitles",
        headers=AUTH_HEADERS,          # sends Api-Key & User-Agent
        params={
            "imdb_id":   imdb_id,
            "languages": "pt-pt",
            "order_by":  "downloads",
        },
        timeout=TIMEOUT
    )
    resp.raise_for_status()
    hit = next(iter(resp.json().get("data") or []), None)
    if not hit:
        return None            # nothing in that language

    files = hit["attributes"].get("files") or []
    if not files:
        return None            # the hit has no downloadable file
    file_id = files[0]["file_id"]

    # 2) Download
    dl = requests.post(f"{API_BASE}/download",
                       headers=AUTH_HEADERS,
                       json={"file_id": file_id},
                       timeout=TIMEOUT)

    if dl.status_code == 406:          # show the API explanation
        print("Download refused:", dl.json())
        return None
    dl.raise_for_status()

    url = dl.json()["link"]
    file_resp = requests.get(url, timeout=TIMEOUT)
    # Fail loudly rather than storing an HTTP error page as subtitle text.
    file_resp.raise_for_status()
    blob = file_resp.content
    # Detect gzip by its magic bytes; a URL-suffix check misses links that
    # carry a query string after the file name.
    if blob[:2] == b"\x1f\x8b":
        blob = gzip.decompress(blob)

    return blob.decode("utf-8", errors="replace")

def save_all_pt(movies):
    """Download and store one pt-PT subtitle file per movie.

    Each subtitle is written to ``OUTPUT_DIR_PT/<imdb_id>.srt``. Movies whose
    file already exists are skipped without calling the API.

    Args:
        movies: Iterable of movies. Each item is either a dict with
            ``imdb_id`` and ``title`` keys or an ``(imdb_id, title)`` pair,
            which is the shape of the entries in ``MOVIES``.
    """
    out_dir = Path(OUTPUT_DIR_PT)      # also accepts a plain string path
    for movie in movies:
        if isinstance(movie, dict):
            imdb_id, title = movie["imdb_id"], movie["title"]
        else:
            imdb_id, title = movie

        out = out_dir / f"{imdb_id}.srt"
        # Test for the target file itself: an id never equals one of the Path
        # objects yielded by glob(), so a membership check could not match.
        if out.exists():
            print(f"✓ Already have {imdb_id} ({title})")
            continue
        srt = fetch_subtitles_for_pt(imdb_id)
        if not srt:
            print(f"✗ No PT-PT subs for {imdb_id} ({title})")
            continue
        out.write_text(srt, encoding="utf-8")
        print("✓ Saved:", out)

**//MAIN CODE**

In [None]:
# Spot check: print the title once for every MOVIES entry whose title (the
# second element of the pair) is exactly 'Gran Turismo'.
for m in MOVIES:
    if m[1] != 'Gran Turismo':
        continue
    print("Gran Turismo")


The Toxic Avenger


(28524545, 'Mystery Island')