**//IMPORTS**

In [1]:
import os, gzip, requests, time
import pandas as pd
import pathlib, sys
import re
import matplotlib.pyplot as plt

from collections import Counter
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv         
from pathlib import Path
from opensubtitlescom import OpenSubtitles
from xmlrpc.client import ServerProxy, Error as XMLRPCError

project_root = pathlib.Path().resolve().parents[0] 
print(f"Project root: {project_root}")
sys.path.insert(0, str(project_root / "src"))
from db import connect     
from extract.access_open_subtitles import download_srt
from transform.align_subtitles import eliminate_new_lines, auto_sync_subs, align_subtitles_optimal_hungarian
from load.load_subtitles import load_subtitles
from catalog import collect_movies
from transform.align_subtitles import merge_subtitle_fragments


Project root: /home/rofarate/Thesis
OpenSubtitles: logged-in session OK.


**//CONFIGS**

In [2]:
load_dotenv()

API_BASE = "https://api.opensubtitles.com/api/v1"
API_KEY = os.getenv("OPENSUBTITLES_API_KEY")
USERNAME = os.getenv("OPENSUBTITLES_USER", "")
PASSWORD = os.getenv("OPENSUBTITLES_PASS", "")
USER_AGENT = "MySubtitleApp/1.0"

# REST headers
# REST_HEADERS = {
#     "Api-Key":     API_KEY,
#     "User-Agent":  USER_AGENT,
#     "Content-Type":"application/json"
# }

# HEAD = {
#     "Api-Key": os.getenv("OPENSUBTITLES_API_KEY"),
#     "User-Agent": "TeseCollector/0.1",
#     "Accept": "application/json",
# }

BASE_HEADERS = {
    "Api-Key":     API_KEY,
    "User-Agent":  "MySubtitleApp/1.0",          # <- descriptive!
    "Accept":      "application/json",           # <- important
    "Content-Type":"application/json",
}
AUTH_HEADERS = dict(BASE_HEADERS)       # will gain 'Authorization' below

# XML-RPC client (fallback path)
ost = OpenSubtitles(user_agent=USER_AGENT, api_key=API_KEY)
# optional login for higher quotas
if USERNAME and PASSWORD:
    try:
        ost.login(USERNAME, PASSWORD)
    except Exception:
        pass

# YEARS AND LANGUAGES

YEARS = range(2023, 2024)          
LANGS = {"pt-br", "pt-pt"}             
by_lang = {lang: {} for lang in LANGS}

TIMEOUT = 15

tokenize = lambda text: re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", text.lower())


In [3]:
p = Path("../data/duckdb")
p.mkdir(parents=True, exist_ok=True)

DUCKDB_PATH = Path("../data/duckdb/subs.duckdb")
PROCESSED_DIR = Path("../data/processed")        # <-- where anything derived lands
TIME_WINDOW = 60  

OS_API_KEY = os.getenv("OPENSUBTITLES_API_KEY") 

In [4]:
def td_to_srt(ts) -> str:
    """
    Convert a pandas Timedelta *or* an integer millisecond offset
    to the SRT time-stamp string  HH:MM:SS,mmm
    """
    if isinstance(ts, (int, float)):             # already ms
        total_ms = int(ts)
    else:                                        # Timedelta  → ms
        total_ms = int(ts.total_seconds() * 1000)

    hours,   rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem,        60_000)
    seconds, ms  = divmod(rem,         1_000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{ms:03}"

def calculate_sentence_difference(row):
    pt_vocab = Counter(tokenize(row.text_pt))
    br_vocab = Counter(tokenize(row.text_br))
    shared   = set(pt_vocab) & set(br_vocab)
    return row.pair_id, shared

**//MAIN CODE**

In [5]:
con = connect()                     # ← runs the DDL, makes subs.duckdb
print(con.execute("PRAGMA show_tables").fetchall())
con.close()

[('movies',), ('subtitle_pairs',)]


In [6]:
total = collect_movies(range(2023, 2024))      # 2023 only
print("movies in DB:", total)


movies in DB: 672


In [7]:
con = connect()    
movies = con.execute("SELECT imdb_id, title FROM movies").fetchall()
len(movies)

672

In [8]:

movies = movies[:10]
print(movies)

for imdb_id, title in movies:
    try:
        movie_id = con.execute(
            "SELECT movie_id FROM movies WHERE imdb_id = ?", 
            [imdb_id]
        ).fetchone()[0]

        # 2) skip if we've already stored pairs for this movie
        already = con.execute(
            "SELECT COUNT(*) FROM subtitle_pairs WHERE movie_id = ?", 
            [movie_id]
        ).fetchone()[0]
        if already:
            print(f"→ Skipping {imdb_id} ({title}), already in DB")
            continue
        # ---- download or reuse cache ------------------------------------
        pt_path = download_srt(imdb_id, "pt-PT", title=title)
        br_path = download_srt(imdb_id, "pt-BR", title=title)

        # ---- load & quick clean -----------------------------------------
        subs_pt = load_subtitles(pt_path)
        subs_br = load_subtitles(br_path)
        eliminate_new_lines(subs_pt)
        eliminate_new_lines(subs_br)

        # ---- sentence-level merge (optional but helps alignment) --------
        subs_pt = merge_subtitle_fragments(subs_pt, gap_threshold=pd.Timedelta(seconds=0.2))
        subs_br = merge_subtitle_fragments(subs_br, gap_threshold=pd.Timedelta(seconds=0.2))

        # ---- auto sync --------------------------------------------------
        shifted_pt, final_offset = auto_sync_subs(subs_pt, subs_br)

        # ---- fine alignment --------------------------------------------
        pairs = align_subtitles_optimal_hungarian(shifted_pt, subs_br)

        rows = []
        pair_no = 1
        for pt, br, score in pairs:
            if br is None:
                continue                   # skip unmatched lines

            rows.append({
                "imdb_id": imdb_id,
                "pair_no":  pair_no,
                #"pair_no":  pair_no,
                # store the raw millis             (handy for SQL filtering)
                "start_pt_ms": int(pt["start"].total_seconds() * 1000),
                "end_pt_ms":   int(pt["end"].total_seconds()   * 1000),
                "start_br_ms": int(br["start"].total_seconds() * 1000),
                "end_br_ms":   int(br["end"].total_seconds()   * 1000),
                # and the pretty SRT strings       (for export / debugging)
                # "start_pt_ms": td_to_srt(pt["start"]),
                # "end_pt_ms":   td_to_srt(pt["end"]),
                # "start_br_ms": td_to_srt(br["start"]),
                # "end_br_ms":   td_to_srt(br["end"]),
                "text_pt": pt["text"],
                "text_br": br["text"],
                "score":   float(score),
            })
            pair_no += 1

        pairs_df = pd.DataFrame(rows)
        #con.execute("DELETE FROM subtitle_pairs WHERE imdb_id = ?", [imdb_id])
        con.register("pairs_df", pairs_df)

        con.execute("""
            INSERT INTO subtitle_pairs (
            movie_id,
            pair_no,
            start_pt_ms,
            end_pt_ms,
            text_pt,
            start_br_ms,
            end_br_ms,
            text_br,
            score
            )
            SELECT
            m.movie_id,
            p.pair_no,
            p.start_pt_ms,
            p.end_pt_ms,
            p.text_pt,
            p.start_br_ms,
            p.end_br_ms,
            p.text_br,
            p.score
            FROM pairs_df AS p
            JOIN movies   AS m
            ON p.imdb_id = m.imdb_id
            """)
        
        print(f"✓ Movie {imdb_id} ({title}) in the database")

    except RuntimeError as e:        # e.g. no subtitles, 5xx after retries
        print("✗", imdb_id, title, "→", e)
    except Exception as e:
        print("⚠️  unexpected failure on", imdb_id, ":", e)

[('14230458', 'Poor Things'), ('15764854', 'Fireworks'), ('26449465', 'Peak Season'), ('19883634', 'The Old Oak'), ('27805677', 'Fairlane'), ('20465746', "L'ultima notte di Amore"), ('26671415', 'Seven Veils'), ('22687790', 'A Haunting in Venice'), ('5478456', 'We Are Zombies'), ('18072882', 'Northern Comfort')]
→ Skipping 14230458 (Poor Things), already in DB
→ Skipping 15764854 (Fireworks), already in DB
→ Skipping 26449465 (Peak Season), already in DB
→ Skipping 19883634 (The Old Oak), already in DB
Offset +0.110s (max 0.230)  Coverage 0.104  Mean 0.947
Offset +0.000s (max 0.283)  Coverage 0.104  Mean 0.947
Offset +0.000s (max 0.283)  Coverage 0.104  Mean 0.947
plateau → stop
✓ Movie 27805677 (Fairlane) in the database
⚠️  unexpected failure on 20465746 : 503 Server Error: Service Unavailable for url: https://api.opensubtitles.com/api/v1/download
Offset -0.002s (max 1.869)  Coverage 0.101  Mean 0.969
Offset +0.000s (max 1.871)  Coverage 0.101  Mean 0.969
Offset +0.000s (max 1.871)  

In [9]:
df_subtitle_pairs = con.execute("SELECT * FROM subtitle_pairs").df()
print(f"Subtitle pairs in DB: {len(df_subtitle_pairs)}")
low_scores_in_db = df_subtitle_pairs[df_subtitle_pairs.score < 0.4]
medium_scores_in_db = df_subtitle_pairs[(df_subtitle_pairs.score >= 0.4) & (df_subtitle_pairs.score < 0.7)]
high_scores_in_db = df_subtitle_pairs[df_subtitle_pairs.score >= 0.7]


Subtitle pairs in DB: 9161


In [10]:
pt_pt_vocab = Counter(
    token
    for sent in df_subtitle_pairs.text_pt
    for token in tokenize(sent)
)
pt_br_vocab = Counter(
    token
    for sent in df_subtitle_pairs.text_br
    for token in tokenize(sent)
)

shared       = set(pt_pt_vocab) & set(pt_br_vocab)
unique_pt_pt = set(pt_pt_vocab) - set(pt_br_vocab)
unique_pt_br = set(pt_br_vocab) - set(pt_pt_vocab)

print("Vocab sizes:", len(pt_pt_vocab), len(pt_br_vocab))
print("Shared types:",    len(shared))
print("Unique PT-PT types:", len(unique_pt_pt))
print("Unique PT-BR types:", len(unique_pt_br))

# Top-20 distinctive tokens on each side
pt_pt_distinct = sorted(unique_pt_pt, key=lambda t: pt_pt_vocab[t], reverse=True)[:20]
pt_br_distinct = sorted(unique_pt_br, key=lambda t: pt_br_vocab[t], reverse=True)[:20]
print("Top PT-PT distinct:", pt_pt_distinct)
print("Top PT-BR distinct:", pt_br_distinct)


Vocab sizes: 7157 6740
Shared types: 4482
Unique PT-PT types: 2675
Unique PT-BR types: 2258
Top PT-PT distinct: ['estás', 'tens', 'sabes', 'vos', 'queres', 'podes', 'pá', 'ca', 'vais', 'mna', 'contigo', 'porquê', 'pára', 'demasiado', 'câmara', 'totò', 'véni', 'fizeste', 'sítio', 'facto']
Top PT-BR distinct: ['font', 'color', 'ffff', 'pra', 'srta', 'falando', 'tá', 'indo', 'tentando', 'câmera', 'acontecendo', 'peguei', 'h', 'entende', 'detalhes', 'vendo', 'né', 'ouvindo', 'trabalhando', 'entendi']


**//GOOD SCORES ANALYSIS**

In [12]:
good_row_count = len(high_scores_in_db)
print(f"Subtitle pairs in DB: {good_row_count}")
high_scores_in_db

Subtitle pairs in DB: 5735


Unnamed: 0,pair_id,movie_id,pair_no,start_pt_ms,end_pt_ms,text_pt,start_br_ms,end_br_ms,text_br,score
0,1,1,1,980,5938,CONTÉM REPRESENTAÇÕES DE PRODUTOS DE TABACO,1000,6000,HÁ CENAS DE USO DE TABACO,0.71290
1,2,1,2,111772,118272,POBRES CRIATURAS,111800,118100,POBRES CRIATURAS,0.99986
2,3,1,3,286522,288480,Adeus!,286542,288500,Adeus.,0.88090
5,6,1,6,301147,305605,Quem gostaria de reconstruir os órgãos?,301167,305625,"Bem, quem gostaria de recolocar os órgãos?",0.85290
6,7,1,7,305772,310772,"E quem consegue distinguir humano de animal, i...",305792,310792,"E quem pode distinguir o humano do animal, se ...",0.87390
...,...,...,...,...,...,...,...,...,...,...
9156,9157,10,1162,5270708,5273916,Estaremos a comer Currywurst em Römerberg ante...,5270708,5273916,Estaremos comendo Currywurst em Römerberg ante...,0.84600
9157,9158,10,1163,5275708,5278916,"Muito bem! Segurem-se bem, pessoal. Vai ficar ...",5275708,5278916,"Muito bem! Segurem-se bem, pessoal. Vai ficar ...",0.95800
9158,9159,10,1164,5279000,5280708,"Muito bem, jóquei.",5279000,5280708,"Muito bem, jóquei.",1.00000
9159,9160,10,1165,5280791,5282125,Pronto para evacuar.,5280791,5282125,Pronto para evacuar.,1.00000


**//BAD SCORES ANALYSIS**

In [13]:
bad_row_count = len(low_scores_in_db)
print(f"Low-score pairs in DB: {bad_row_count}")
low_scores_in_db

Low-score pairs in DB: 161


Unnamed: 0,pair_id,movie_id,pair_no,start_pt_ms,end_pt_ms,text_pt,start_br_ms,end_br_ms,text_br,score
192,193,1,193,1352272,1353355,E para sempre.,1363667,1367458,ou prisões mal equipadas e poucos funcionários?,0.355025
401,402,1,402,2526522,2527688,Uma pândega!,2526542,2527708,Delícia.,0.369900
1186,1187,1,1187,6586480,6587355,Armou-me uma cilada!,6549125,6552542,Um diabo em um corpo sedutor e insaciável...,0.330225
1206,1207,1,1207,6667938,6670688,{\an8}FALECIDO em Breve,6641500,6644292,Idiota com cara de boceta.,0.370810
1244,1245,1,1245,6846647,6847522,Cabra!,6835167,6836167,Bam.,0.382600
...,...,...,...,...,...,...,...,...,...,...
6454,6455,7,1055,5247851,5249251,Ooh. Incrível.,5241347,5242348,Como assim?,0.379480
6517,6518,7,1118,5764033,5765535,-A venda é transparente.,5799071,5800072,Desculpa.,0.334810
6523,6524,7,1124,5798901,5801137,-Sinto muito.,5833189,5834273,Pronto?,0.338560
7699,7700,8,1137,4616250,4618208,Porque é que ele tinha de estar sozinho?,4565333,4567000,O que aconteceu? O que deu nele?,0.339415


**//MEDIUM SCORES ANALYSIS**

In [14]:
medium_row_count = len(medium_scores_in_db)
print(f"Medium-score pairs in DB: {medium_row_count}")
medium_scores_in_db

Medium-score pairs in DB: 3265


Unnamed: 0,pair_id,movie_id,pair_no,start_pt_ms,end_pt_ms,text_pt,start_br_ms,end_br_ms,text_br,score
3,4,1,4,291647,297688,Uma pilha de órgãos sem a centelha de humanida...,291667,295458,Um monte de órgãos sem a centelha de si na men...,0.6779
4,5,1,5,297855,300688,Parece a banca de um talhante para um banquete.,297875,300708,É só um prato do açougueiro para o almoço de d...,0.5519
7,8,1,8,313938,314772,Vá lá!,313958,314792,Vamos!,0.5309
8,9,1,9,315480,318813,"Fizeram puzzles em crianças, certo?",315500,318833,"Montavam quebra-cabeças quando crianças, não?",0.6499
13,14,1,14,333897,336063,A tua proximidade não quer dizer nada.,333917,336083,Sua proximidade física conosco não o inclui nela.,0.6989
...,...,...,...,...,...,...,...,...,...,...
9049,9050,10,1055,4764000,4765416,"Está bem, está bem, está bem!",4764000,4765416,"Ok, ok, ok!",0.4750
9085,9086,10,1091,4960416,4964666,Não é? Posso arranjar-vos uma estadia num hote...,4960416,4964666,Certo? Posso providenciar para que você se hos...,0.6780
9090,9091,10,1096,4972416,4974416,Tens de nos pôr nesse voo.,4972416,4974416,Você tem que nos colocar naquele voo.,0.6990
9130,9131,10,1136,5103833,5110416,"Olha, eu interpretei mal a situação e... Peço ...",5103833,5110416,"Veja, eu interpretei a situação de forma total...",0.6780


In [16]:
sentence_difference = {}
different_sentence_count = 0

for row in medium_scores_in_db.itertuples():
    pid, shared = calculate_sentence_difference(row)
    sentence_difference[pid] = shared

print("Sentence more different:")
for pid, shared in sentence_difference.items():
    if len(shared) < 2:  # threshold for "more different"
        different_sentence_count += 1
        print(f"Sentence in PT, ", medium_scores_in_db.loc[medium_scores_in_db.pair_id == pid, 'text_pt'].values[0])
        print(f"Sentence in BR, ", medium_scores_in_db.loc[medium_scores_in_db.pair_id == pid, 'text_br'].values[0])

print(f"Total number of super different sentences: {different_sentence_count}")


Sentence more different:
Sentence in PT,  Vá lá!
Sentence in BR,  Vamos!
Sentence in PT,  Fizeram puzzles em crianças, certo?
Sentence in BR,  Montavam quebra-cabeças quando crianças, não?
Sentence in PT,  Não chateies! Vai comprar um fato.
Sentence in BR,  Vá se foder, velho amigo. Consiga um traje.
Sentence in PT,  Ia gostar bastante.
Sentence in BR,  Adoraria isso.
Sentence in PT,  A piada é minha.
Sentence in BR,  Foi uma piada que inventei.
Sentence in PT,  Parece obra do demónio.
Sentence in BR,  Servo do diabo!
Sentence in PT,  Se deixasse crescer a barba?
Sentence in BR,  Já pensou em ficar com barba, senhor?
Sentence in PT,  Este trabalho.
Sentence in BR,  Trabalha aqui.
Sentence in PT,  Sim. Vamos.
Sentence in BR,  Sim. Entre.
Sentence in PT,  E eu reparei-a.
Sentence in BR,  Notei.
Sentence in PT,  Tenho de anotar os progressos meticulosamente.
Sentence in BR,  Preciso anotar minuciosamente seu progresso.
Sentence in PT,  Pode ajudar-me a fazer isso?
Sentence in BR,  Fará is