**//IMPORTS**

In [1]:
import os, gzip, requests, time
import pandas as pd
import pathlib, sys
import re
import matplotlib.pyplot as plt

from collections import Counter
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv         
from pathlib import Path
from opensubtitlescom import OpenSubtitles
from xmlrpc.client import ServerProxy, Error as XMLRPCError

project_root = pathlib.Path().resolve().parents[0] 
print(f"Project root: {project_root}")
sys.path.insert(0, str(project_root / "src"))
from db import connect     
from extract.access_open_subtitles import download_srt
from transform.align_subtitles import eliminate_new_lines, auto_sync_subs, align_subtitles_optimal_hungarian, clean_sub_blocks, strip_tags_str
from load.load_subtitles import load_subtitles
from catalog import collect_movies
from transform.align_subtitles import merge_subtitle_fragments


Project root: /home/rofarate/Thesis
OpenSubtitles: logged-in session OK.


**//CONFIGS**

In [2]:
load_dotenv()

API_BASE = "https://api.opensubtitles.com/api/v1"
API_KEY = os.getenv("OPENSUBTITLES_API_KEY")
USERNAME = os.getenv("OPENSUBTITLES_USER", "")
PASSWORD = os.getenv("OPENSUBTITLES_PASS", "")
USER_AGENT = "MySubtitleApp/1.0"

# REST headers
# REST_HEADERS = {
#     "Api-Key":     API_KEY,
#     "User-Agent":  USER_AGENT,
#     "Content-Type":"application/json"
# }

# HEAD = {
#     "Api-Key": os.getenv("OPENSUBTITLES_API_KEY"),
#     "User-Agent": "TeseCollector/0.1",
#     "Accept": "application/json",
# }

BASE_HEADERS = {
    "Api-Key":     API_KEY,
    "User-Agent":  "MySubtitleApp/1.0",          # <- descriptive!
    "Accept":      "application/json",           # <- important
    "Content-Type":"application/json",
}
AUTH_HEADERS = dict(BASE_HEADERS)       # will gain 'Authorization' below

# XML-RPC client (fallback path)
ost = OpenSubtitles(user_agent=USER_AGENT, api_key=API_KEY)
# optional login for higher quotas
if USERNAME and PASSWORD:
    try:
        ost.login(USERNAME, PASSWORD)
    except Exception:
        pass

# YEARS AND LANGUAGES

YEARS = range(2023, 2024)          
LANGS = {"pt-br", "pt-pt"}             
by_lang = {lang: {} for lang in LANGS}

TIMEOUT = 15

tokenize = lambda text: re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", text.lower())


In [3]:
p = Path("../data/duckdb")
p.mkdir(parents=True, exist_ok=True)

DUCKDB_PATH = Path("../data/duckdb/subs.duckdb")
PROCESSED_DIR = Path("../data/processed")        # <-- where anything derived lands
TIME_WINDOW = 60  

OS_API_KEY = os.getenv("OPENSUBTITLES_API_KEY") 

In [4]:
def td_to_srt(ts) -> str:
    """
    Convert a pandas Timedelta *or* an integer millisecond offset
    to the SRT time-stamp string  HH:MM:SS,mmm
    """
    if isinstance(ts, (int, float)):             # already ms
        total_ms = int(ts)
    else:                                        # Timedelta  → ms
        total_ms = int(ts.total_seconds() * 1000)

    hours,   rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem,        60_000)
    seconds, ms  = divmod(rem,         1_000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{ms:03}"

def calculate_sentence_difference(row):
    pt_vocab = Counter(tokenize(row.text_pt))
    br_vocab = Counter(tokenize(row.text_br))
    shared   = set(pt_vocab) & set(br_vocab)
    return row.pair_id, shared

**//MAIN CODE**

In [5]:
con = connect()                     # ← runs the DDL, makes subs.duckdb
print(con.execute("PRAGMA show_tables").fetchall())
con.close()

[('movies',), ('subtitle_pairs',)]


In [6]:
total = collect_movies(range(2023, 2024))      # 2023 only
print("movies in DB:", total)


movies in DB: 672


In [7]:
con = connect()    
movies = con.execute("SELECT imdb_id, title FROM movies").fetchall()
len(movies)

672

In [8]:
movies = movies[:100]
print(movies)
print(len(movies))
for imdb_id, title in movies:
    try:
        movie_id = con.execute(
            "SELECT movie_id FROM movies WHERE imdb_id = ?", 
            [imdb_id]
        ).fetchone()[0]

        # 2) skip if we've already stored pairs for this movie
        already = con.execute(
            "SELECT COUNT(*) FROM subtitle_pairs WHERE movie_id = ?", 
            [movie_id]
        ).fetchone()[0]
        if already:
            print(f"→ Skipping {imdb_id} ({title}), already in DB")
            continue
        # ---- download or reuse cache ------------------------------------
        pt_path = download_srt(imdb_id, "pt-PT", title=title)
        br_path = download_srt(imdb_id, "pt-BR", title=title)

        # ---- load & quick clean -----------------------------------------
        subs_pt = load_subtitles(pt_path)
        subs_br = load_subtitles(br_path)
        eliminate_new_lines(subs_pt)
        eliminate_new_lines(subs_br)
        
        clean_sub_blocks(subs_pt)
        clean_sub_blocks(subs_br)
                
        # ---- sentence-level merge (optional but helps alignment) --------
        subs_pt = merge_subtitle_fragments(subs_pt, gap_threshold=pd.Timedelta(seconds=0.2))
        subs_br = merge_subtitle_fragments(subs_br, gap_threshold=pd.Timedelta(seconds=0.2))

        # ---- auto sync --------------------------------------------------
        shifted_pt, final_offset = auto_sync_subs(subs_pt, subs_br)

        # ---- fine alignment --------------------------------------------
        pairs = align_subtitles_optimal_hungarian(shifted_pt, subs_br)

        rows = []
        pair_no = 1
        for pt, br, score in pairs:
            if br is None:
                continue                   # skip unmatched lines

            rows.append({
                "imdb_id": imdb_id,
                "pair_no":  pair_no,
                #"pair_no":  pair_no,
                # store the raw millis             (handy for SQL filtering)
                "start_pt_ms": int(pt["start"].total_seconds() * 1000),
                "end_pt_ms":   int(pt["end"].total_seconds()   * 1000),
                "start_br_ms": int(br["start"].total_seconds() * 1000),
                "end_br_ms":   int(br["end"].total_seconds()   * 1000),
                # and the pretty SRT strings       (for export / debugging)
                # "start_pt_ms": td_to_srt(pt["start"]),
                # "end_pt_ms":   td_to_srt(pt["end"]),
                # "start_br_ms": td_to_srt(br["start"]),
                # "end_br_ms":   td_to_srt(br["end"]),
                "text_pt": pt["text"],
                "text_br": br["text"],
                "score":   float(score),
            })
            pair_no += 1

        pairs_df = pd.DataFrame(rows)
        #con.execute("DELETE FROM subtitle_pairs WHERE imdb_id = ?", [imdb_id])
        con.register("pairs_df", pairs_df)

        con.execute("""
            INSERT INTO subtitle_pairs (
            movie_id,
            pair_no,
            start_pt_ms,
            end_pt_ms,
            text_pt,
            start_br_ms,
            end_br_ms,
            text_br,
            score
            )
            SELECT
            m.movie_id,
            p.pair_no,
            p.start_pt_ms,
            p.end_pt_ms,
            p.text_pt,
            p.start_br_ms,
            p.end_br_ms,
            p.text_br,
            p.score
            FROM pairs_df AS p
            JOIN movies   AS m
            ON p.imdb_id = m.imdb_id
            """)
        
        print(f"✓ Movie {imdb_id} ({title}) in the database")

    except RuntimeError as e:        # e.g. no subtitles, 5xx after retries
        print("✗", imdb_id, title, "→", e)
    except Exception as e:
        print("⚠️  unexpected failure on", imdb_id, ":", e)

[('14230458', 'Poor Things'), ('15764854', 'Fireworks'), ('26449465', 'Peak Season'), ('19883634', 'The Old Oak'), ('27805677', 'Fairlane'), ('20465746', "L'ultima notte di Amore"), ('26671415', 'Seven Veils'), ('22687790', 'A Haunting in Venice'), ('5478456', 'We Are Zombies'), ('18072882', 'Northern Comfort'), ('11767724', 'Skunk'), ('13652142', 'Magazine Dreams'), ('7737800', 'Woman of the Hour'), ('22751422', 'The Accidental Getaway Driver'), ('22375054', 'Strange Darling'), ('21810682', 'The End We Start From'), ('14636170', 'Metalocalypse: Army of the Doomstar'), ('21267296', 'Hell of a Summer'), ('15428134', 'Dunki'), ('27047448', 'Til Death Do Us Part'), ('20358284', 'The Lesson'), ('24486184', 'Big Boys'), ('15038732', 'Chantilly Bridge'), ('10381102', 'Brave the Dark'), ('27722543', 'The Green Border'), ('22041854', 'Priscilla'), ('22023218', "You'll Never Find Me"), ('4589218', "Five Nights at Freddy's"), ('5112584', 'Lee'), ('22890246', 'How to Have Sex'), ('6495056', 'Migr

In [9]:
df_subtitle_pairs = con.execute("SELECT * FROM subtitle_pairs").df()

# df_subtitle_pairs['text_pt'] = df_subtitle_pairs['text_pt'].apply(strip_tags).str.strip()
# df_subtitle_pairs['text_br'] = df_subtitle_pairs['text_br'].apply(strip_tags).str.strip()

print(f"Subtitle pairs in DB: {len(df_subtitle_pairs)}")
low_scores_in_db = df_subtitle_pairs[df_subtitle_pairs.score < 0.4]
medium_scores_in_db = df_subtitle_pairs[(df_subtitle_pairs.score >= 0.4) & (df_subtitle_pairs.score < 0.7)]
high_scores_in_db = df_subtitle_pairs[df_subtitle_pairs.score >= 0.7]


Subtitle pairs in DB: 59023


In [10]:
pt_pt_vocab = Counter(
    token
    for sent in df_subtitle_pairs.text_pt
    for token in tokenize(sent)
)
pt_br_vocab = Counter(
    token
    for sent in df_subtitle_pairs.text_br
    for token in tokenize(sent)
)

shared       = set(pt_pt_vocab) & set(pt_br_vocab)
unique_pt_pt = set(pt_pt_vocab) - set(pt_br_vocab)
unique_pt_br = set(pt_br_vocab) - set(pt_pt_vocab)

print("Vocab sizes:", len(pt_pt_vocab), len(pt_br_vocab))
print("Shared types:",    len(shared))
print("Unique PT-PT types:", len(unique_pt_pt))
print("Unique PT-BR types:", len(unique_pt_br))

# Top-20 distinctive tokens on each side
pt_pt_distinct = sorted(unique_pt_pt, key=lambda t: pt_pt_vocab[t], reverse=True)[:20]
pt_br_distinct = sorted(unique_pt_br, key=lambda t: pt_br_vocab[t], reverse=True)[:20]
print("Top PT-PT distinct:", pt_pt_distinct)
print("Top PT-BR distinct:", pt_br_distinct)


Vocab sizes: 21572 20858
Shared types: 15148
Unique PT-PT types: 6424
Unique PT-BR types: 5710
Top PT-PT distinct: ['sabes', 'vais', 'podes', 'demasiado', 'fizeste', 'sítio', 'disseste', 'miúdos', 'rapariga', 'facto', 'foste', 'fixe', 'bebé', 'estavas', 'fazes', 'devias', 'precisas', 'vês', 'consegues', 'viste']
Top PT-BR distinct: ['garoto', 'alô', 'demônio', 'tô', 'garotas', 'bunda', 'somente', 'fengshen', 'garotos', 'prêmio', 'heia', 'besteira', 'belarus', 'ônibus', 'pedindo', 'policial', 'transar', 'allah', 'su', 'entendendo']


**//GOOD SCORES ANALYSIS**

In [11]:
good_row_count = len(high_scores_in_db)
print(f"Subtitle pairs in DB: {good_row_count}")

high_scores_in_db['text_pt'] = high_scores_in_db['text_pt'].apply(strip_tags_str).str.strip()
high_scores_in_db['text_br'] = high_scores_in_db['text_br'].apply(strip_tags_str).str.strip()

high_scores_in_db


Subtitle pairs in DB: 41230


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_scores_in_db['text_pt'] = high_scores_in_db['text_pt'].apply(strip_tags_str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_scores_in_db['text_br'] = high_scores_in_db['text_br'].apply(strip_tags_str).str.strip()


Unnamed: 0,pair_id,movie_id,pair_no,start_pt_ms,end_pt_ms,text_pt,start_br_ms,end_br_ms,text_br,score
0,1,1,1,980,5938,CONTÉM REPRESENTAÇÕES DE PRODUTOS DE TABACO,1000,6000,HÁ CENAS DE USO DE TABACO,0.712900
1,2,1,2,111772,118272,POBRES CRIATURAS,111800,118100,POBRES CRIATURAS,0.999860
2,3,1,3,286522,288480,Adeus!,286542,288500,Adeus.,0.880900
5,6,1,6,301147,305605,Quem gostaria de reconstruir os órgãos?,301167,305625,"Bem, quem gostaria de recolocar os órgãos?",0.852900
6,7,1,7,305772,310772,"E quem consegue distinguir humano de animal, i...",305792,310792,"E quem pode distinguir o humano do animal, se ...",0.873900
...,...,...,...,...,...,...,...,...,...,...
58418,58419,58,423,2170565,2171941,Eu acho que sim.,2115680,2117360,Eu acho que sim.,0.725575
58421,58422,58,426,2180199,2182076,"O que você bebeu, Sol?",2124600,2126800,"O que você bebeu, Sol?",0.722005
58426,58427,58,431,2199218,2200512,Você já provou sangue?,2142720,2144600,Você já provou sangue?,0.717510
58480,58481,58,485,2344322,2345282,Isso.,2294920,2296720,Isso.,0.752990


**//BAD SCORES ANALYSIS**

In [12]:
bad_row_count = len(low_scores_in_db)
print(f"Low-score pairs in DB: {bad_row_count}")

low_scores_in_db['text_pt'] = low_scores_in_db['text_pt'].apply(strip_tags_str).str.strip()
low_scores_in_db['text_br'] = low_scores_in_db['text_br'].apply(strip_tags_str).str.strip()

low_scores_in_db

Low-score pairs in DB: 1393


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_scores_in_db['text_pt'] = low_scores_in_db['text_pt'].apply(strip_tags_str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_scores_in_db['text_br'] = low_scores_in_db['text_br'].apply(strip_tags_str).str.strip()


Unnamed: 0,pair_id,movie_id,pair_no,start_pt_ms,end_pt_ms,text_pt,start_br_ms,end_br_ms,text_br,score
192,193,1,193,1352272,1353355,E para sempre.,1363667,1367458,ou prisões mal equipadas e poucos funcionários?,0.355025
401,402,1,402,2526522,2527688,Uma pândega!,2526542,2527708,Delícia.,0.369900
1186,1187,1,1187,6586480,6587355,Armou-me uma cilada!,6549125,6552542,Um diabo em um corpo sedutor e insaciável...,0.330225
1206,1207,1,1207,6667938,6670688,FALECIDO em Breve,6641500,6644292,Idiota com cara de boceta.,0.370810
1244,1245,1,1245,6846647,6847522,Cabra!,6835167,6836167,Bam.,0.382600
...,...,...,...,...,...,...,...,...,...,...
59016,59017,58,1021,5041798,5046150,"Ah, sim!",4988160,4990280,"Uau, mamãe!",0.325810
59017,59018,58,1022,5046150,5049993,Para mim,4997640,4999320,"Obrigado, Nuri.",0.302450
59018,59019,58,1023,5049993,5056084,O céu é lindo para mim!,5016800,5019480,O que vou desejar? Um par de botas...,0.393035
59019,59020,58,1024,5056084,5061863,"Ah, sim!",5025800,5027680,Não há desejo.,0.337580


**//MEDIUM SCORES ANALYSIS**

In [13]:
medium_row_count = len(medium_scores_in_db)
print(f"Medium-score pairs in DB: {medium_row_count}")

medium_scores_in_db['text_pt'] = medium_scores_in_db['text_pt'].apply(strip_tags_str).str.strip()
medium_scores_in_db['text_br'] = medium_scores_in_db['text_br'].apply(strip_tags_str).str.strip()

medium_scores_in_db

Medium-score pairs in DB: 16400


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medium_scores_in_db['text_pt'] = medium_scores_in_db['text_pt'].apply(strip_tags_str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medium_scores_in_db['text_br'] = medium_scores_in_db['text_br'].apply(strip_tags_str).str.strip()


Unnamed: 0,pair_id,movie_id,pair_no,start_pt_ms,end_pt_ms,text_pt,start_br_ms,end_br_ms,text_br,score
3,4,1,4,291647,297688,Uma pilha de órgãos sem a centelha de humanida...,291667,295458,Um monte de órgãos sem a centelha de si na men...,0.677900
4,5,1,5,297855,300688,Parece a banca de um talhante para um banquete.,297875,300708,É só um prato do açougueiro para o almoço de d...,0.551900
7,8,1,8,313938,314772,Vá lá!,313958,314792,Vamos!,0.530900
8,9,1,9,315480,318813,"Fizeram puzzles em crianças, certo?",315500,318833,"Montavam quebra-cabeças quando crianças, não?",0.649900
13,14,1,14,333897,336063,A tua proximidade não quer dizer nada.,333917,336083,Sua proximidade física conosco não o inclui nela.,0.698900
...,...,...,...,...,...,...,...,...,...,...
59010,59011,58,1015,5011140,5017395,Pelo meu corpo terreno,5009600,5012600,"Quero igual, mas com sereias.",0.593300
59011,59012,58,1016,5017395,5023016,Enquanto lá em cima no céu,5004640,5007480,"Sim, meu irmão e minhas irmãs estão zombando d...",0.453225
59012,59013,58,1017,5023016,5027916,Eu orarei,5007520,5009560,Super nerd.,0.432520
59021,59022,58,1026,5168771,5171274,A Nuri arrasou.,5164440,5168480,PARA MINHA FILHA,0.460345


In [14]:
sentence_difference = {}
different_sentence_count = 0

for row in medium_scores_in_db.itertuples():
    pid, shared = calculate_sentence_difference(row)
    sentence_difference[pid] = shared

print("Sentence more different:")
for pid, shared in sentence_difference.items():
    if len(shared) < 2:  # threshold for "more different"
        different_sentence_count += 1
        print(f"Sentence in PT, ", medium_scores_in_db.loc[medium_scores_in_db.pair_id == pid, 'text_pt'].values[0])
        print(f"Sentence in BR, ", medium_scores_in_db.loc[medium_scores_in_db.pair_id == pid, 'text_br'].values[0])

print(f"Total number of super different sentences: {different_sentence_count}")


Sentence more different:
Sentence in PT,  Vá lá!
Sentence in BR,  Vamos!
Sentence in PT,  Fizeram puzzles em crianças, certo?
Sentence in BR,  Montavam quebra-cabeças quando crianças, não?
Sentence in PT,  Não chateies! Vai comprar um fato.
Sentence in BR,  Vá se foder, velho amigo. Consiga um traje.
Sentence in PT,  Ia gostar bastante.
Sentence in BR,  Adoraria isso.
Sentence in PT,  A piada é minha.
Sentence in BR,  Foi uma piada que inventei.
Sentence in PT,  Parece obra do demónio.
Sentence in BR,  Servo do diabo!
Sentence in PT,  Se deixasse crescer a barba?
Sentence in BR,  Já pensou em ficar com barba, senhor?
Sentence in PT,  Este trabalho.
Sentence in BR,  Trabalha aqui.
Sentence in PT,  Sim. Vamos.
Sentence in BR,  Sim. Entre.
Sentence in PT,  E eu reparei-a.
Sentence in BR,  Notei.
Sentence in PT,  Tenho de anotar os progressos meticulosamente.
Sentence in BR,  Preciso anotar minuciosamente seu progresso.
Sentence in PT,  Pode ajudar-me a fazer isso?
Sentence in BR,  Fará is

**//ARE THE SENTENCES WELL CONNECTED?**

In [15]:
df_subtitle_pairs.head(-50)

Unnamed: 0,pair_id,movie_id,pair_no,start_pt_ms,end_pt_ms,text_pt,start_br_ms,end_br_ms,text_br,score
0,1,1,1,980,5938,CONTÉM REPRESENTAÇÕES DE PRODUTOS DE TABACO,1000,6000,HÁ CENAS DE USO DE TABACO,0.712900
1,2,1,2,111772,118272,POBRES CRIATURAS,111800,118100,POBRES CRIATURAS,0.999860
2,3,1,3,286522,288480,Adeus!,286542,288500,Adeus.,0.880900
3,4,1,4,291647,297688,Uma pilha de órgãos sem a centelha de humanida...,291667,295458,Um monte de órgãos sem a centelha de si na men...,0.677900
4,5,1,5,297855,300688,Parece a banca de um talhante para um banquete.,297875,300708,É só um prato do açougueiro para o almoço de d...,0.551900
...,...,...,...,...,...,...,...,...,...,...
58968,58969,58,973,4704139,4705225,"Venha, sente-se.",4741840,4743600,"Olha, Ester.",0.510495
58969,58970,58,974,4707935,4710145,"- Você está feliz? - Sim, está legal.",4710480,4712720,Você parece o Jimi Hendrix.,0.595275
58970,58971,58,975,4716068,4718028,Meu inseto favorito.,4716000,4718000,"Cara, isso é ótimo.",0.586660
58971,58972,58,976,4720448,4722533,Você fica bem com o cabelo curto.,4730440,4731920,"Calma, rapazes.",0.425040


In [16]:
import pandas as pd
import re

# 2) helper to decide merge on two fragments
def would_merge(a_text, b_text, a_end_ms, b_start_ms, gap_ms=120):
    a = strip_tags_str(a_text); b = strip_tags_str(b_text)
    gap = b_start_ms - a_end_ms
    if gap > gap_ms:
        return False
    tail = a.rstrip()[-1:]; head = b.lstrip()[:1]
    return (tail == "," or tail not in ".?!") and head.islower()

# 3) prepare DataFrame
df = df_subtitle_pairs.sort_values(['movie_id','pair_no']).reset_index(drop=True)

merge_count = 0
groups = []
current_group = [0]

# 4) build dual-language groups
for i in range(1, len(df)):
    prev, curr = df.iloc[i-1], df.iloc[i]
    same_movie = (prev.movie_id == curr.movie_id)
    pt_ok = would_merge(prev.text_pt, curr.text_pt, prev.end_pt_ms, curr.start_pt_ms)
    br_ok = would_merge(prev.text_br, curr.text_br, prev.end_br_ms, curr.start_br_ms)
    
    if same_movie and pt_ok and br_ok:
        current_group.append(i)
    else:
        groups.append(current_group)
        current_group = [i]
groups.append(current_group)

# 5) inspect & count
for grp in groups:
    if len(grp) > 1:
        merge_count += 1
        print(f"\n--- Merge group #{merge_count} (rows {grp}) ---")
        print("Original PT-PT sentences:")
        for idx in grp:
            row = df.iloc[idx]
            print(f"  • [{row.movie_id},{row.pair_no}]: {strip_tags_str(row.text_pt)}")
        print("Original PT-BR sentences:")
        for idx in grp:
            row = df.iloc[idx]
            print(f"  • [{row.movie_id},{row.pair_no}]: {strip_tags_str(row.text_br)}")

        # build the merged texts
        merged_pt = strip_tags_str(df.iloc[grp[0]].text_pt)
        merged_br = strip_tags_str(df.iloc[grp[0]].text_br)
        for idx in grp[1:]:
            txt_pt = strip_tags_str(df.iloc[idx].text_pt)
            txt_br = strip_tags_str(df.iloc[idx].text_br)
            # PT merge
            if merged_pt.rstrip().endswith(","):
                merged_pt += " " + txt_pt
            else:
                merged_pt = merged_pt.rstrip() + " " + txt_pt
            # BR merge (same logic)
            if merged_br.rstrip().endswith(","):
                merged_br += " " + txt_br
            else:
                merged_br = merged_br.rstrip() + " " + txt_br

        print("Merged PT-PT version:")
        print(f"  ▶ {merged_pt}")
        print("Merged PT-BR version:")
        print(f"  ▶ {merged_br}")

print(f"\nTotal dual-language merge events: {merge_count}")



--- Merge group #1 (rows [14138, 14139]) ---
Original PT-PT sentences:
  • [15,5]: o assassino em série norte-americano
  • [15,6]: mais prolífico e singular do século XXI
Original PT-BR sentences:
  • [15,5]: o serial killer americano mais prolífico e único
  • [15,6]: do século 21
Merged PT-PT version:
  ▶ o assassino em série norte-americano mais prolífico e singular do século XXI
Merged PT-BR version:
  ▶ o serial killer americano mais prolífico e único do século 21

--- Merge group #2 (rows [14140, 14141, 14142, 14143, 14144]) ---
Original PT-PT sentences:
  • [15,7]: orquestrou uma onda de homicídios multiestatal
  • [15,8]: que começou em Denver, Colorado,
  • [15,9]: continuou por Grand Lake,
  • [15,10]: expandiu-se pelo Wyoming e pelo centro de Idaho,
  • [15,11]: e terminou na floresta densa do condado de Hood River,
Original PT-BR sentences:
  • [15,7]: orquestrou um massacre multiestadual
  • [15,8]: que começou em Denver, Colorado,
  • [15,9]: continuou em Grand Lake,
  

In [20]:
import numpy as np

# random indices (excluding the last one so i+1 exists)
idxs = np.random.choice(df_subtitle_pairs.index[:-1], size=500, replace=False)

for i in idxs:
    a = df_subtitle_pairs.loc[i]
    b = df_subtitle_pairs.loc[i+1]
    print(f"Checking rows {i} → {i+1}")
    print(" A PT-PT:", strip_tags_str(a.text_pt).strip())
    print(" B PT-PT:", strip_tags_str(b.text_pt).strip())
    print(" would_merge:", would_merge(a.text_pt, b.text_pt, a.end_pt_ms, b.start_pt_ms))
    print("-" * 50)


Checking rows 52850 → 52851
 A PT-PT: Não te queria magoar.
 B PT-PT: Então... vou esperar um pouco e depois vou.
 would_merge: False
--------------------------------------------------
Checking rows 6758 → 6759
 A PT-PT: Bonsoir, madame, devo dizer que esperava uma pessoa mais...
 B PT-PT: Dramática? Ridícula?
 would_merge: False
--------------------------------------------------
Checking rows 46047 → 46048
 A PT-PT: - Sim? - É espetacular.
 B PT-PT: O que é que te posso dar?
 would_merge: False
--------------------------------------------------
Checking rows 18081 → 18082
 A PT-PT: Agora, vivemos no anexo.
 B PT-PT: Entre.
 would_merge: False
--------------------------------------------------
Checking rows 862 → 863
 A PT-PT: Estou sempre à espera que melhore.
 B PT-PT: Claro que estás.
 would_merge: False
--------------------------------------------------
Checking rows 50274 → 50275
 A PT-PT: É tão impossível como qualquer outro milagre."
 B PT-PT: A ÚNICA RAPARIGA DA ORQUESTRA
 woul

In [24]:
len(df_subtitle_pairs)-bad_row_count

57630