**//IMPORTS**

In [1]:
import sys, pathlib
import pandas as pd
import numpy as np
import pysrt

from typing import List, Dict, Sequence, Tuple
from datetime import timedelta

project_root = pathlib.Path().resolve().parents[0] 
print(f"Project root: {project_root}")
sys.path.insert(0, str(project_root / "src"))
from transform.align_subtitles import time_diff_score, align_subtitles_greedy, align_subtitles_optimal_hungarian, shift_srt

Project root: /home/rofarate/Thesis




**//FUNCTION** Functions to be put in the transform folder

In [2]:
def find_offset_between_subtitles(aligned_pairs, min_score=0.6):
    time_differences = []
    scores = []
    matched = 0
    
    for a, b, score in aligned_pairs:
        if b is not None and score > min_score:
            matched += 1
            time_differences.append((a['start'] - b['start']).total_seconds())
            scores.append(score)

    if len(time_differences) == 0:
        return 0.0, 0.0
    
    total = len(aligned_pairs)
    coverage = matched / total if total > 0 else 0.0
    avg_offset = np.mean(time_differences) if time_differences else 0.0
    max_offset = np.max(np.abs(time_differences)) if time_differences else 0.0
    mean_score = np.mean(scores) if scores else 0.0

    return avg_offset, max_offset, coverage, mean_score

def find_offset_between_subtitles_percentile(
    aligned_pairs: Sequence[Tuple[Dict[str, object], Dict[str, object], float]],
    top_percent: float = 10.0,          # keep the top-10 % most similar
    min_pairs: int = 3                  # need at least this many to compute stats
) -> Tuple[float, float, float, float]:

    diffs, scores = [], []
    for a, b, s in aligned_pairs:
        if b is not None:
            diffs.append((a["start"] - b["start"]).total_seconds())
            scores.append(float(s))

    total = len(scores)
    if total == 0:
        return 0.0, 0.0, 0.0, 0.0

    # percentile threshold
    q = 100.0 - top_percent            # e.g. 90-th percentile for top 10 %
    score_cutoff = np.percentile(scores, q)

    # keep only pairs at / above the cutoff
    kept_diffs  = [d for d, s in zip(diffs, scores) if s >= score_cutoff]
    kept_scores = [s for s       in scores            if s >= score_cutoff]
    kept_diffs = np.array(kept_diffs)

    if len(kept_diffs) < min_pairs:    
        print(f"Warning: not enough pairs ({len(kept_diffs)})")
        return 0.0, 0.0, len(kept_diffs) / total, (np.mean(kept_scores) if kept_scores else 0.0)

    avg_offset = float(np.median(kept_diffs))        # median, not mean
    max_offset = float(np.max(np.abs(kept_diffs)))
    coverage   = len(kept_diffs) / total
    mean_score = float(np.mean(kept_scores))

    return avg_offset, max_offset, coverage, mean_score

def shift_srt(subs, offset_seconds):
    delta = pd.to_timedelta(offset_seconds, unit='s')
    shifted = []
    for record in subs:
        shifted.append(
            {
                "start": record["start"] - delta,   # subtract the signed Δ
                "end":   record["end"]   - delta,
                "text":  record["text"],
            }
        )
    return shifted

**//FUNCTIONS** Preprocessing

In [3]:
def eliminate_new_lines(subtitles):
    for sub in subtitles:
        if '\n' in sub['text']:
            sub['text'] = sub['text'].replace('\n', ' ')

def merge_subtitle_fragments(
    blocks: Sequence[Dict[str, object]],
    gap_threshold: timedelta | pd.Timedelta = pd.Timedelta(milliseconds=120),
    join_punct: str = " ",                     # what to insert between merged parts
    terminal_stop: str = ".?!",                # punctuation that *prevents* merging
) -> List[Dict[str, object]]:

    gap_threshold = pd.Timedelta(gap_threshold)

    merged: List[Dict[str, object]] = []
    i = 0
    n = len(blocks)

    while i < n:
        current = blocks[i].copy()           # shallow copy is enough
        i += 1                           # provisional advance

        # attempt to merge with followers as long as the rule allows
        while i < n:
            nxt = blocks[i]
            gap = nxt["start"] - current["end"]

            # last char of current text, first char of next
            tail = current["text"].rstrip()[-1:]          # could be '', ',' …
            head = nxt["text"].lstrip()[:1]

            mergeable = (
                gap <= gap_threshold and
                (tail == "," or tail not in terminal_stop) and
                head.islower()
            )
            if not mergeable:
                break

            # --- perform merge --------------------------------------------
            if tail == ",":
                current["text"] = current["text"].rstrip(",") + " " + nxt["text"].lstrip()
            else:
                current["text"] = current["text"].rstrip() + join_punct + nxt["text"].lstrip()

            current["end"] = nxt["end"]      # extend duration
            i += 1                       # consume the merged‐in block

        merged.append(current)

    return merged

def load_subtitles(path):
    subs = pysrt.open(path)
    return [
        {
            "start": pd.to_timedelta(s.start.ordinal, unit="ms"),  # milliseconds → Timedelta
            "end":   pd.to_timedelta(s.end.ordinal,   unit="ms"),
            "text":  s.text.replace("\n", " ").strip(),
        }
        for s in subs
    ]

**//MAIN CODE**

In [4]:
file_a = '../data/raw/test_br_subs/tt14513804.srt'
file_b = '../data/raw/test_pt_subs/tt14513804.srt'  

In [5]:
subs_a = load_subtitles(file_a)
subs_b = load_subtitles(file_b)

eliminate_new_lines(subs_a)
eliminate_new_lines(subs_b)

merged_subs_a = merge_subtitle_fragments(subs_a, gap_threshold=pd.Timedelta(seconds=0.2))
merged_subs_b = merge_subtitle_fragments(subs_b, gap_threshold=pd.Timedelta(seconds=0.2))

In [6]:
aligned_pairs = align_subtitles_optimal_hungarian(subs_a, subs_b)
aligned_pairs_preprocessed = align_subtitles_optimal_hungarian(merged_subs_a, merged_subs_b)

In [7]:
# aligned_pairs_greedy = align_subtitles_greedy(subs_a, subs_b, time_window=time_window)

In [8]:
count = 0
for _ in aligned_pairs:
    a, b, score = _
    if score == 0:
        count += 1
        print(_)
print(f"Total unmatched pairs: {count}")
print(f"Total pairs: {len(aligned_pairs)}")

({'start': Timedelta('0 days 00:01:07.420000'), 'end': Timedelta('0 days 00:01:09.430000'), 'text': 'estamos a instantes de começar.'}, None, 0)
({'start': Timedelta('0 days 00:01:09.450000'), 'end': Timedelta('0 days 00:01:11.560000'), 'text': 'Vocês estão prontos?'}, None, 0)
({'start': Timedelta('0 days 00:01:21.350000'), 'end': Timedelta('0 days 00:01:22.530000'), 'text': 'Senhoras e senhores,'}, None, 0)
({'start': Timedelta('0 days 00:02:27.390000'), 'end': Timedelta('0 days 00:02:28.560000'), 'text': 'abraçou o tema da união'}, None, 0)
({'start': Timedelta('0 days 00:02:38.470000'), 'end': Timedelta('0 days 00:02:40.460000'), 'text': 'alguns questionam se eles podem superar'}, None, 0)
({'start': Timedelta('0 days 00:02:44.550000'), 'end': Timedelta('0 days 00:02:45.530000'), 'text': 'Aqui está, Cap.'}, None, 0)
({'start': Timedelta('0 days 00:02:45.550000'), 'end': Timedelta('0 days 00:02:47.540000'), 'text': 'Durante seu tempo como um...'}, None, 0)
({'start': Timedelta('0 da

In [9]:
count = 0
for _ in aligned_pairs_preprocessed:
    a, b, score = _
    if score == 0:
        count += 1
        print(_)
print(f"Total unmatched pairs: {count}")
print(f"Total pairs: {len(aligned_pairs)}")

({'start': Timedelta('0 days 00:01:17.560000'), 'end': Timedelta('0 days 00:01:20.580000'), 'text': 'Prontos para Ross! Prontos para Ross!'}, None, 0)
({'start': Timedelta('0 days 00:05:56.390000'), 'end': Timedelta('0 days 00:05:57.520000'), 'text': 'Ah.'}, None, 0)
({'start': Timedelta('0 days 00:06:37.570000'), 'end': Timedelta('0 days 00:06:38.550000'), 'text': 'Silêncio!'}, None, 0)
({'start': Timedelta('0 days 00:06:42.530000'), 'end': Timedelta('0 days 00:06:45.360000'), 'text': 'Vámonos, vámonos.'}, None, 0)
({'start': Timedelta('0 days 00:08:42.480000'), 'end': Timedelta('0 days 00:08:44.400000'), 'text': 'Isso não é um problema.'}, None, 0)
({'start': Timedelta('0 days 00:09:24.450000'), 'end': Timedelta('0 days 00:09:25.480000'), 'text': 'Nem tanto.'}, None, 0)
({'start': Timedelta('0 days 00:10:01.540000'), 'end': Timedelta('0 days 00:10:03.520000'), 'text': 'Uhul!'}, None, 0)
({'start': Timedelta('0 days 00:11:40.390000'), 'end': Timedelta('0 days 00:11:41.350000'), 'text'

In [10]:
avg_offset, max_offset, coverage, mean_score = find_offset_between_subtitles(aligned_pairs)
print(f"Average Offset: {avg_offset:.3f} s")
print(f"Maximum Offset: {max_offset:.3f} s")
print(f"Mean Score: {mean_score:.3f}")

avg_offset, max_offset, coverage, mean_score = find_offset_between_subtitles_percentile(aligned_pairs)
print(f"Average Offset With Most Similarity: {avg_offset:.3f} s")
print(f"Maximum Offset With Most Similarity: {max_offset:.3f} s")
print(f"Mean Score With Most Similarity: {mean_score:.3f}")

Average Offset: 17.706 s
Maximum Offset: 27.736 s
Mean Score: 0.757
Average Offset With Most Similarity: 17.987 s
Maximum Offset With Most Similarity: 21.256 s
Mean Score With Most Similarity: 0.910


In [11]:
# test = [
#     {"start": pd.to_timedelta("00:00:10"), "end": pd.to_timedelta("00:00:12"), "text": "Hello"},
#     {"start": pd.to_timedelta("00:00:13"), "end": pd.to_timedelta("00:00:15"), "text": "World"},
# ]

# shifted = shift_srt(test, 2.0)  # shift by 2 seconds

# for original, new in zip(test, shifted):
#     print(f"Original: {original['start']} → Shifted: {new['start']}")


In [12]:
EPS_COVERAGE   = 0.0025   # 0.25% coverage improvement threshold
EPS_MEAN_SCORE = 0.01     # 1% mean score improvement threshold
EPS_OFFSET     = 0.05     # 50 ms acceptable offset

plateau_rounds = 0
MAX_PLATEAU_ROUNDS = 2

prev_coverage = 0.0
prev_mean_score = 0.0
prev_avg_offset = float('inf')
prev_max_offset = float('inf')

while True:
    aligned_pairs = align_subtitles_optimal_hungarian(merged_subs_a, merged_subs_b)
    avg_offset, max_offset, coverage, mean_score = find_offset_between_subtitles_percentile(aligned_pairs)

    print(f"Offset: {avg_offset:.3f}s (max: {max_offset:.3f}s)  Coverage: {coverage:.3f}  Mean Score: {mean_score:.3f}")

    # Break if offsets are acceptable
    if abs(avg_offset) < EPS_OFFSET and max_offset < 2*EPS_OFFSET:
        break

    # Apply shift based on average offset (could also try using max_offset)
    merged_subs_a = shift_srt(merged_subs_a, avg_offset)

    # Check for improvement
    coverage_improvement = coverage - prev_coverage
    score_improvement    = mean_score - prev_mean_score
    avg_offset_improvement = prev_avg_offset - abs(avg_offset)  
    max_offset_improvement = prev_max_offset - max_offset  

    if (coverage_improvement < EPS_COVERAGE and
        score_improvement < EPS_MEAN_SCORE and
        avg_offset_improvement < EPS_OFFSET and
        max_offset_improvement < EPS_OFFSET):
        plateau_rounds += 1
        print(f"Plateau detected ({plateau_rounds})")
        if plateau_rounds >= MAX_PLATEAU_ROUNDS:
            print("No further improvement. Stopping.")
            break
    else:
        plateau_rounds = 0  # reset if improvement detected

    prev_coverage = coverage
    prev_mean_score = mean_score


Offset: 17.975s (max: 21.256s)  Coverage: 0.100  Mean Score: 0.910
Offset: 0.075s (max: 4.303s)  Coverage: 0.100  Mean Score: 0.992
Offset: -0.000s (max: 4.378s)  Coverage: 0.100  Mean Score: 0.992
Offset: -0.000s (max: 4.378s)  Coverage: 0.100  Mean Score: 0.992
Offset: -0.000s (max: 4.378s)  Coverage: 0.100  Mean Score: 0.992
Offset: -0.000s (max: 4.378s)  Coverage: 0.100  Mean Score: 0.992
Offset: -0.000s (max: 4.378s)  Coverage: 0.100  Mean Score: 0.992
Offset: -0.000s (max: 4.378s)  Coverage: 0.100  Mean Score: 0.992
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/rofarate/Thesis/thesis/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3670, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_14601/950733643.py", line 14, in <module>
    aligned_pairs = align_subtitles_optimal_hungarian(merged_subs_a, merged_subs_b)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/rofarate/Thesis/src/transform/align_subtitles.py", line None, in align_subtitles_optimal_hungarian
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/rofarate/Thesis/thesis/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 2176, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/rofarate/Thesis/thesis/lib/python3.11/site-packages/IPython/core/ultratb.