**//IMPORTS**

In [1]:
import sys, pathlib
import pandas as pd
import numpy as np
import pysrt, os
import random

from pathlib import Path
from datetime import timedelta

# Resolve the parent of the current working directory. This only yields the
# project root when the kernel is started from a direct subfolder of it.
project_root = pathlib.Path().resolve().parents[0] 
print(f"Project root: {project_root}")
# Put <project_root>/src first on sys.path so the project-local `transform`
# and `load` packages imported right after this line can be found.
sys.path.insert(0, str(project_root / "src"))
from transform.align_subtitles import (eliminate_new_lines, merge_subtitle_fragments, align_subtitles_optimal_hungarian, find_offset_between_subtitles, find_offset_between_subtitles_percentile, auto_sync_subs)
from load.load_subtitles import load_subtitles

Project root: /home/rofarate/Thesis




**//CONFIGS**

In [13]:
# Directories holding the two sets of subtitle files used in this notebook.
OUTPUT_DIR_BR = Path("../data/raw/test_br_subs")
OUTPUT_DIR_PT = Path("../data/raw/test_pt_subs")

# Create both directories (and any missing parents); no error if they exist.
for subtitle_dir in (OUTPUT_DIR_BR, OUTPUT_DIR_PT):
    subtitle_dir.mkdir(parents=True, exist_ok=True)


**//FUNCTIONS** Functions to be moved into the `transform` folder

**//FUNCTIONS** Preprocessing

**//MAIN CODE**

In [14]:
# Both files must describe the same movie. Keep the ID in one place so the
# two paths cannot drift apart and silently align two different movies.
MOVIE_ID = "14513804"

file_a = f"../data/raw/test_br_subs/{MOVIE_ID}.srt"  # copy from the test_br_subs folder
file_b = f"../data/raw/test_pt_subs/{MOVIE_ID}.srt"  # copy from the test_pt_subs folder

In [15]:
# Load both subtitle tracks (A first, then B).
subs_a, subs_b = (load_subtitles(srt_path) for srt_path in (file_a, file_b))

# The return value is not captured, so this presumably cleans the entries
# in place -- TODO confirm against transform.align_subtitles.
for track in (subs_a, subs_b):
    eliminate_new_lines(track)

# Maximum gap passed to the fragment merger for both tracks.
fragment_gap = pd.Timedelta(seconds=0.2)
merged_subs_a, merged_subs_b = (
    merge_subtitle_fragments(track, gap_threshold=fragment_gap)
    for track in (subs_a, subs_b)
)

In [16]:
# Align the two tracks as loaded (fragments not merged). The saved output of
# this notebook shows each element as (entry_a, entry_b or None, score).
aligned_pairs = align_subtitles_optimal_hungarian(subs_a, subs_b)
# Same alignment on the fragment-merged tracks, kept separately so the two
# results can be compared in the cells that follow.
aligned_pairs_preprocessed = align_subtitles_optimal_hungarian(merged_subs_a, merged_subs_b)

In [23]:
# Show only the first few aligned pairs; displaying the whole list floods the
# notebook output and bloats the saved file.
aligned_pairs[:5]

[({'start': Timedelta('0 days 00:00:52.580000'),
   'end': Timedelta('0 days 00:00:54.560000'),
   'text': 'Senhor, um aviso de um minuto.'},
  {'start': Timedelta('0 days 00:00:31.990000'),
   'end': Timedelta('0 days 00:00:33.908000'),
   'text': 'Sr. Presidente, falta um minuto.'},
  np.float64(0.58205)),
 ({'start': Timedelta('0 days 00:00:54.580000'),
   'end': Timedelta('0 days 00:00:56.400000'),
   'text': 'Parabéns novamente.'},
  {'start': Timedelta('0 days 00:00:33.909000'),
   'end': Timedelta('0 days 00:00:35.284000'),
   'text': 'Parabéns, mais uma vez.'},
  np.float64(0.595645)),
 ({'start': Timedelta('0 days 00:00:56.430000'),
   'end': Timedelta('0 days 00:00:58.560000'),
   'text': 'Ouviu algo da Betty, da minha filha?'},
  {'start': Timedelta('0 days 00:00:35.285000'),
   'end': Timedelta('0 days 00:00:37.912000'),
   'text': 'Soube alguma coisa da minha filha Betty?'},
  np.float64(0.621275)),
 ({'start': Timedelta('0 days 00:00:58.580000'),
   'end': Timedelta('0 da

In [17]:
# NOTE(review): disabled experiment. `align_subtitles_greedy` is not among the
# names imported in the first cell and `time_window` is not defined in the
# visible cells, so both are needed before this line can be re-enabled.
# aligned_pairs_greedy = align_subtitles_greedy(subs_a, subs_b, time_window=time_window)

In [18]:
# List the pairs from the raw alignment whose score is exactly 0; in the saved
# output these are the entries whose second element is None (no match found).
# A named loop variable is used instead of `_`, because assigning to `_`
# overrides IPython's "last output" variable for the rest of the session.
unmatched_pairs = [pair for pair in aligned_pairs if pair[2] == 0]
for pair in unmatched_pairs:
    print(pair)

count = len(unmatched_pairs)
print(f"Total unmatched pairs: {count}")
print(f"Total pairs: {len(aligned_pairs)}")

({'start': Timedelta('0 days 00:01:07.420000'), 'end': Timedelta('0 days 00:01:09.430000'), 'text': 'estamos a instantes de começar.'}, None, 0)
({'start': Timedelta('0 days 00:01:09.450000'), 'end': Timedelta('0 days 00:01:11.560000'), 'text': 'Vocês estão prontos?'}, None, 0)
({'start': Timedelta('0 days 00:01:21.350000'), 'end': Timedelta('0 days 00:01:22.530000'), 'text': 'Senhoras e senhores,'}, None, 0)
({'start': Timedelta('0 days 00:02:27.390000'), 'end': Timedelta('0 days 00:02:28.560000'), 'text': 'abraçou o tema da união'}, None, 0)
({'start': Timedelta('0 days 00:02:38.470000'), 'end': Timedelta('0 days 00:02:40.460000'), 'text': 'alguns questionam se eles podem superar'}, None, 0)
({'start': Timedelta('0 days 00:02:44.550000'), 'end': Timedelta('0 days 00:02:45.530000'), 'text': 'Aqui está, Cap.'}, None, 0)
({'start': Timedelta('0 days 00:02:45.550000'), 'end': Timedelta('0 days 00:02:47.540000'), 'text': 'Durante seu tempo como um...'}, None, 0)
({'start': Timedelta('0 da

In [19]:
# List the pairs from the fragment-merged alignment whose score is exactly 0
# (unmatched entries). A named loop variable is used instead of `_`, which
# would override IPython's "last output" variable.
unmatched_pairs_preprocessed = [
    pair for pair in aligned_pairs_preprocessed if pair[2] == 0
]
for pair in unmatched_pairs_preprocessed:
    print(pair)

count = len(unmatched_pairs_preprocessed)
print(f"Total unmatched pairs: {count}")
# The total must come from the same list that was scanned above; the earlier
# version reported len(aligned_pairs), i.e. the size of the raw alignment.
print(f"Total pairs: {len(aligned_pairs_preprocessed)}")

({'start': Timedelta('0 days 00:01:17.560000'), 'end': Timedelta('0 days 00:01:20.580000'), 'text': 'Prontos para Ross! Prontos para Ross!'}, None, 0)
({'start': Timedelta('0 days 00:05:56.390000'), 'end': Timedelta('0 days 00:05:57.520000'), 'text': 'Ah.'}, None, 0)
({'start': Timedelta('0 days 00:06:37.570000'), 'end': Timedelta('0 days 00:06:38.550000'), 'text': 'Silêncio!'}, None, 0)
({'start': Timedelta('0 days 00:06:42.530000'), 'end': Timedelta('0 days 00:06:45.360000'), 'text': 'Vámonos, vámonos.'}, None, 0)
({'start': Timedelta('0 days 00:08:42.480000'), 'end': Timedelta('0 days 00:08:44.400000'), 'text': 'Isso não é um problema.'}, None, 0)
({'start': Timedelta('0 days 00:09:24.450000'), 'end': Timedelta('0 days 00:09:25.480000'), 'text': 'Nem tanto.'}, None, 0)
({'start': Timedelta('0 days 00:10:01.540000'), 'end': Timedelta('0 days 00:10:03.520000'), 'text': 'Uhul!'}, None, 0)
({'start': Timedelta('0 days 00:11:40.390000'), 'end': Timedelta('0 days 00:11:41.350000'), 'text'

In [20]:
# Offset statistics over the raw alignment. `coverage` is returned by the
# helper but is not reported here.
avg_offset, max_offset, coverage, mean_score = find_offset_between_subtitles(aligned_pairs)
print(
    f"Average Offset: {avg_offset:.3f} s",
    f"Maximum Offset: {max_offset:.3f} s",
    f"Mean Score: {mean_score:.3f}",
    sep="\n",
)

# Same statistics from the percentile variant; the names above are reused, so
# after this cell they hold the percentile-based values.
avg_offset, max_offset, coverage, mean_score = find_offset_between_subtitles_percentile(aligned_pairs)
print(
    f"Average Offset With Most Similarity: {avg_offset:.3f} s",
    f"Maximum Offset With Most Similarity: {max_offset:.3f} s",
    f"Mean Score With Most Similarity: {mean_score:.3f}",
    sep="\n",
)

Average Offset: 17.706 s
Maximum Offset: 27.736 s
Mean Score: 0.757
Average Offset With Most Similarity: 17.987 s
Maximum Offset With Most Similarity: 21.256 s
Mean Score With Most Similarity: 0.910


In [21]:
# NOTE(review): disabled scratch check for a `shift_srt` helper. That name is
# not among the imports in the first cell, so it must be imported (or defined)
# before this block can be re-enabled.
# test = [
#     {"start": pd.to_timedelta("00:00:10"), "end": pd.to_timedelta("00:00:12"), "text": "Hello"},
#     {"start": pd.to_timedelta("00:00:13"), "end": pd.to_timedelta("00:00:15"), "text": "World"},
# ]

# shifted = shift_srt(test, 2.0)  # shift by 2 seconds

# for original, new in zip(test, shifted):
#     print(f"Original: {original['start']} → Shifted: {new['start']}")


**//OFFSET ALGORITHM**

In [22]:
# Run the automatic synchronisation routine on the fragment-merged tracks.
# NOTE(review): in the last saved run this cell was interrupted
# (KeyboardInterrupt) while auto_sync_subs was inside
# align_subtitles_optimal_hungarian, so it appears to be long-running.
# The return value is not assigned, so it is only displayed, never reused.
auto_sync_subs(merged_subs_a, merged_subs_b)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/rofarate/Thesis/thesis/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3670, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_3785/1837873584.py", line 1, in <module>
    auto_sync_subs(merged_subs_a, merged_subs_b)
  File "/home/rofarate/Thesis/src/transform/align_subtitles.py", line 276, in auto_sync_subs
    aligned = aligner(subs_a, subs_b)
              ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/rofarate/Thesis/src/transform/align_subtitles.py", line None, in align_subtitles_optimal_hungarian
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/rofarate/Thesis/thesis/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 2176, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/rofarate/Thesis/thesis/l

**//METRIC AND EXAMPLES**

In [None]:
# Number of subtitle entries in each .srt file of the PT directory.
lines_per_movie = [
    len(load_subtitles(srt_path)) for srt_path in OUTPUT_DIR_PT.glob("*.srt")
]

# One file per movie, so the list length is the movie count.
number_of_movies = len(lines_per_movie)
total_lines = sum(lines_per_movie)

print(f"Total number of movies: {number_of_movies}")
print(f"Total number of subtitle lines: {total_lines}")

Total number of movies: 22
Total number of subtitle lines: 28441


In [None]:
# Pick one PT subtitle file at random and derive the BR file with the same name.
# glob() yields files in filesystem order, so sort the candidates first: the
# pick then depends only on the random state (call random.seed(<int>) earlier
# in the notebook if a reproducible example is wanted).
pt_subtitle_files = sorted(OUTPUT_DIR_PT.glob("*.srt"))
if not pt_subtitle_files:
    # random.choice on an empty list raises an opaque IndexError; fail clearly.
    raise FileNotFoundError(f"No .srt files found in {OUTPUT_DIR_PT}")

subs_1 = random.choice(pt_subtitle_files)
print(f"Randomly selected subtitle file: {subs_1}")

corresponding_subs = OUTPUT_DIR_BR / subs_1.name
print(f"Corresponding subtitle file: {corresponding_subs}")
if not corresponding_subs.exists():
    # Surface a missing counterpart here rather than when it is loaded later.
    print(f"Warning: corresponding subtitle file does not exist: {corresponding_subs}")

Randomly selected subtitle file: ../data/raw/test_pt_subs/24082438.srt
Corresponding subtitle file: ../data/raw/test_br_subs/24082438.srt
