In [26]:
def approx_match(time_a, time_b, slippage=0.040000000000000036):
    """Tell whether two time values lie within ``slippage`` of each other.

    Args:
        time_a: first time value.
        time_b: second time value.
        slippage: largest absolute difference still counted as a match
            (inclusive).

    Returns:
        True when ``|time_a - time_b|`` does not exceed ``slippage``.
    """
    gap = abs(time_a - time_b)
    return gap <= slippage

In [28]:
# Identifier of the recording to process; it is interpolated into the
# "<SAMPLE>_480p.json" file names opened further down.
SAMPLE = "2442204240010034621"

In [34]:
from pathlib import Path
# Directories holding the two JSON transcripts that are opened below:
# TEXT for the word-level file, PHONES for the phone-level file.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
TEXT = Path("/Users/joregan/Playing/kbw2v")
PHONES = Path("/Users/joregan/Playing/rd_phonetic")

In [35]:
# Open both transcripts for reading. The encoding is given explicitly so the
# non-ASCII text in these files decodes the same way regardless of the
# platform's locale default. The handles are consumed by the JSON loading
# cell that follows.
text = open(TEXT / f"{SAMPLE}_480p.json", encoding="utf-8")
phones = open(PHONES / f"{SAMPLE}_480p.json", encoding="utf-8")

In [36]:
import json
# Parse both transcripts. The file objects are used as context managers so
# that both handles are closed as soon as parsing finishes (or fails),
# instead of staying open for the lifetime of the kernel.
with text, phones:
    text_json = json.load(text)
    phone_json = json.load(phones)

In [121]:
class Chunk:
    """A labelled time span built from one entry of a JSON ``chunks`` list.

    Attributes:
        text: the entry's ``'text'`` value.
        start: first element of the entry's ``'timestamp'`` value.
        end: second element of the entry's ``'timestamp'`` value.
    """

    def __init__(self, chunk):
        """Read ``text``, ``start`` and ``end`` from the mapping ``chunk``."""
        self.text = chunk['text']
        self.start = chunk['timestamp'][0]
        self.end = chunk['timestamp'][1]

    def __repr__(self) -> str:
        return f"[{self.text} ({self.start}, {self.end})]"


class SimpleMerge(Chunk):
    """One-to-one pairing of a text chunk (``left``) with a phone chunk (``right``).

    The merged span keeps the text chunk's label and timing; the phone chunk
    contributes its label as ``phone``. ``diff_start`` / ``diff_end`` record
    how far the two chunks' boundaries differ (left minus right).
    """

    def __init__(self, left: Chunk, right: Chunk):
        # Chunk.__init__ is deliberately not called: it expects a raw dict,
        # whereas this class is built from two existing Chunk objects.
        self.text = left.text
        self.phone = right.text
        self.start = left.start
        self.end = left.end
        self.diff_start = left.start - right.start
        self.diff_end = left.end - right.end

    def exact_length(self):
        """Return True when both boundaries of the two chunks coincide exactly."""
        return self.diff_start == 0 and self.diff_end == 0

    def __repr__(self) -> str:
        return f"[{self.text} :: {self.phone} ({self.start}, {self.end})]"


class ComplexMerge(Chunk):
    """Pairing of zero or more text chunks with zero or more phone chunks.

    Args:
        left: a chunk, a list of chunks, or ``None`` (text side).
        right: a chunk, a list of chunks, or ``None`` (phone side).

    Attributes:
        left_chunks / right_chunks: the two sides, always stored as lists.
        start / end: earliest start and latest end over the first/last chunk
            of each non-empty side; ``None`` when both sides are empty.
        text / phone: space-joined labels of the left / right chunks.
    """

    def __init__(self, left, right):
        self.left_chunks = self._as_chunk_list(left)
        self.right_chunks = self._as_chunk_list(right)

        self.start = self.get_start()
        self.end = self.get_end()

        self.text = " ".join(x.text for x in self.left_chunks)
        self.phone = " ".join(x.text for x in self.right_chunks)

    @staticmethod
    def _as_chunk_list(chunks):
        """Normalise ``None`` / a single chunk / a list of chunks to a list."""
        if chunks is None:
            # Previously this case executed a comparison (``== []``) instead
            # of an assignment, leaving the attribute undefined.
            return []
        if isinstance(chunks, list):
            return chunks
        return [chunks]

    def get_start(self):
        """Return the earlier of the two sides' first start, or None if both are empty."""
        # Right side is listed first so that, on a tie, the right-hand value
        # is returned, matching the original comparison's behaviour.
        firsts = [side[0].start
                  for side in (self.right_chunks, self.left_chunks) if side]
        return min(firsts) if firsts else None

    def get_end(self):
        """Return the later of the two sides' last end, or None if both are empty."""
        # Same tie-breaking order as get_start.
        lasts = [side[-1].end
                 for side in (self.right_chunks, self.left_chunks) if side]
        return max(lasts) if lasts else None

    def __repr__(self) -> str:
        return f"[{self.text} :: {self.phone} ({self.start}, {self.end})]"

        

In [82]:
# Wrap every raw entry of each transcript's 'chunks' list in a Chunk object,
# preserving the original order.
text_chunks = [Chunk(entry) for entry in text_json['chunks']]
phone_chunks = [Chunk(entry) for entry in phone_json['chunks']]

In [127]:
# Walk the text chunks and phone chunks in parallel and try to pair them up
# by comparing their start/end times with approx_match.
# tci / pci are the current positions in text_chunks / phone_chunks.
tci = 0
pci = 0

# Collected SimpleMerge / ComplexMerge results.
merged = []

def idx_safe(tci, pci):
    """Return True while both indices are valid positions in their lists."""
    return tci < len(text_chunks) and pci < len(phone_chunks)

while idx_safe(tci, pci):
    # Do the current text chunk and phone chunk begin / end at about the same time?
    am_start = approx_match(text_chunks[tci].start, phone_chunks[pci].start)
    am_end = approx_match(text_chunks[tci].end, phone_chunks[pci].end)
    
    if am_start and am_end:
        # Both boundaries line up: a one-to-one pairing.
        merged.append(SimpleMerge(text_chunks[tci], phone_chunks[pci]))
    elif am_start:
        # Only the starts line up: apparently intended to gather several
        # phone chunks under one text chunk.
        save_pci = pci
        peek = (pci + 1) < len(phone_chunks)
        lt_next = peek and (phone_chunks[pci + 1].start < text_chunks[tci].end)
        # NOTE(review): on entry am_start is True and neither index has moved,
        # so `not approx_match(...start...)` is False and this loop body never
        # runs; the slice below is then empty (save_pci == pci). lt_next is
        # also computed once and never refreshed inside the loop.
        # NOTE(review): if the body did run, the print after `pci += 1` could
        # index past the end of phone_chunks.
        while idx_safe(tci, pci) and not approx_match(text_chunks[tci].start, phone_chunks[pci].start) and lt_next:
            pci += 1
            print(text_chunks[tci], phone_chunks[pci])
        merged.append(ComplexMerge(text_chunks[tci], phone_chunks[save_pci:pci]))
    elif am_end:
        # Only the ends line up: apparently intended to gather several text
        # chunks under one phone chunk.
        save_tci = tci
        peek = (tci + 1) < len(text_chunks)
        lt_next = peek and (text_chunks[tci + 1].start < phone_chunks[pci].end)
        # NOTE(review): lt_next is not updated inside the loop, so once it is
        # True the loop only stops when the starts match or tci runs off the
        # end; in the latter case the print after `tci += 1` indexes
        # text_chunks out of range.
        # NOTE(review): the slice save_tci:tci excludes the chunk at the final
        # tci, which the unconditional `tci += 1` below then skips — confirm
        # this is intended.
        while idx_safe(tci, pci) and not approx_match(text_chunks[tci].start, phone_chunks[pci].start) and lt_next:
            tci += 1
            print(text_chunks[tci], phone_chunks[pci])
        merged.append(ComplexMerge(text_chunks[save_tci:tci], phone_chunks[pci]))
    else:
        # Neither boundary lines up: nothing is merged, the pair is only printed.
        print(text_chunks[tci], phone_chunks[pci])
    # Both indices advance by one on every iteration, whichever branch ran.
    tci += 1
    pci += 1
# Final positions reached in each list.
print(tci, pci)

[DEBATT (0.66, 1.08)] [seks (0.12, 0.46)]
[MED (1.18, 1.24)] [debat (0.66, 1.12)]
[ANLEDNING (1.38, 1.92)] [meː (1.16, 1.22)]
[AV (2.04, 2.18)] [anleːdnɪŋ (1.36, 1.92)]
[INTEPELATIONSSVAR (2.38, 3.34)] [ɑːv (2.04, 2.22)]
[OCH (3.84, 3.92)] [ɪntepelaɧuːnsvɑːr (2.38, 3.34)]
[KAMMAREN (3.96, 4.32)] [oː (3.86, 3.88)]
[ÖVERGÅR (4.44, 4.78)] [kamarən (3.96, 4.34)]
[NU (4.86, 4.9)] [øːvərɡoːr (4.42, 4.8)]
[TILL (4.98, 5.12)] [nʉː (4.86, 4.9)]
[ATT (5.16, 5.26)] [tɪl (5.04, 5.12)]
[DEBATTERA (5.32, 5.78)] [at (5.14, 5.24)]
[STATSRÅDENS (5.88, 6.5)] [debateːra (5.32, 5.8)]
[SVAR (6.56, 6.84)] [stasroːdən (5.86, 6.44)]
[PÅ (6.94, 7.0)] [svɑːr (6.54, 6.86)]
[MÖTENAS (7.1, 7.62)] [poː (6.96, 7.02)]
[INTERPELATIONER (8.34, 9.16)] [møːtæɳa (7.08, 7.48)]
[OCH (9.66, 9.72)] [s (7.58, 7.62)]
[VI (9.76, 9.8)] [ɪntepelaɧuːnər (8.34, 9.18)]
[STARTAR (9.88, 10.2)] [<pa> (9.48, 9.5)]
[MED (10.24, 10.3)] [oː (9.68, 9.7)]
[ARBETSMARKNADSOCH (10.42, 11.74)] [viː (9.76, 9.8)]
[JÄMSTÄLLDHETSMINISTER (12.0, 13.2)

In [123]:
# Number of SimpleMerge / ComplexMerge entries collected by the alignment loop.
len(merged)

1

In [62]:
class myfloat(float):
    """Float whose equality is tolerant: ``a == b`` when ``approx_match(a, b)``.

    Intended for feeding time stamps to code that compares elements with
    ``==`` and looks them up by hash (e.g. ``difflib.SequenceMatcher``).

    Caveat: tolerant equality is not transitive (a ~ b and b ~ c does not
    imply a ~ c), so grouping in dicts/sets depends on insertion order.
    """

    def __eq__(self, other):
        """Tolerant equality; defers to the other operand for non-numeric values."""
        try:
            other_value = other.real
        except AttributeError:
            # Not a number-like object: let Python try the reflected
            # comparison / fall back to "not equal" instead of crashing.
            return NotImplemented
        return approx_match(self.real, other_value)

    def __ne__(self, other):
        """Inverse of ``__eq__``.

        Needed explicitly: ``float`` supplies its own exact ``__ne__``, which
        would otherwise be inherited and disagree with the tolerant ``__eq__``.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Objects that compare equal must hash equal. Hashing the exact value
        # breaks that for nearly-equal floats, so dict-based lookups would
        # never find an approximate match. No value-dependent hash can honour
        # a tolerance-based equality, hence a constant: lookups fall through
        # to __eq__, at the cost of linear-time dict/set operations.
        return 0

# Start times of every text chunk and every phone chunk, wrapped in myfloat
# so that comparisons between them use myfloat's equality.
starts_a = [myfloat(chunk.start) for chunk in text_chunks]
starts_b = [myfloat(chunk.start) for chunk in phone_chunks]

In [63]:
from difflib import SequenceMatcher

In [64]:
# Align the two sequences of start times. SequenceMatcher indexes the
# elements of `b` in a dict, so the result depends on both __hash__ and
# __eq__ of the myfloat elements.
sm = SequenceMatcher(a=starts_a, b=starts_b)

In [None]:
# Print each edit operation as: tag, then the index range in `a`, then the
# index range in `b`, separated by spaces.
for opcode in sm.get_opcodes():
    print(*opcode)