In [26]:
def approx_match(time_a, time_b, slippage=0.040000000000000036):
    return abs(time_a - time_b) <= slippage

In [28]:
SAMPLE = "2442204240010034621"

In [34]:
from pathlib import Path
TEXT = Path("/Users/joregan/Playing/kbw2v")
PHONES = Path("/Users/joregan/Playing/rd_phonetic")

In [35]:
text = open(TEXT / f"{SAMPLE}_480p.json")
phones = open(PHONES / f"{SAMPLE}_480p.json")

In [36]:
import json
text_json = json.load(text)
phone_json = json.load(phones)

In [80]:
class Chunk:
    def __init__(self, chunk):
        self.text = chunk['text']
        self.start = chunk['timestamp'][0]
        self.end = chunk['timestamp'][1]
    
    def __repr__(self) -> str:
        return f"[{self.text} ({self.start}, {self.end})]"

class SimpleMerge(Chunk):
    def __init__(self, left: Chunk, right: Chunk):
        self.text = left.text
        self.phone = right.text
        self.start = left.start
        self.end = left.end
        self.diff_start = left.start - right.start
        self.diff_end = left.end - right.end

    def exact_length(self):
        return self.diff_start == 0 and self.diff_end == 0

    def __repr__(self) -> str:
        return f"[{self.text} :: {self.phone} ({self.start}, {self.end})]"

class ComplexMerge(Chunk):
    def __init__(self, left, right):
        if type(left) == list:
            self.left_chunks = left
        elif left == None or left == []:
            self.left_chunks == []
        else:
            self.left_chunks = [left]
        if type(right) == list:
            self.right_chunks = right
        elif right == None or right == []:
            self.right_chunks == []
        else:
            self.right_chunks = [right]

        if self.left_chunks[0].start < self.right_chunks[0].start:
            self.start = self.left_chunks[0].start
        else:
            self.start = self.right_chunks[0].start
        if self.left_chunks[-1].end > self.right_chunks[-1].end:
            self.end = self.left_chunks[-1].end
        else:
            self.end = self.right_chunks[-1].end

        self.text = " ".join([x.text for x in self.left_chunks])
        self.phone = " ".join([x.text for x in self.right_chunks])

    def __repr__(self) -> str:
        return f"[{self.text} :: {self.phone} ({self.start}, {self.end})]"

        

In [82]:
text_chunks = []
phone_chunks = []
for text_chunk in text_json['chunks']:
    text_chunks.append(Chunk(text_chunk))

for phone_chunk in phone_json['chunks']:
    phone_chunks.append(Chunk(phone_chunk))

In [96]:
tci = 0
pci = 0

merged = []

def idx_safe(tci, pci):
    return tci < len(text_chunks) and pci < len(phone_chunks)

while idx_safe(tci, pci):
    am_start = approx_match(text_chunks[tci].start, phone_chunks[pci].start)
    am_end = approx_match(text_chunks[tci].end, phone_chunks[pci].end)
    
    if am_start and am_end:
        merged.append(SimpleMerge(text_chunks[tci], phone_chunks[pci]))
    elif am_start:
        save_pci = pci
        peek = (pci + 1) < len(phone_chunks)
        lt_next = peek and phone_chunks[pci + 1].start < text_chunks[tci].end
        while idx_safe(tci, pci) and not approx_match(text_chunks[tci].end, phone_chunks[pci].end) and lt_next:
            pci += 1
        merged.append(ComplexMerge(text_chunks[tci], phone_chunks[save_pci:pci]))
    elif approx_match(text_chunks[tci].end, phone_chunks[pci].end):
        save_tci = tci
        while idx_safe(tci, pci) and not approx_match(text_chunks[tci].start, phone_chunks[pci].start) and text_chunks[tci].start > phone_chunks[pci].start:
            tci += 1
        merged.append(ComplexMerge(text_chunks[save_tci:tci], phone_chunks[tci]))
    else:
        print_pair(text_chunks[tci], phone_chunks[pci])
    tci += 1
    pci += 1

In [98]:
merged

[[IX :: iː seks debat meː anleːdnɪŋ ɑːv ɪntepelaɧuːnsvɑːr oː kamarən øːvərɡoːr nʉː tɪl at debateːra stasroːdən svɑːr poː møːtæɳa s ɪntepelaɧuːnər <pa> oː viː stɑːʈa meː arbesmarknas ɔk jemstɛlheːts mɪnɪstər eːva nuːɖmark sʊsjɑːldemɔkrɑːtæɳa s <hes> svɑːr poː ɪntepelaɧuːn tvoː hɵndra niːe <sm> ɑːv roːɡər hadad lɪbərɑːləɳa arbeːtsfœ̞meːdlɪɡəns tɪl<v>jeŋlɪɡheːt ɔk viː bœ̞rja meː mɪnɪstən vɑːʂoːuːɡuːd tak frʉː tɑːlman roːɡər hadad hɑːr froːɡat mej ɔm deː fɪns noːra kɔŋkreːta moːl ɵpsata fœ̞ː tɪljeŋlɪheːtən tɪl<v> arbesfœ̞meːdlɪŋən sɔm rɪjrɪŋən fœljər ɵp viːdarə hɑː n froːɡat <pa>vɑː jɑː anseːr æːr en rɪmlɪ nɪvoː fœ̞ːr hʉː snapt maŋ kaŋ kɵna kɔma iː kɔntakt meː en arbesfmeːdlarə <pa> samt foː en oːtekɔplɪŋ <sm><pa> ɧenærelt<v> <hes> set slʉːtlɪən hɑːr an froːɡat vɪlka oːtjæːɖə rɪeːrɪŋən ɑːv seː at viːtɑː fœ̞ːr at fœ̞ betra tɪljeŋlɪheːtən fœ̞ːr ɪnskriːvna hʊs arbesfmeːdlɪŋən ɧenerelt <pa> men ɪntə mɪnst fœ̞ːr pæʂuːnər meːd fɵŋkɧʊsnesɛtnɪŋ rɪjeːrɪŋən ɑː vɑːrɪt tyːdlɪ meː at <pa> arbesmaknaspʊ

In [62]:
class myfloat(float):
    def __eq__(self, other):
        return approx_match(self.real, other.real)
    def __hash__(self):
        return hash(self.real)

starts_a = [myfloat(x.start) for x in text_chunks]
starts_b = [myfloat(x.start) for x in phone_chunks]

In [63]:
from difflib import SequenceMatcher

In [64]:
sm = SequenceMatcher(a=starts_a, b=starts_b)

In [None]:
for op, a_start, a_end, b_start, b_end in sm.get_opcodes():
    print(op, a_start, a_end, b_start, b_end)