In [26]:
def approx_match(time_a, time_b, slippage=0.040000000000000036):
    return abs(time_a - time_b) <= slippage

In [28]:
SAMPLE = "2442204240010034621"

In [34]:
from pathlib import Path
TEXT = Path("/Users/joregan/Playing/kbw2v")
PHONES = Path("/Users/joregan/Playing/rd_phonetic")

In [35]:
text = open(TEXT / f"{SAMPLE}_480p.json")
phones = open(PHONES / f"{SAMPLE}_480p.json")

In [36]:
import json
text_json = json.load(text)
phone_json = json.load(phones)

In [50]:
class Chunk:
    def __init__(self, chunk):
        self.text = chunk['text']
        self.start = chunk['timestamp'][0]
        self.end = chunk['timestamp'][1]
    
    def __repr__(self) -> str:
        return f"[{self.text}: {self.start}, {self.end}]"

class SimpleMerge(Chunk):
    def __init__(self, left: Chunk, right: Chunk):
        self.text = left.text
        self.phone = right.text
        self.start = left.start
        self.end = left.end
        self.diff_start = left.start - right.start
        self.diff_end = left.end - right.end

    def exact_length(self):
        return self.diff_start == 0 and self.diff_end == 0
        

In [38]:
text_chunks = []
phone_chunks = []
for text_chunk in text_json['chunks']:
    text_chunks.append(Chunk(text_chunk))

for phone_chunk in phone_json['chunks']:
    phone_chunks.append(Chunk(phone_chunk))

In [51]:
def approx_match_chunks(left, right):
    return approx_match(left.start, right.start) and approx_match(left.end, right.end)

In [55]:
def print_pair(left, right):
    print(f"{left.text} :: {right.text} ({left.start}, {left.end}; {right.start}, {right.end})")

In [None]:
tci = 0
pci = 0

merged = []

while tci < len(text_chunks) and pci < len(phone_chunks):
    if approx_match_chunks(text_chunks[tci], phone_chunks[pci]):
        merged.append(SimpleMerge(text_chunks[tci], phone_chunks[pci]))
    elif approx_match(text_chunks[tci].start, phone_chunks[pci].start):
        save_pci = pci
        while not approx_match(text_chunks[tci].end, phone_chunks[pci].end):
            pci += 1
    else:
        print_pair(text_chunks[tci], phone_chunks[pci])
    tci += 1
    pci += 1

In [62]:
class myfloat(float):
    def __eq__(self, other):
        return approx_match(self.real, other.real)
    def __hash__(self):
        return hash(self.real)

starts_a = [myfloat(x.start) for x in text_chunks]
starts_b = [myfloat(x.start) for x in phone_chunks]

In [63]:
from difflib import SequenceMatcher

In [64]:
sm = SequenceMatcher(a=starts_a, b=starts_b)

In [65]:
for op, a_start, a_end, b_start, b_end in sm.get_opcodes():
    print(op, a_start, a_end, b_start, b_end)

equal 0 1 0 1
insert 1 1 1 2
equal 1 10 2 11
replace 10 11 11 12
equal 11 17 12 18
insert 17 17 18 19
equal 17 18 19 20
insert 18 18 20 21
equal 18 23 21 26
replace 23 24 26 29
equal 24 27 29 32
insert 27 27 32 34
equal 27 31 34 38
insert 31 31 38 41
equal 31 70 41 80
replace 70 71 80 81
equal 71 89 81 99
insert 89 89 99 100
equal 89 93 100 104
insert 93 93 104 105
equal 93 94 105 106
insert 94 94 106 107
equal 94 103 107 116
insert 103 103 116 117
equal 103 108 117 122
insert 108 108 122 123
equal 108 114 123 129
insert 114 114 129 130
equal 114 123 130 139
replace 123 124 139 140
equal 124 127 140 143
insert 127 127 143 144
equal 127 134 144 151
insert 134 134 151 152
equal 134 145 152 163
replace 145 146 163 165
equal 146 151 165 170
delete 151 152 170 170
equal 152 154 170 172
replace 154 155 172 173
equal 155 164 173 182
replace 164 165 182 183
equal 165 169 183 187
replace 169 172 187 193
equal 172 175 193 196
insert 175 175 196 197
equal 175 176 197 198
insert 176 176 198 199
eq