In [3]:
import json
from pathlib import Path

In [33]:
# Directory that is scanned for the input "*.json" files.
JSONDIR = "/tmp/reassemble/"
# Directory that receives the generated "<name>.tsv" files.
TSVDIR = "/tmp/word-tsv"

In [34]:
# Wrap both directory strings as Path objects so later cells can glob the
# input directory and build output file names with the "/" operator.
JSONPATH, TSVPATH = (Path(directory) for directory in (JSONDIR, TSVDIR))

In [35]:
# Make sure the output directory exists, creating missing parents as well.
# exist_ok=True makes this cell safe to re-run and avoids the check-then-create
# race; a non-directory already sitting at this path still raises FileExistsError.
TSVPATH.mkdir(parents=True, exist_ok=True)

In [36]:
def read_json(filename):
    """Load the timed segments from one JSON file.

    Parameters
    ----------
    filename : str or path-like
        JSON file whose top-level object has a "segments" list. Every
        segment must provide "start", "end" and "text".

    Returns
    -------
    list of dict
        One dict per segment, in file order, with the keys "start", "end"
        and "word" (the segment's "text", copied unmodified). Any other
        segment fields are dropped.

    Raises
    ------
    KeyError
        If "segments" or one of the required segment fields is missing.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the locale's default
    # encoding, which is not UTF-8 on every platform.
    with open(filename, encoding="utf-8") as inf:
        data = json.load(inf)
    return [
        {"start": seg["start"], "end": seg["end"], "word": seg["text"]}
        for seg in data["segments"]
    ]

In [46]:
# Collect the words of every chunk file, grouped by the recording they belong to.
#
# Only files named "hsi_<...>.json" whose stem has exactly eight "_"-separated
# fields are used. The first six fields form the group key; fields seven and
# eight are integers, and field seven is the chunk's start offset in
# milliseconds. Each group maps to a list of (start_s, end_s, text) tuples
# whose times have been shifted by that offset.
filepieces = {}
# Iterate in sorted order: glob() yields files in arbitrary order, and the
# insertion order decides how entries with equal start times end up ordered
# after the stable sort in the writing step.
for file in sorted(JSONPATH.glob("*.json")):
    filestem = file.stem
    if not filestem.startswith("hsi_"):
        continue
    pieces = filestem.split("_")
    if len(pieces) != 8:
        continue
    base = "_".join(pieces[0:6])
    # Register the group even if every word below is filtered out.
    group = filepieces.setdefault(base, [])

    startint = int(pieces[6])
    # The end offset is not used, but parsing it keeps the check that a
    # malformed file name fails loudly with ValueError.
    endint = int(pieces[7])
    offset = float(startint) / 1000.0  # chunk start, milliseconds -> seconds

    for word in read_json(file):
        w = word["word"].strip()
        # Skip bracketed entries such as "[...]" (text that opens or closes
        # with a square bracket).
        if w.startswith("[") or w.endswith("]"):
            continue
        group.append((offset + word["start"], offset + word["end"], w))

In [50]:
def _format_seconds(seconds):
    """Render a non-negative time in seconds as "<whole>.<mmm>".

    The value is rounded to the nearest millisecond. Plain int() truncation
    would turn a float product such as 1000.9999999999999 (from 1.001 s)
    into 1000 and write "1.000" instead of "1.001".
    """
    total_ms = round(seconds * 1000)
    return f"{total_ms // 1000}.{total_ms % 1000:03d}"


# Write one tab-separated file per group: start, end, text on each line,
# ordered by start time.
for piece, pieces in filepieces.items():
    # Stable sort on the start time only, so equal starts keep insertion order.
    pieces_sorted = sorted(pieces, key=lambda x: x[0])
    outfile = TSVPATH / f"{piece}.tsv"
    # Explicit UTF-8 so non-ASCII text is written the same way on every platform.
    with open(outfile, "w", encoding="utf-8") as of:
        for start, end, text in pieces_sorted:
            of.write(f"{_format_seconds(start)}\t{_format_seconds(end)}\t{text}\n")