# Vosk CLI stderr output to CTM

> "Because it was quicker than looking at the API examples"

- branch: master
- comments: false
- categories: [vosk, ctm, kludge]

In [23]:
import os

# Input directories. The defaults are the original author's local paths; set
# the HSI_VOSK_DIR / HSI_GOOGLE_DIR environment variables to point the
# notebook at another machine's data without editing this cell.
VOSKDIR = os.environ.get("HSI_VOSK_DIR", "/Users/joregan/hsi-vosk/")
GOOGDIR = os.environ.get("HSI_GOOGLE_DIR", "/Users/joregan/Playing/hsi_google/")

In [24]:
from pathlib import Path

# Wrap both configured directories as Path objects so they can be globbed
# and joined with file names later on.
vosk_path, goog_path = (Path(directory) for directory in (VOSKDIR, GOOGDIR))

In [5]:
def get_recognition(filename):
    """Collect the recognition results logged in a Vosk CLI stderr capture.

    Only lines that start with ``INFO:root:{'result':`` are considered; the
    text after the ``INFO:root:`` prefix is parsed as a Python literal.

    Args:
        filename: path (str or path-like) of the captured log file.

    Returns:
        A list with one parsed object per matching line, in file order.

    Raises:
        ValueError, SyntaxError: if a matching line is not a plain Python
            literal. The payload is parsed with ``ast.literal_eval`` rather
            than ``eval`` so that text in the log can never execute code.
    """
    import ast  # local import keeps this cell self-contained

    prefix = "INFO:root:"
    marker = prefix + "{'result':"
    segments = []
    with open(filename) as inf:
        # Iterate the handle directly instead of loading every line up front.
        for line in inf:
            if line.startswith(marker):
                payload = line.strip()[len(prefix):]
                segments.append(ast.literal_eval(payload))
    return segments

In [1]:
def clean_filename(filename):
    """Return the stem (final path component minus its last suffix).

    Args:
        filename: a ``str`` or ``pathlib.Path``. Any other value is returned
            unchanged.

    Returns:
        The stem as a ``str`` for str/Path input, e.g. ``"a.wav"`` for
        ``"/x/a.wav.vosk"`` (only the last suffix is removed).
    """
    # isinstance, not ``type(...) == Path``: Path() instantiates a concrete
    # subclass (PosixPath/WindowsPath), so an exact type comparison never
    # matched and Path arguments were handed back untouched.
    if isinstance(filename, (str, Path)):
        return Path(filename).stem
    return filename

In [112]:
# Homophone groups, one group per line with words separated by single spaces.
# Words sharing a line are treated as interchangeable when recognised text is
# compared.
_HOMOPHONES = """
there their they're
ah uh
"""

In [128]:
# Expand the homophone groups into a lookup table: every word maps to the
# list of the *other* words that share its line. Blank lines are ignored.
homophones = {}
for group in filter(None, (raw.strip() for raw in _HOMOPHONES.split("\n"))):
    members = group.split(" ")
    for pos, word in enumerate(members):
        for other_pos, other in enumerate(members):
            if other_pos != pos:
                homophones.setdefault(word, []).append(other)

def are_homophones(a, b):
    """Return True when `b` is listed as a homophone of `a` in `homophones`."""
    # A word missing from the table has no homophones at all.
    return b in homophones.get(a, ())

In [123]:
from dataclasses import dataclass, field
# Spelled-out forms of the single digits 1-9, consulted by norm_str_eq so a
# numeral can match its word form. "0" and multi-digit numbers have no entry.
DIGITS = {
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine"
}

def times_ap_eq(a, b, fudge=0.8):
    """Approximate equality for timestamps.

    Returns True when `a` and `b` differ by strictly less than `fudge`.
    """
    return abs(a - b) < fudge

def norm_str_eq(a, b, punct="!.,;:?"):
    """Compare two words after normalisation.

    Both strings are stripped of leading/trailing `punct` characters and
    lower-cased; single digits listed in ``DIGITS`` are replaced by their
    spelled-out form. The words match when the normalised forms are identical
    or ``are_homophones`` accepts them.

    Args:
        a, b: the words to compare.
        punct: characters stripped from both ends of each word.

    Returns:
        bool: True when the two words are considered the same.
    """
    norm_a = a.strip(punct).lower()
    norm_b = b.strip(punct).lower()
    # Map digits on BOTH sides. Previously only `a` was looked up, which made
    # the comparison asymmetric ("one" vs "1" failed) and even non-reflexive
    # ("1" vs "1" compared "one" against "1" and failed).
    norm_a = DIGITS.get(norm_a, norm_a)
    norm_b = DIGITS.get(norm_b, norm_b)
    return norm_a == norm_b or are_homophones(norm_a, norm_b)


@dataclass
class TimedWord:
    """A recognised word with its start/end time and position-in-segment tag.

    Equality is deliberately fuzzy: two words are equal when their texts match
    under ``norm_str_eq`` and both timestamps agree under ``times_ap_eq``.
    Ordering is not total: ``<`` compares start times, ``>`` compares end
    times.
    """

    text: str = field(compare=True)
    start: float = field(compare=True)
    end: float = field(compare=True)
    # Position tag; code elsewhere in this notebook sets "S", "E" or "I".
    pos: str = field(compare=False, default="M")

    def __lt__(self, o):
        return self.start < o.start

    def __gt__(self, o):
        return self.end > o.end

    def __eq__(self, o):
        # Duck-typed on purpose (other word-like objects carry the same
        # attributes); anything else is simply "not equal" instead of an
        # AttributeError.
        if not all(hasattr(o, name) for name in ("text", "start", "end")):
            return NotImplemented
        t = norm_str_eq(self.text, o.text)
        s = times_ap_eq(self.start, o.start)
        e = times_ap_eq(self.end, o.end)
        return t and s and e

    def _hash_key(self):
        """Canonical text used for hashing: one key per digit/homophone group."""
        key = self.text.strip("!.,;:?").lower()
        key = DIGITS.get(key, key)
        if key in homophones:
            # Every member of a homophone group picks the same representative.
            key = min([key, *homophones[key]])
        return key

    def __hash__(self) -> int:
        # Objects that compare equal must hash equal. Hashing the raw text
        # gave "1"/"one" and homophone pairs different hashes, so dict and
        # SequenceMatcher lookups could never find those matches.
        return hash(self._hash_key())

In [87]:
def get_arrayrefs_from_vosk(rec):
    """Map every word of a Vosk recognition to the index of its segment.

    Args:
        rec: sequence of segment dicts; each has a "result" list whose items
            carry "word", "start" and "end".

    Returns:
        dict mapping a TimedWord to the position in `rec` of the segment it
        came from. A later word that compares equal to an earlier key
        overwrites that key's index.
    """
    return {
        TimedWord(entry["word"], entry["start"], entry["end"]): seg_idx
        for seg_idx, segment in enumerate(rec)
        for entry in segment["result"]
    }

In [102]:
def get_arrayrefs_from_google(data):
    """Map every word of a Google transcript to the index of its result.

    Only results with exactly one alternative that carries a non-empty
    "words" list are used; the index counts those usable results only.
    Within each result the words are tagged by position: "I" for a lone word,
    "S" for the first and "E" for the last word of a longer result (other
    words keep TimedWord's default tag).

    Args:
        data: parsed JSON response; "results" is optional, and each word
            needs "word", "startTime" and "endTime" (times are strings,
            optionally ending in "s").

    Returns:
        dict mapping a TimedWord to the index of the usable result it came
        from.
    """
    def _seconds(stamp):
        # Times arrive as strings such as "1.500s"; drop the unit suffix.
        if stamp.endswith("s"):
            stamp = stamp[:-1]
        return float(stamp)

    pointers = {}
    c = 0
    # A response without any transcription may lack "results" entirely.
    for result in data.get("results", []):
        if "alternatives" not in result:
            continue
        if len(result["alternatives"]) != 1:
            continue
        if "words" not in result["alternatives"][0]:
            continue
        words = [
            TimedWord(w["word"], _seconds(w["startTime"]), _seconds(w["endTime"]))
            for w in result["alternatives"][0]["words"]
        ]
        if not words:
            # An empty word list used to crash the position tagging below
            # with an IndexError; treat it like any other unusable result.
            continue
        if len(words) == 1:
            words[0].pos = "I"
        else:
            words[0].pos = "S"
            words[-1].pos = "E"
        for tw in words:
            pointers[tw] = c
        c += 1
    return pointers

In [25]:
import json

# Walk every Vosk log and, when a Google transcript named after the same stem
# exists, build the word lookup tables for both recognisers. Logs without a
# matching transcript are skipped; each iteration overwrites the previous
# results.
for vosk_file in vosk_path.glob("*.vosk"):
    stem = clean_filename(vosk_file)
    goog_file = goog_path / f"{stem}.json"
    if goog_file.exists():
        with open(str(goog_file)) as googf:
            goog_data = json.load(googf)
        goog_refs = get_arrayrefs_from_google(goog_data)
        vosk_data = get_recognition(vosk_file)
        vosk_refs = get_arrayrefs_from_vosk(vosk_data)

In [124]:
# One recording, loaded from both recognisers, for the alignment cells below.
vosk_sample = "/Users/joregan/hsi-vosk/hsi_5_0718_210_002_main.wav.vosk"
goog_sample = "/Users/joregan/Playing/hsi_google/hsi_5_0718_210_002_main.json"

# Vosk side: parse the stderr log, then index its words.
vosk_data = get_recognition(vosk_sample)
vosk_refs = get_arrayrefs_from_vosk(vosk_data)

# Google side: load the JSON transcript, then index its words.
with open(goog_sample) as googf:
    goog_data = json.load(googf)
goog_refs = get_arrayrefs_from_google(goog_data)


In [137]:
class MatchWord:
    """One entry of an alignment between two word sequences.

    Carries the word's text and timing, its position tag (`pos`), the
    alignment operation that produced it (`op`, e.g. "equal", "insert",
    "delete", "replace") and, optionally, the source tokens involved.
    Equality is fuzzy: texts are compared with ``norm_str_eq`` and both
    timestamps with ``times_ap_eq``.
    """

    def __init__(self, text, start, end, pos="M", op="equal") -> None:
        self.text = text
        self.start = start
        self.end = end
        self.pos = pos
        self.op = op
        self.tokens = []

    def _match_any_text(self, o):
        """True if `o.text` matches any attached token, or own text if none."""
        if self.tokens:
            return any(norm_str_eq(tok.text, o.text) for tok in self.tokens)
        return norm_str_eq(self.text, o.text)

    def add_tokens(self, tokens):
        """Attach one token, or every token of a list, to this match."""
        # isinstance so that list subclasses are flattened too, rather than
        # being appended as a single nested element.
        if isinstance(tokens, list):
            self.tokens.extend(tokens)
        else:
            self.tokens.append(tokens)

    def __lt__(self, o):
        return self.start < o.start

    def __gt__(self, o):
        return self.end > o.end

    def __eq__(self, o):
        # Duck-typed on purpose; objects without the needed attributes are
        # "not equal" instead of raising AttributeError.
        if not all(hasattr(o, name) for name in ("text", "start", "end")):
            return NotImplemented
        t = norm_str_eq(self.text, o.text)
        s = times_ap_eq(self.start, o.start)
        e = times_ap_eq(self.end, o.end)
        return t and s and e

    def _hash_key(self):
        """Canonical text used for hashing: one key per digit/homophone group."""
        key = self.text.strip("!.,;:?").lower()
        key = DIGITS.get(key, key)
        if key in homophones:
            # Every member of a homophone group picks the same representative.
            key = min([key, *homophones[key]])
        return key

    def __hash__(self) -> int:
        # Keep the hash consistent with __eq__: words that match through the
        # digit table or the homophone table must land in the same bucket.
        return hash(self._hash_key())

    def __repr__(self) -> str:
        if self.tokens:
            toks = f', ({", ".join([x.text for x in self.tokens])})'
        else:
            toks = ''
        return f'MatchWord({self.text}, {self.start}, {self.end}, {self.pos}, {self.op}{toks})'

In [126]:
def proc_equal(lista, listb):
    """Copy boundary timestamps from `listb` onto the paired `lista` items.

    Items are paired positionally. An item tagged "S" or "I" takes its
    partner's start time; an item tagged "E" or "I" takes its partner's end
    time. `lista` items are modified in place; nothing is returned.
    """
    for word, ref in zip(lista, listb):
        if word.pos == "S" or word.pos == "I":
            word.start = ref.start
        if word.pos == "E" or word.pos == "I":
            word.end = ref.end

In [132]:
def is_simple_replace(lista, listb):
    """Tell whether a replacement is really the same single word.

    True only when both lists hold exactly one item, the `lista` item is
    tagged "M", and the two texts match under ``norm_str_eq``.
    """
    if not (len(lista) == len(listb) == 1):
        return False
    left, right = lista[0], listb[0]
    return bool(left.pos == "M" and norm_str_eq(left.text, right.text))

In [133]:
from difflib import SequenceMatcher

def do_single_replace(r1, r2):
    """Build the MatchWord for one aligned pair of words.

    If the texts match under ``norm_str_eq`` the result is an "equal" entry
    with `r1`'s text and tag but `r2`'s timestamps. Otherwise it is a
    "replace" entry built from `r1` with both words attached as tokens.
    """
    same_text = norm_str_eq(r1.text, r2.text)
    if not same_text:
        replaced = MatchWord(r1.text, r1.start, r1.end, r1.pos, "replace")
        for token in (r1, r2):
            replaced.add_tokens(token)
        return replaced
    return MatchWord(r1.text, r2.start, r2.end, r1.pos, "equal")

def proc_seqmatch(goog_refs, vosk_refs):
    """Align the Google and Vosk word sequences and describe every step.

    The keys of the two mappings (in insertion order) are aligned with
    ``difflib.SequenceMatcher``; each opcode is turned into MatchWord
    entries:

    * "equal"   - Google words, after ``proc_equal`` has copied segment
                  boundary times from the Vosk words (mutates the keys).
    * "delete"  - Google-only words, tagged "delete".
    * "insert"  - Vosk-only words, tagged "insert".
    * "replace" - a single same-text pair is treated as "equal"; otherwise
                  words are paired positionally via ``do_single_replace`` and
                  every unpaired leftover is emitted as "delete" (Google) or
                  "insert" (Vosk).

    Args:
        goog_refs: mapping whose keys are the Google words.
        vosk_refs: mapping whose keys are the Vosk words.

    Returns:
        list of MatchWord in alignment order.
    """
    def _tagged(word, op):
        # A one-sided entry that also records its source word as a token.
        entry = MatchWord(word.text, word.start, word.end, word.pos, op)
        entry.add_tokens(word)
        return entry

    def _as_equal(words):
        return [MatchWord(m.text, m.start, m.end, m.pos, "equal") for m in words]

    a = list(goog_refs.keys())
    b = list(vosk_refs.keys())
    s = SequenceMatcher(None, a, b)

    matches = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        left, right = a[i1:i2], b[j1:j2]
        if tag == "equal":
            proc_equal(left, right)
            matches.extend(_as_equal(left))
        elif tag == "delete":
            matches.extend(_tagged(m, "delete") for m in left)
        elif tag == "insert":
            matches.extend(_tagged(m, "insert") for m in right)
        elif tag == "replace":
            if is_simple_replace(left, right):
                proc_equal(left, right)
                matches.extend(_as_equal(left))
                # Stop here: falling through to the pairing below used to
                # emit the same word a second time.
                continue
            paired = min(len(left), len(right))
            for r1, r2 in zip(left, right):
                matches.append(do_single_replace(r1, r2))
            # Keep EVERY unpaired word of the longer side; previously only
            # the last one survived and the ones in between were dropped.
            # At most one of these two slices is non-empty.
            matches.extend(_tagged(m, "delete") for m in left[paired:])
            matches.extend(_tagged(m, "insert") for m in right[paired:])
    return matches


In [138]:
# Align the sample Google and Vosk word lists; the cell displays the returned list of MatchWord entries.
proc_seqmatch(goog_refs, vosk_refs)

[MatchWord(you, 0.0, 0.18, M, insert, (you)),
 MatchWord(can, 0.18, 0.27, M, insert, (can)),
 MatchWord(take, 0.272088, 0.451059, M, insert, (take)),
 MatchWord(it, 0.451059, 0.51, M, insert, (it)),
 MatchWord(Strange., 4.17, 4.89, I, equal),
 MatchWord(Oh, 18.03, 18.7, S, equal),
 MatchWord(sorry., 18.7, 19.14, E, equal),
 MatchWord(Yeah., 23.01, 23.2, S, equal),
 MatchWord(Yeah., 23.8, 23.9, E, replace, (Yeah., maybe)),
 MatchWord(Yeah., 25.6, 27.2, S, delete, (Yeah.)),
 MatchWord(Yeah., 27.2, 27.39, E, equal),
 MatchWord(And, 49.0, 50.3, S, replace, (And, this)),
 MatchWord(and, 50.13, 50.31, M, insert, (and)),
 MatchWord(you're, 50.3, 50.8, M, equal),
 MatchWord(going, 50.8, 50.8, M, equal),
 MatchWord(to, 50.8, 50.9, M, equal),
 MatchWord(rent, 50.9, 51.2, M, equal),
 MatchWord(it., 51.2, 51.4, M, equal),
 MatchWord(Oh,, 51.4, 52.1, M, equal),
 MatchWord(could, 52.1, 52.5, M, equal),
 MatchWord(I, 52.5, 52.6, M, equal),
 MatchWord(have, 52.6, 52.7, M, equal),
 MatchWord(some, 52.7