# Vosk CLI stderr output to CTM

> "Because it was quicker than looking at the API examples"

- branch: master
- comments: false
- categories: [vosk, ctm, kludge]

In [23]:
# Directory that is globbed for "*.vosk" log files further down.
# NOTE(review): both paths are machine-specific absolute paths; adjust them
# before running this notebook anywhere else.
VOSKDIR = "/Users/joregan/hsi-vosk/"
# Directory in which the matching "<stem>.json" Google result is looked up.
GOOGDIR = "/Users/joregan/Playing/hsi_google/"

In [24]:
from pathlib import Path

# Path objects for the two input directories, so later cells can use
# glob() and the "/" join operator on them.
vosk_path = Path(VOSKDIR)
goog_path = Path(GOOGDIR)

In [5]:
def get_recognition(filename):
    """Collect the recognition results logged in a Vosk CLI output file.

    Only lines that start with ``INFO:root:{'result':`` are considered; the
    ``INFO:root:`` logging prefix is dropped and the remainder (the ``repr``
    of a Python dict) is parsed.

    Args:
        filename: path of the log file to read.

    Returns:
        A list with one parsed dict per matching line, in file order.

    Raises:
        ValueError / SyntaxError: if a matching line is not a plain Python
            literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    prefix = "INFO:root:"
    marker = prefix + "{'result':"
    segments = []
    with open(filename) as inf:
        # Iterate the file lazily instead of loading every line up front.
        for line in inf:
            if not line.startswith(marker):
                continue
            # literal_eval only accepts literals (dicts, lists, strings,
            # numbers, ...), so a crafted log line cannot execute code the
            # way a bare eval() would.
            segments.append(ast.literal_eval(line.strip()[len(prefix):]))
    return segments

In [1]:
def clean_filename(filename):
    """Return the stem (final path component minus its last suffix).

    Only the last suffix is removed, so ``"x.wav.vosk"`` becomes ``"x.wav"``.

    Args:
        filename: a ``str`` or ``pathlib.Path``. Any other value is returned
            unchanged.

    Returns:
        The stem as a ``str`` for string/Path input, otherwise ``filename``.
    """
    # isinstance() is required here: Path(...) actually instantiates
    # PosixPath/WindowsPath, so an exact ``type(x) == Path`` test never
    # matches and Path inputs would be returned untouched.
    if isinstance(filename, (str, Path)):
        return Path(filename).stem
    return filename

In [112]:
# One group per line: words separated by spaces on the same line are
# treated as interchangeable when the lookup table is built from this
# string in the next cell. Blank lines are skipped there.
_HOMOPHONES = """
there their they're
ah uh
"""

In [128]:
# Lookup table: word -> list of the other words that share its line in
# _HOMOPHONES.
homophones = {}
for line in _HOMOPHONES.splitlines():
    # split() without an argument collapses runs of spaces/tabs, so stray
    # double spaces cannot introduce "" as a bogus homophone.
    words = line.split()
    if len(words) < 2:
        # Blank line, or a lone word with nothing to pair it with.
        continue
    for i, word in enumerate(words):
        # Every word on the line except the one at this position.
        homophones.setdefault(word, []).extend(words[:i] + words[i + 1:])

def are_homophones(a, b):
    """Return True when ``b`` is listed in the homophone table under ``a``."""
    partners = homophones.get(a, ())
    return b in partners

In [123]:
from dataclasses import dataclass, field
# Spelled-out form of the single digits "1".."9", used by norm_str_eq to
# compare a digit token with its word form.
# NOTE(review): "0" and multi-digit numbers are not covered.
DIGITS = {
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine"
}

def times_ap_eq(a, b, fudge=0.8):
    """Approximate equality: True when ``a`` and ``b`` differ by less than ``fudge``."""
    return -fudge < a - b < fudge

def norm_str_eq(a, b, punct="!.,;:?"):
    """Compare two word strings after normalisation.

    Both strings have leading/trailing ``punct`` characters stripped and are
    lower-cased. Single digits found in ``DIGITS`` are spelled out, and words
    registered as homophones of each other count as equal.

    Args:
        a: first word.
        b: second word.
        punct: characters stripped from both ends of each word.

    Returns:
        True if the normalised words are identical or homophones.
    """
    norm_a = a.strip(punct).lower()
    norm_b = b.strip(punct).lower()
    # Spell out digits on BOTH sides. Mapping only ``a`` made the comparison
    # order-dependent ("one" vs "1" was False) and even made "1" vs "1"
    # unequal, because "one" was compared against the raw "1".
    norm_a = DIGITS.get(norm_a, norm_a)
    norm_b = DIGITS.get(norm_b, norm_b)
    if norm_a == norm_b:
        return True
    return are_homophones(norm_a, norm_b)


@dataclass
class TimedWord:
    """A recognised word with its start and end time.

    Equality is deliberately fuzzy: two words are equal when their texts
    match under ``norm_str_eq`` and both their start and end times agree
    under ``times_ap_eq``. Because ``__eq__`` and ``__hash__`` are written
    out below, they take the place of the dataclass-generated versions and
    the ``compare=`` flags on the fields have no effect on comparisons.
    """

    text: str = field(compare=True)
    start: float = field(compare=True)
    end: float = field(compare=True)
    # Position marker. Defaults to "M"; get_arrayrefs_from_google overwrites
    # it with "S" / "E" / "I" (presumably start / end / isolated word of a
    # result - confirm).
    pos: str = field(compare=False, default="M")

    def __lt__(self, o):
        """Order by start time."""
        return self.start < o.start

    def __gt__(self, o):
        """Order by end time (note: not the mirror image of ``__lt__``)."""
        return self.end > o.end

    def __eq__(self, o):
        """Fuzzy equality on normalised text plus approximate times."""
        # Let Python fall back to its default handling for objects that do
        # not look like a timed word, instead of raising AttributeError.
        if not all(hasattr(o, attr) for attr in ("text", "start", "end")):
            return NotImplemented
        t = norm_str_eq(self.text, o.text)
        s = times_ap_eq(self.start, o.start)
        e = times_ap_eq(self.end, o.end)
        return t and s and e

    def __hash__(self) -> int:
        """Hash on a canonical form of the text only.

        Objects that compare equal must share a hash, otherwise dict lookups
        (and difflib.SequenceMatcher, which indexes elements in a dict) never
        get as far as calling ``__eq__``. The text is therefore reduced the
        same way ``norm_str_eq`` treats it: punctuation stripped, lower-cased,
        digits spelled out, and homophones collapsed onto one representative.
        Times are left out because their comparison is approximate.
        """
        key = self.text.strip("!.,;:?").lower()
        key = DIGITS.get(key, key)
        group = homophones.get(key)
        if group:
            # Smallest member of the group is the representative; this is
            # stable as long as each word appears in only one homophone group.
            key = min(key, *group)
        return hash(key)

In [87]:
def get_arrayrefs_from_vosk(rec):
    """Map every word of a Vosk recognition to the index of its segment.

    Args:
        rec: list of segment dicts, each with a ``"result"`` list whose
            entries carry ``"word"``, ``"start"`` and ``"end"``.

    Returns:
        Dict from ``TimedWord`` to the position of its segment in ``rec``.
    """
    return {
        TimedWord(entry["word"], entry["start"], entry["end"]): seg_idx
        for seg_idx, segment in enumerate(rec)
        for entry in segment["result"]
    }

In [102]:
def _google_seconds(value):
    """Convert a duration such as ``"4.200s"`` (or a plain number) to float seconds."""
    if isinstance(value, str) and value.endswith("s"):
        value = value[:-1]
    return float(value)


def get_arrayrefs_from_google(data):
    """Map every word of a Google recognition result to its result index.

    A result is used only when it has exactly one alternative and that
    alternative carries a ``"words"`` list. Each word becomes a ``TimedWord``
    whose ``pos`` records where it sits in its result: ``"I"`` for the only
    word, ``"S"`` for the first and ``"E"`` for the last word of a longer
    result; all other words keep the default.

    Args:
        data: decoded JSON response; results are read from ``data["results"]``.
            A response without that key is treated as having no results.

    Returns:
        Dict from ``TimedWord`` to the running index of the result it came
        from (only results that were used are counted).
    """
    pointers = {}
    c = 0
    for result in data.get("results", []):
        if "alternatives" not in result:
            continue
        alternatives = result["alternatives"]
        if len(alternatives) != 1:
            continue
        if "words" not in alternatives[0]:
            continue

        words = []
        for w in alternatives[0]["words"]:
            tw = TimedWord(
                w["word"],
                _google_seconds(w["startTime"]),
                _google_seconds(w["endTime"]),
            )
            pointers[tw] = c
            words.append(tw)
        c += 1

        # Tag the result boundaries. An empty "words" list has no boundary
        # words, so it must be skipped rather than indexed.
        if len(words) == 1:
            words[0].pos = "I"
        elif words:
            words[0].pos = "S"
            words[-1].pos = "E"

    return pointers

In [25]:
import json

# Pair every Vosk log with the Google result for the same recording and
# parse both. Each iteration overwrites goog_refs / vosk_refs, so after the
# loop they hold the last pair that was found.
for vosk_file in vosk_path.glob("*.vosk"):
    # Pass a str so the stem is computed regardless of the concrete Path type.
    stem = clean_filename(str(vosk_file))
    # The Vosk logs are named "<recording>.wav.vosk" while the Google results
    # are "<recording>.json", so also try the stem without the ".wav" part.
    candidates = [stem]
    if stem.endswith(".wav"):
        candidates.append(stem[: -len(".wav")])
    goog_file = next(
        (path for path in (goog_path / f"{name}.json" for name in candidates)
         if path.exists()),
        None,
    )
    if goog_file is None:
        continue
    with open(str(goog_file)) as googf:
        goog_data = json.load(googf)
    # Parsing does not need the file handle, so it happens after it is closed.
    goog_refs = get_arrayrefs_from_google(goog_data)
    vosk_data = get_recognition(vosk_file)
    vosk_refs = get_arrayrefs_from_vosk(vosk_data)

In [124]:
# Load one known Vosk/Google pair to experiment with in the cells below.
goog_sample = "/Users/joregan/Playing/hsi_google/hsi_5_0718_210_002_main.json"
vosk_sample = "/Users/joregan/hsi-vosk/hsi_5_0718_210_002_main.wav.vosk"

# Vosk side: parse the log, then index its words by segment.
vosk_data = get_recognition(vosk_sample)
vosk_refs = get_arrayrefs_from_vosk(vosk_data)

# Google side: decode the JSON, then index its words by result.
with open(goog_sample) as handle:
    goog_data = json.loads(handle.read())
goog_refs = get_arrayrefs_from_google(goog_data)


In [47]:
# Two hand-made words that differ only in case, punctuation and a few
# hundredths of a second - handy for poking at the fuzzy comparison.
a = TimedWord("strange", 4.17, 4.89)
b = TimedWord("Strange.", 4.2, 4.8)

In [126]:
def proc_equal(lista, listb):
    """Copy boundary times from ``listb`` onto the paired words of ``lista``.

    Words are paired positionally. A word in ``lista`` takes the partner's
    ``start`` when its ``pos`` is "S" or "I", and the partner's ``end`` when
    its ``pos`` is "E" or "I". Words with any other ``pos`` are left alone.
    ``lista`` is modified in place; nothing is returned.
    """
    for target, source in zip(lista, listb):
        marker = target.pos
        if marker == "S" or marker == "I":
            target.start = source.start
        if marker == "E" or marker == "I":
            target.end = source.end

In [130]:
def simple_replace(lista, listb):
    """Handle a one-word-for-one-word replacement whose texts still match.

    Acts only when both lists hold exactly one word, the word in ``lista``
    has ``pos == "M"`` and the two texts are equal under ``norm_str_eq``;
    in that case the pair is handed to ``proc_equal``.

    NOTE(review): proc_equal only touches words whose pos is "S", "E" or
    "I", so with the ``pos == "M"`` requirement this currently changes
    nothing - confirm which condition was intended.
    """
    if (len(lista), len(listb)) != (1, 1):
        return
    head = lista[0]
    if head.pos == "M" and norm_str_eq(head.text, listb[0].text):
        proc_equal(lista, listb)

In [None]:
from difflib import SequenceMatcher

def proc_seqmatch(goog_refs, vosk_refs):
    """Align the Google words with the Vosk words and process the matches.

    The keys of both mappings are compared with ``difflib.SequenceMatcher``;
    every run reported as "equal" is passed to ``proc_equal`` as a pair of
    slices (Google words first). Other opcodes are ignored. Returns nothing.
    """
    goog_words = list(goog_refs.keys())
    vosk_words = list(vosk_refs.keys())
    matcher = SequenceMatcher(None, goog_words, vosk_words)

    equal_spans = (op[1:] for op in matcher.get_opcodes() if op[0] == "equal")
    for g_lo, g_hi, v_lo, v_hi in equal_spans:
        proc_equal(goog_words[g_lo:g_hi], vosk_words[v_lo:v_hi])
            


In [127]:
# Build the matcher in this cell so it runs on a fresh kernel: nothing else
# defines `s` at module level, and `a` / `b` must be the two word lists
# (Google first, Vosk second), not the single sample words assigned earlier.
a = list(goog_refs.keys())
b = list(vosk_refs.keys())
s = SequenceMatcher(None, a, b)

# Dump every opcode together with the two slices it refers to.
for tag, i1, i2, j1, j2 in s.get_opcodes():
    print('{:7}   a[{}:{}] --> b[{}:{}] {!r:>8} --> {!r}\n'.format(tag, i1, i2, j1, j2, a[i1:i2], b[j1:j2]))

insert    a[0:0] --> b[0:4]       [] --> [TimedWord(text='you', start=0.0, end=0.18, pos='M'), TimedWord(text='can', start=0.18, end=0.27, pos='M'), TimedWord(text='take', start=0.272088, end=0.451059, pos='M'), TimedWord(text='it', start=0.451059, end=0.51, pos='M')]

equal     a[0:4] --> b[4:8] [TimedWord(text='Strange.', start=4.2, end=4.8, pos='I'), TimedWord(text='Oh', start=17.5, end=18.7, pos='S'), TimedWord(text='sorry.', start=18.7, end=19.2, pos='E'), TimedWord(text='Yeah.', start=22.9, end=23.2, pos='S')] --> [TimedWord(text='strange', start=4.17, end=4.89, pos='M'), TimedWord(text='oh', start=18.03, end=18.244142, pos='M'), TimedWord(text='sorry', start=18.36, end=19.14, pos='M'), TimedWord(text='yeah', start=23.01, end=23.28, pos='M')]

replace   a[4:6] --> b[8:9] [TimedWord(text='Yeah.', start=23.8, end=23.9, pos='E'), TimedWord(text='Yeah.', start=25.6, end=27.2, pos='S')] --> [TimedWord(text='maybe', start=26.28, end=26.49, pos='M')]

equal     a[6:7] --> b[9:10] [Timed

In [None]:
class MatchWord:
    """A timed word that also records an alignment operation and sub-tokens.

    Attributes:
        text: the word text.
        start: start time.
        end: end time.
        pos: position marker, "M" by default.
        op: operation label, "equal" by default.
        tokens: list of attached token objects (each expected to have a
            ``text`` attribute); empty until ``add_tokens`` is called.
    """

    def __init__(self, text, start, end, pos="M", op="equal") -> None:
        self.text = text
        self.start = start
        self.end = end
        self.pos = pos
        self.op = op
        self.tokens = []

    def _match_any_text(self, o):
        """Return True if ``o.text`` matches any attached token.

        When no tokens are attached, the word's own text is compared instead.
        """
        if self.tokens:
            return any(norm_str_eq(tok.text, o.text) for tok in self.tokens)
        return norm_str_eq(self.text, o.text)

    def add_tokens(self, tokens):
        """Attach one token, or every token of a list/tuple."""
        # isinstance() also covers tuples and list subclasses, which an exact
        # ``type(tokens) == list`` test would append as one nested item.
        if isinstance(tokens, (list, tuple)):
            self.tokens.extend(tokens)
        else:
            self.tokens.append(tokens)

    def __lt__(self, o):
        """Order by start time."""
        return self.start < o.start

    def __gt__(self, o):
        """Order by end time (note: not the mirror image of ``__lt__``)."""
        return self.end > o.end

    def __eq__(self, o):
        """Fuzzy equality on normalised text plus approximate times."""
        # Let Python fall back to its default handling for objects that do
        # not look like a timed word, instead of raising AttributeError.
        if not all(hasattr(o, attr) for attr in ("text", "start", "end")):
            return NotImplemented
        t = norm_str_eq(self.text, o.text)
        s = times_ap_eq(self.start, o.start)
        e = times_ap_eq(self.end, o.end)
        return t and s and e

    def __hash__(self) -> int:
        """Hash on a canonical form of the text only.

        Objects that compare equal must share a hash, so the text is reduced
        the same way ``norm_str_eq`` treats it: punctuation stripped,
        lower-cased, digits spelled out and homophones collapsed onto one
        representative. Times are left out because they compare approximately.
        """
        key = self.text.strip("!.,;:?").lower()
        key = DIGITS.get(key, key)
        group = homophones.get(key)
        if group:
            # Smallest member of the group is the representative; this is
            # stable as long as each word appears in only one homophone group.
            key = min(key, *group)
        return hash(key)