# Trying to use pocketsphinx to word align

> "Because timing accuracy in ASR is getting progressively worse, look backwards"

- branch: master
- comments: false
- categories: [pocketsphinx, hsi, alignment]

In [1]:
import os
import subprocess
import tempfile

import pocketsphinx
from pydub import AudioSegment

In [2]:
# Lookup table from IPA symbols (left column) to CMUdict phone labels (right
# column): one pair per line, separated by a single space. Several IPA symbols
# may share one phone (e.g. ə, ɐ and ʌ all become AH), and the flap ɾ is
# written as D.
MAPPING = """
ɑː AA
æ AE
ə AH
ɐ AH
ʌ AH
ɔː AO
aʊ AW
aɪ AY
b B
tʃ CH
d D
ð DH
ɛ EH
ɚ ER
ɜː ER
eɪ EY
f F
ɡ G
h HH
ɪ IH
i IY
iː IY
dʒ JH
k K
l L
m M
n N
ŋ NG
oʊ OW
ɔɪ OY
p P
ɹ R
s S
ʃ SH
t T
θ TH
ʊ UH
uː UW
v V
w W
j Y
z Z
ʒ ZH
ɾ D
"""

In [3]:
# Parse MAPPING into a dict keyed by IPA symbol. Lines that do not consist of
# exactly two space-separated fields are echoed and skipped; when a symbol is
# listed more than once, the first listing is the one kept.
espeak_to_cmudict = {}
for raw_line in MAPPING.split("\n"):
    if raw_line == "":
        continue
    entry = raw_line.strip()
    fields = entry.split(" ")
    if len(fields) != 2:
        print(entry)
        continue
    ipa_symbol, cmu_phone = fields
    espeak_to_cmudict.setdefault(ipa_symbol, cmu_phone)
    

In [4]:
import re

# Longest keys first: regex alternation takes the first branch that matches,
# so a two-character symbol such as "tʃ" must be tried before its prefix "t".
cmudict_keys = espeak_to_cmudict.keys()
cmudict_keys = sorted(cmudict_keys, key=len, reverse=True)
# Escape each key so that a symbol which happens to be a regex metacharacter
# is matched literally instead of changing the meaning of the pattern.
_espeak_alternation = "|".join(re.escape(key) for key in cmudict_keys)
espeak_regex = re.compile(f"({_espeak_alternation})")

def cmudictify(espeak):
    """Convert an IPA string into a space-separated CMUdict phone string.

    The stress marks "ˈ" and "ˌ" are removed first. The remainder is scanned
    with ``espeak_regex`` and each match is looked up in ``espeak_to_cmudict``.
    Characters that match no key are not reported; they simply do not appear
    in the result.

    Args:
        espeak: IPA transcription of one word.

    Returns:
        The mapped phones joined by single spaces ("" when nothing matches).
    """
    without_stress = espeak.replace("ˈ", "").replace("ˌ", "")
    symbols = espeak_regex.findall(without_stress)
    return " ".join(espeak_to_cmudict[symbol] for symbol in symbols)

In [40]:
# First worked example: the transcript, an IPA transcription wrapped in
# slashes, the recording it belongs to, and the bounds of the utterance.
# NOTE(review): EGFILE is an absolute path on one machine; the notebook will
# not run elsewhere without changing it.
# NOTE(review): EGSTART/EGEND are presumably seconds into EGFILE, matching how
# align_with_pocketsphinx treats its start/end arguments — TODO confirm.
EGTEXT = "Yeah, that's true. I mean, they are the same size and they are a little bit, but I think I I should go more for something that style."
EGPHON = "/jˈæ ðˈæs tɹˈuː ə mˈiːn ðˈeɪ ɚ ðə sˈeɪm sˈaɪz ən ðˈeɪ ɚ ə lˈɪɾə bˈɪɾ bˈʌt ˈaɪ θˈɪŋk ˈaɪ ˈaɪ ʃˈʊ ɡˈoʊ mˈɔːɹ fɚ sˈʌmθɪŋ ðˈæt stˈaɪl./"
EGFILE = "/Users/joregan/Playing/hsi/audio/hsi_7_0719_210_001_main.wav"
EGSTART = 70.028
EGEND = 75.441

In [84]:
# Second worked example. These three assignments rebind EGTEXT, EGPHON and
# EGFILE from the earlier cell; EGSTART and EGEND are not reassigned here.
# NOTE(review): EGFILE is again an absolute, machine-specific path.
EGTEXT = "no it's a cheetah, ah, yes it's a cheetah ah and, uh, that one is from, uh, India and, uh, it's really"
EGPHON = "/nˈoʊ ɪts ɐ tʃˈiːt ˈɑːɛ jˈɑs ɪts ɐ tʃˈiːtə ˈɑː ænd ˈɛ ðˈæt wˌʌn ɪz fɹˈɑːm ˈɛ ˈɪndiæ ænd ˈɛ ɪts ɹˈili/"
EGFILE = "/Users/joregan/Playing/hsi/audio/hsi_5_0718_211_002_main.wav"

# A further utterance with no phone string of its own. EGSTART2/EGEND2 are
# passed as the start/end of the alignment call later in the notebook, where
# they are multiplied by 1000 to obtain milliseconds.
EGTEXT2 = "So it was standing somewhere in the middle of the room and it was flying around."
EGSTART2 = 122.709
EGEND2 = 126.344

In [7]:
def normword(text):
    """Normalise one orthographic token.

    Args:
        text: A single whitespace-delimited token.

    Returns:
        The token lower-cased, with any leading or trailing ``,.;:!?``
        characters removed. Punctuation inside the token is kept.
    """
    return text.strip(",.;:!?").lower()

def normphon(phon):
    """Remove leading and trailing ``,.;:!?`` characters from a phone string.

    Args:
        phon: Transcription of a single word.

    Returns:
        The transcription with edge punctuation removed; case is untouched.
    """
    return phon.strip(",.;:!?")

def make_lexicon(text, phon):
    """Pair each word of ``text`` with the CMUdict form of its transcription.

    Both strings are split on single spaces and matched up by position.

    Args:
        text: Orthographic transcript.
        phon: Space-separated per-word IPA transcription. One pair of
            enclosing slashes is removed if present.

    Returns:
        A list of unique ``(word, phones)`` tuples. The list is built from a
        set, so its order is arbitrary.

    Raises:
        AssertionError: If the two strings yield different token counts.
    """
    wrapped_in_slashes = phon.startswith("/") and phon.endswith("/")
    transcription = phon[1:-1] if wrapped_in_slashes else phon
    words = [normword(token) for token in text.split(" ")]
    phonwords = [cmudictify(normphon(token)) for token in transcription.split(" ")]
    assert len(words) == len(phonwords)
    unique_pairs = set(zip(words, phonwords))
    return list(unique_pairs)
    

In [8]:
def make_ps_dict(entries):
    """Render lexicon entries as pronunciation-dictionary lines.

    Entries are sorted first. The first pronunciation seen for a word is
    written as ``word PHONES``; each further pronunciation of the same word
    gets a numbered suffix: ``word(2) PHONES``, ``word(3) PHONES``, and so on.

    Args:
        entries: Iterable of ``(word, phones)`` pairs.

    Returns:
        A list of dictionary lines, without trailing newlines.
    """
    seen = {}
    lines = []
    for entry in sorted(entries):
        word, phones = entry[0], entry[1]
        occurrence = seen.get(word, 0) + 1
        seen[word] = occurrence
        suffix = "" if occurrence == 1 else f"({occurrence})"
        lines.append(f"{word}{suffix} {phones}")
    return lines

In [9]:
def make_fsg_transitions_from_text(text):
    """Build a linear chain of transitions, one per token of ``text``.

    Args:
        text: Transcript, split on single spaces.

    Returns:
        A list of ``(from_state, to_state, 1.0, word)`` tuples in which token
        ``i`` (after ``normword``) links state ``i`` to state ``i + 1``.
    """
    return [
        (index, index + 1, 1.0, normword(token))
        for index, token in enumerate(text.split(" "))
    ]

In [69]:
def espeakify(word):
    """Return espeak's IPA transcription of ``word`` using the en-us voice.

    The word is sent to espeak on standard input instead of being spliced
    into a shell command line, so apostrophes and other shell metacharacters
    in ``word`` (e.g. "it's") can neither break nor alter the command.

    Args:
        word: Text to transcribe.

    Returns:
        The non-empty lines of espeak's standard output joined with single
        spaces and stripped of surrounding whitespace; "" if espeak prints
        nothing. A non-zero exit status is not treated as an error.

    Raises:
        FileNotFoundError: If no ``espeak`` executable can be found.
    """
    completed = subprocess.run(
        ["espeak", "--ipa", "-v", "en-us", "-q"],
        input=f"{word}\n",
        capture_output=True,
        text=True,
        encoding="utf-8",  # the IPA output is non-ASCII
        check=False,
    )
    lines = [line for line in completed.stdout.splitlines() if line != ""]
    return " ".join(lines).strip()

In [101]:
def lex_from_espeak(text, overrides=None, concat=False):
    """Build a lexicon for ``text`` from espeak, with optional overrides.

    Args:
        text: Transcript, split on single spaces and normalised with
            ``normword``.
        overrides: Optional mapping from a normalised word to either one
            transcription string or a list of them. A word found here uses
            these transcriptions instead of espeak's.
        concat: When true, an overridden word also keeps espeak's own
            transcription, listed ahead of the overrides.

    Returns:
        A list of unique ``(word, phones)`` tuples in first-seen order, with
        every transcription passed through ``normphon`` and ``cmudictify``.
    """
    words = [normword(token) for token in text.split(" ")]
    default = {word: espeakify(word) for word in words}
    res = []

    def remember(word, transcription):
        # Append the converted pair unless an identical one is already listed.
        pair = (word, cmudictify(normphon(transcription)))
        if pair not in res:
            res.append(pair)

    for word in words:
        has_override = overrides is not None and word in overrides
        if not has_override:
            remember(word, default[word])
            continue

        if concat:
            remember(word, default[word])

        custom = overrides[word]
        if type(custom) is list:
            for transcription in custom:
                remember(word, transcription)
        else:
            remember(word, custom)
    return res

In [91]:
# Load the example recording and set its frame rate to 16000.
audio = AudioSegment.from_file(EGFILE).set_frame_rate(16000)

In [106]:
def align_with_pocketsphinx(text, phones, audio, start, end, overrides=None, concat_dict=False):
    """Align the words of ``text`` to a slice of ``audio`` with pocketsphinx.

    A pronunciation dictionary and a linear word-sequence grammar are built
    from ``text``, the requested slice is exported as raw PCM, and the decoder
    is run over it with that grammar active.

    The dictionary and PCM files live in a temporary directory that is
    removed when decoding finishes or fails, so nothing is left on disk.

    Args:
        text: Transcript of the slice.
        phones: Per-word transcription handed to ``make_lexicon``. If empty
            or None, the lexicon comes from ``lex_from_espeak`` instead.
        audio: Sliceable audio object with an ``export`` method. The notebook
            passes a pydub ``AudioSegment``.
        start: Start of the slice; multiplied by 1000 to get the slice index.
        end: End of the slice, in the same unit as ``start``.
        overrides: Passed through to ``lex_from_espeak``.
        concat_dict: Passed to ``lex_from_espeak`` as its ``concat`` argument.

    Returns:
        A list of dicts, one per decoder segment, with the keys ``word``,
        ``start_ms``, ``end_ms`` and ``phones``. Times are offset by the
        start of the slice.
    """
    if not phones:
        lex = lex_from_espeak(text, overrides, concat_dict)
    else:
        lex = make_lexicon(text, phones)
    entries = make_ps_dict(lex)

    istart = int(start * 1000)
    iend = int(end * 1000)
    audioseg = audio[istart:iend]

    fsgt = make_fsg_transitions_from_text(text)
    start_state = fsgt[0][0]
    end_state = fsgt[-1][1]

    segments = []

    with tempfile.TemporaryDirectory() as workdir:
        dict_path = os.path.join(workdir, "align.dict")
        raw_path = os.path.join(workdir, "align.raw")

        with open(dict_path, "w") as dictout:
            for entry in entries:
                dictout.write(entry + "\n")

        exported = audioseg.export(raw_path, format="s16le", parameters=["-ac", "1", "-acodec", "pcm_s16le", "-f", "s16le", "-ar", "16000"])
        # NOTE(review): pydub's export is understood to return the open output
        # handle; close it so the data is flushed and the file can be removed.
        if hasattr(exported, "close"):
            exported.close()

        with open(raw_path, "rb") as rawin:
            pcm = rawin.read()

        decoder = pocketsphinx.Decoder(lm=None, dict=dict_path)

        fsg = decoder.create_fsg("dummy", start_state, end_state, fsgt)
        decoder.add_fsg("dummy", fsg)
        decoder.activate_search("dummy")

        decoder.start_utt()
        decoder.process_raw(pcm, full_utt=True)
        decoder.end_utt()

        for seg in decoder.seg():
            segments.append({
                "word": seg.word,
                # NOTE(review): the factor 10 assumes one decoder frame is
                # 10 ms — confirm if the decoder's frame rate is changed.
                "start_ms": (seg.start_frame * 10) + istart,
                "end_ms": (seg.end_frame * 10) + istart,
                "phones": decoder.lookup_word(seg.word)
            })

    return segments

In [107]:
# An empty phones argument makes align_with_pocketsphinx build its lexicon
# with lex_from_espeak rather than from a supplied transcription.
segs = align_with_pocketsphinx(EGTEXT2, "", audio, EGSTART2, EGEND2)

In [108]:
segs

[{'word': 'so', 'start_ms': 122709, 'end_ms': 122789, 'phones': 'S OW'},
 {'word': 'it', 'start_ms': 122799, 'end_ms': 122899, 'phones': 'IH T'},
 {'word': 'was', 'start_ms': 122909, 'end_ms': 122989, 'phones': 'W AH Z'},
 {'word': 'standing',
  'start_ms': 122999,
  'end_ms': 123589,
  'phones': 'S T AE N D IH NG'},
 {'word': '<sil>', 'start_ms': 123599, 'end_ms': 123809, 'phones': 'SIL'},
 {'word': 'somewhere',
  'start_ms': 123819,
  'end_ms': 124219,
  'phones': 'S AH M W EH R'},
 {'word': 'in', 'start_ms': 124229, 'end_ms': 124319, 'phones': 'IH N'},
 {'word': 'the', 'start_ms': 124329, 'end_ms': 124389, 'phones': 'DH AH'},
 {'word': 'middle',
  'start_ms': 124399,
  'end_ms': 124599,
  'phones': 'M IH D AH L'},
 {'word': 'of', 'start_ms': 124609, 'end_ms': 124739, 'phones': 'AH V'},
 {'word': 'the', 'start_ms': 124749, 'end_ms': 124839, 'phones': 'DH AH'},
 {'word': 'room', 'start_ms': 124849, 'end_ms': 125179, 'phones': 'R UW M'},
 {'word': 'and', 'start_ms': 125189, 'end_ms': 1