In [1]:
from pydub import AudioSegment

In [14]:
%%capture
%pip install pocketsphinx

In [31]:
MAPPING = """
ɑː AA
æ AE
ə AH
ɐ AH
ʌ AH
ɔː AO
aʊ AW
aɪ AY
b B
tʃ CH
d D
ð DH
ɛ EH
ɚ ER
ɜː ER
eɪ EY
f F
ɡ G
h HH
ɪ IH
i IY
iː IY
dʒ JH
k K
l L
m M
n N
ŋ NG
oʊ OW
ɔɪ OY
p P
ɹ R
s S
ʃ SH
t T
θ TH
ʊ UH
uː UW
v V
w W
j Y
z Z
ʒ ZH
ɾ D
"""

In [62]:
espeak_to_cmudict = {}
for line in MAPPING.split("\n"):
    if line == "":
        continue
    line = line.strip()
    parts = line.split(" ")

    if len(parts) != 2:
        print(line)
        continue
    k, v = line.split(" ")
    if not k in espeak_to_cmudict:
        espeak_to_cmudict[k] = v
    

In [81]:
import re

cmudict_keys = espeak_to_cmudict.keys()
cmudict_keys = sorted(cmudict_keys, key=len, reverse=True)
espeak_regex = re.compile(rf"({'|'.join(cmudict_keys)})")

def cmudictify(espeak):
    espeak = espeak.replace("ˈ", "").replace("ˌ", "")
    return " ".join([espeak_to_cmudict[x] for x in re.findall(espeak_regex, espeak)])

In [58]:
EGTEXT = "Yeah, that's true. I mean, they are the same size and they are a little bit, but I think I I should go more for something that style."
EGPHON = "/jˈæ ðˈæs tɹˈuː ə mˈiːn ðˈeɪ ɚ ðə sˈeɪm sˈaɪz ən ðˈeɪ ˈɑːɹ ə lˈɪɾə bˈɪɾ bˈʌt ˈaɪ θˈɪŋk ˈaɪ ˈaɪ ʃˈʊ ɡˈoʊ mˈɔːɹ fɚ sˈʌmθɪŋ ðˈæt stˈaɪl./"
EGFILE = "/Users/joregan/Playing/hsi/audio/hsi_7_0719_210_001_main.wav"
EGSTART = 70.028
EGEND = 75.441

In [79]:
def normword(text):
    text = text.strip(",.;:!?")
    return text.lower()

def normphon(phon):
    phon = phon.strip(",.;:!?")
    return phon

def make_lexicon(text, phon):
    if phon.startswith("/") and phon.endswith("/"):
        phon = phon[1:-1]
    words = [normword(x) for x in text.split(" ")]
    phonwords = [cmudictify(normphon(x)) for x in phon.split(" ")]
    assert len(words) == len(phonwords)
    output = list(set(zip(words, phonwords)))
    return output
    

In [83]:
lex = make_lexicon(EGTEXT, EGPHON)

In [141]:
audio = AudioSegment.from_file(EGFILE)

In [142]:
audio = audio.set_frame_rate(16000)

In [143]:
seg = audio[int(EGSTART * 1000):int(EGEND * 1000)]

In [145]:
seg.get_array_of_samples('B')

array('B', [253, 165, 234, 255, 79, 71, 230, 255, 185, 248, 215, 255, 88, 90, 211, 255, 158, 169, 208, 255, 207, 138, 217, 255, 177, 117, 218, 255, 228, 194, 220, 255, 253, 201, 224, 255, 21, 164, 229, 255, 92, 225, 231, 255, 21, 164, 229, 255, 198, 128, 222, 255, 108, 11, 240, 255, 189, 154, 240, 255, 51, 113, 248, 255, 153, 65, 238, 255, 11, 55, 250, 255, 164, 129, 241, 255, 2, 135, 253, 255, 248, 61, 8, 0, 140, 104, 9, 0, 35, 255, 3, 0, 177, 202, 20, 0, 39, 197, 18, 0, 18, 141, 15, 0, 158, 236, 15, 0, 226, 162, 20, 0, 27, 70, 21, 0, 75, 21, 12, 0, 170, 107, 13, 0, 1, 247, 13, 0, 75, 3, 17, 0, 31, 39, 17, 0, 240, 177, 1, 0, 25, 7, 4, 0, 201, 236, 238, 255, 146, 172, 245, 255, 129, 85, 238, 255, 222, 135, 226, 255, 220, 0, 229, 255, 7, 194, 224, 255, 66, 227, 213, 255, 121, 255, 216, 255, 185, 19, 220, 255, 51, 176, 219, 255, 135, 189, 231, 255, 231, 28, 242, 255, 112, 218, 240, 255, 40, 76, 249, 255, 56, 73, 2, 0, 133, 211, 16, 0, 237, 42, 4, 0, 170, 107, 13, 0, 116, 52, 29, 0, 145, 

In [128]:
def make_ps_dict(entries):
    counts = {}
    output = []
    lex = sorted(entries)
    for entry in lex:
        count = 1
        if not entry[0] in counts:
            counts[entry[0]] = 1
        else:
            counts[entry[0]] += 1
            count = counts[entry[0]]
        if count != 1:
            subscript = f"({count})"
        else:
            subscript = ""
        output.append(f"{entry[0]}{subscript} {entry[1]}")
    return output

In [97]:
def make_fsg_transitions_from_text(text):
    words = [normword(x) for x in text.split(" ")]
    enum = [x for x in enumerate(words)]
    trans = [(x[0], x[0] + 1, 1.0, x[1]) for x in enum]
    return trans

In [99]:
fsgt = make_fsg_transitions_from_text(EGTEXT)

In [100]:
start_state = fsgt[0][0]
end_state = fsgt[-1][1]

In [160]:
import pocketsphinx

In [161]:
import tempfile
entries = make_ps_dict(lex)

with (
    tempfile.NamedTemporaryFile(suffix=".dict") as dictf,
    tempfile.NamedTemporaryFile(suffix=".wav") as wavf,
):
    with open(dictf.name, "w") as dictout:
        for entry in entries:
            dictout.write(entry + "\n")
    
    seg.export(wavf.name, format="wav")

    decoder = pocketsphinx.Decoder(lm=None, dict=dictf.name)
    fsg = decoder.create_fsg("dummy", start_state, end_state, fsgt)
    decoder.add_fsg("dummy", fsg)
    decoder.activate_search("dummy")
    decoder.start_utt()
    decoder.process_raw(wavf.read(), full_utt=True)
    decoder.end_utt()

In [155]:
import tempfile
entries = make_ps_dict(lex)

with open("/tmp/testing.dict", "w") as dictout:
    for entry in entries:
        dictout.write(entry + "\n")

decoder = pocketsphinx.Decoder(lm=None, dict="/tmp/testing.dict")
fsg = decoder.create_fsg("dummy", start_state, end_state, fsgt)
decoder.add_fsg("dummy", fsg)
decoder.activate_search("dummy")

In [156]:
decoder.start_utt()
decoder.process_raw(seg.get_array_of_samples('B'))

In [157]:
decoder.end_utt()

In [162]:
decoder.seg()

ERROR: "fsg_search.c", line 944: Final result does not match the grammar in frame 1082
