In [1]:
from pydub import AudioSegment

In [14]:
%%capture
%pip install pocketsphinx

In [31]:
MAPPING = """
ɑː AA
æ AE
ə AH
ɐ AH
ʌ AH
ɔː AO
aʊ AW
aɪ AY
b B
tʃ CH
d D
ð DH
ɛ EH
ɚ ER
ɜː ER
eɪ EY
f F
ɡ G
h HH
ɪ IH
i IY
iː IY
dʒ JH
k K
l L
m M
n N
ŋ NG
oʊ OW
ɔɪ OY
p P
ɹ R
s S
ʃ SH
t T
θ TH
ʊ UH
uː UW
v V
w W
j Y
z Z
ʒ ZH
ɾ D
"""

In [62]:
espeak_to_cmudict = {}
for line in MAPPING.split("\n"):
    if line == "":
        continue
    line = line.strip()
    parts = line.split(" ")

    if len(parts) != 2:
        print(line)
        continue
    k, v = line.split(" ")
    if not k in espeak_to_cmudict:
        espeak_to_cmudict[k] = v
    

In [75]:
import re

cmudict_keys = espeak_to_cmudict.keys()
cmudict_keys = sorted(cmudict_keys, key=len, reverse=True)
espeak_regex = re.compile(rf"({'|'.join(cmudict_keys)})")

def cmudictify(espeak):
    espeak = espeak.replace("ˈ", "").replace("ˌ", "")
    out = []
    pos = 0
    while(match := espeak_regex.search(espeak, pos)) is not None:
        print(match)
        pos = match.start() + 1
        out.append(espeak_to_cmudict[match[1]])
    return " ".join(out)

In [68]:
espeak_regex

re.compile(r'.*(ɑː|ɔː|aʊ|aɪ|tʃ|ɜː|eɪ|iː|dʒ|oʊ|ɔɪ|uː|æ|ə|ɐ|ʌ|b|d|ð|ɛ|ɚ|f|ɡ|h|ɪ|i|k|l|m|n|ŋ|p|ɹ|s|ʃ|t|θ|ʊ|v|w|j|z|ʒ|ɾ).*',
           re.UNICODE)

In [58]:
EGTEXT = "Yeah, that's true. I mean, they are the same size and they are a little bit, but I think I I should go more for something that style."
EGPHON = "/jˈæ ðˈæs tɹˈuː ə mˈiːn ðˈeɪ ɚ ðə sˈeɪm sˈaɪz ən ðˈeɪ ˈɑːɹ ə lˈɪɾə bˈɪɾ bˈʌt ˈaɪ θˈɪŋk ˈaɪ ˈaɪ ʃˈʊ ɡˈoʊ mˈɔːɹ fɚ sˈʌmθɪŋ ðˈæt stˈaɪl./"
EGFILE = "/Users/joregan/Playing/hsi/audio/hsi_7_0719_210_001_main.wav"
EGSTART = 70.028
EGEND = 75.441

In [72]:
def normword(text):
    text = text.strip(",.;:!?")
    return text.lower()

def normphon(phon):
    phon = phon.strip(",.;:!?")
    return phon

def make_lexicon(text, phon):
    if phon.startswith("/") and phon.endswith("/"):
        phon = phon[1:-1]
    words = [normword(x) for x in text.split(" ")]
    phonwords = [cmudictify(normphon(x)) for x in phon.split(" ")]
    assert len(words) == len(phonwords)
    output = list(set(zip(words, phonwords)))
    return output
    

In [76]:
cmudictify("ðeɪ")


<re.Match object; span=(0, 1), match='ð'>
<re.Match object; span=(1, 3), match='eɪ'>
<re.Match object; span=(2, 3), match='ɪ'>


'DH EY IH'

In [77]:
re.findall(espeak_regex, "ðeɪ")

['ð', 'eɪ']

In [36]:
make_lexicon(EGTEXT, EGPHON)

[('they', 'DH EY IH'),
 ('i', 'AH'),
 ('for', 'F ER'),
 ('more', 'M AO R'),
 ('size', 'S AY IH Z'),
 ('a', 'AH'),
 ('the', 'DH AH'),
 ('bit', 'B IH'),
 ('but', 'B AH T'),
 ('that', 'DH AE T'),
 ('yeah', 'Y AE'),
 ('are', 'ER'),
 ('something', 'S AH M TH IH NG'),
 ('little', 'L IH AH'),
 ("that's", 'DH AE S'),
 ('and', 'AH N'),
 ('are', 'AA R'),
 ('same', 'S EY IH M'),
 ('think', 'TH IH NG K'),
 ('true', 'T R UW'),
 ('mean', 'M IY N'),
 ('go', 'G OW UH'),
 ('style', 'S T AY IH L'),
 ('i', 'AY IH'),
 ('should', 'SH UH')]