# Extract phones from TextGrid

> Incomplete, needs work

- branch: master
- badges: false
- hidden: true
- categories: [hsi, textgrid]

The input is a TextGrid produced by the Montreal Forced Aligner (MFA), so the code expects tiers with fixed names: `utterances`, `words`, and `phones`.

In [14]:
from praatio import textgrid

def get_phone_sequences(tgfile):
    """Read a TextGrid and nest its phones inside words inside utterances.

    The tiers named "utterances", "words" and "phones" are read in that
    order. Intervals whose label is empty after stripping whitespace are
    dropped. Every remaining interval becomes a dict with "start", "end"
    and "text" keys.

    Nesting is decided by end time only: a word is attached to the first
    utterance whose end is not earlier than the word's end, and a phone is
    attached to the first word whose end is not earlier than the phone's
    end. Start times are never compared.

    Args:
        tgfile: path handed to ``textgrid.openTextgrid``.

    Returns:
        The list of utterance dicts; each has an extra "words" list, and
        each word dict has an extra "phones" list.
    """
    # Second positional argument is presumably praatio's
    # includeEmptyIntervals flag -- TODO confirm against the installed version.
    tg = textgrid.openTextgrid(tgfile, False)

    def labelled_intervals(tier_name):
        # Keep only the intervals of one tier that carry a non-blank label.
        intervals = []
        for entry in tg.getTier(tier_name).entries:
            label = entry.label.strip()
            if label:
                intervals.append(
                    {"start": entry.start, "end": entry.end, "text": label}
                )
        return intervals

    utterances = labelled_intervals("utterances")
    words = labelled_intervals("words")
    phones = labelled_intervals("phones")

    # Single forward pass: the cursors into `words` and `phones` only ever
    # advance, so each interval is attached at most once.
    word_idx = 0
    phone_idx = 0
    for utterance in utterances:
        utterance["words"] = []
        while word_idx < len(words) and words[word_idx]["end"] <= utterance["end"]:
            word = words[word_idx]
            word_idx += 1
            word["phones"] = []
            while phone_idx < len(phones) and phones[phone_idx]["end"] <= word["end"]:
                word["phones"].append(phones[phone_idx])
                phone_idx += 1
            utterance["words"].append(word)

    return utterances

In [17]:
def get_dictionary(phone_sequence):
    """Build pronunciation-dictionary lines from nested utterance data.

    Each line is ``"<word text>\\t<phone labels joined by single spaces>"``.
    Duplicate lines are collapsed.

    Args:
        phone_sequence: iterable of utterance dicts, each with a "words"
            list whose items have "text" and a "phones" list of dicts with
            "text" (the structure returned by get_phone_sequences).

    Returns:
        A sorted list of unique dictionary lines. Sorting makes the result
        reproducible across kernel restarts; iterating a set of strings
        directly does not guarantee a stable order between runs.
    """
    entries = set()
    for utterance in phone_sequence:
        for word in utterance["words"]:
            pronunciation = " ".join(phone["text"] for phone in word["phones"])
            entries.add(f"{word['text']}\t{pronunciation}")
    return sorted(entries)

In [20]:
# Phone-label -> IPA table, one "LABEL IPA" pair per line separated by a
# single space. Vowel labels carry a trailing 0/1/2 digit; in this table the
# 1 rows are prefixed with "ˈ" and the 2 rows with "ˌ".
# NOTE: AH0 appears twice (ə, then ɐ). The parsing cell only stores a label
# the first time it is seen, so the ə row is the one that takes effect.
MAPPING = """
AA0 ɑː
AA1 ˈɑː
AA2 ˌɑː
AE0 æ
AE1 ˈæ
AE2 ˌæ
AH0 ə
AH0 ɐ
AH1 ˈʌ
AH2 ˌʌ
AO0 ɔː
AO1 ˈɔː
AO2 ˌɔː
AW0 aʊ
AW1 ˈaʊ
AW2 ˌaʊ
AY0 aɪ
AY1 ˈaɪ
AY2 ˌaɪ
B b
CH tʃ
D d
DH ð
EH0 ɛ
EH1 ˈɛ
EH2 ˌɛ
ER0 ɚ
ER1 ˈɜː
ER2 ˌɜː
EY0 eɪ
EY1 ˈeɪ
EY2 ˌeɪ
F f
G ɡ
HH h
IH0 ɪ
IH1 ˈɪ
IH2 ˌɪ
IY0 i
IY1 ˈiː
IY2 ˌiː
JH dʒ
K k
L l
M m
N n
NG ŋ
OW0 oʊ
OW1 ˈoʊ
OW2 ˌoʊ
OY0 ɔɪ
OY1 ˈɔɪ
OY2 ˌɔɪ
P p
R ɹ
S s
SH ʃ
T t
TH θ
UH0 ʊ
UH1 ˈʊ
UH2 ˌʊ
UW0 uː
UW1 ˈuː
UW2 ˌuː
V v
W w
Y j
Z z
ZH ʒ
"""

In [63]:
# Optional extra rows for the table: when `extended` is true, a "DX ɾ" line
# is appended to MAPPING.
# NOTE: this cell is not idempotent -- each re-run appends the row again.
# That is harmless for the parsing cell (it keeps only the first row per
# label) but MAPPING itself keeps growing until the kernel is restarted.
extended = True
if extended:
    MAPPING += """
DX ɾ
"""

In [64]:
def _parse_phone_mapping(mapping_text):
    """Turn "LABEL IPA" lines into a dict, keeping the first row per label.

    Blank and whitespace-only lines are skipped. Any other line that does
    not consist of exactly two whitespace-separated fields is printed (so it
    can be spotted and fixed) and otherwise ignored.

    Args:
        mapping_text: multi-line string with one pair per line.

    Returns:
        dict mapping each label to the IPA string from its first occurrence.
    """
    table = {}
    for raw_line in mapping_text.split("\n"):
        fields = raw_line.split()
        if not fields:
            # Covers both empty lines and lines holding only whitespace.
            continue
        if len(fields) != 2:
            print(raw_line.strip())
            continue
        label, ipa = fields
        # First occurrence wins, so duplicated labels later in the text
        # (e.g. the second AH0 row) do not override earlier ones.
        table.setdefault(label, ipa)
    return table


cmudict_to_espeak = _parse_phone_mapping(MAPPING)

In [65]:
def espeakify(phlist, sep=""):
    """Convert a word's phone labels to one IPA string via cmudict_to_espeak.

    Args:
        phlist: list of phone labels. A label may itself contain several
            space-separated symbols; each symbol is looked up separately.
            Empty-string labels are ignored.
        sep: string placed between the converted symbols.

    Returns:
        The converted symbols joined with ``sep``; the empty string when
        ``phlist`` is exactly ``["spn"]`` or ``["sil"]``.

    Raises:
        KeyError: if a symbol has no entry in ``cmudict_to_espeak``.
    """
    if phlist in (["spn"], ["sil"]):
        return ""

    symbols = []
    for label in phlist:
        if label == "":
            continue
        # Splitting a label with no space yields the label itself, so one
        # code path handles single and multi-symbol labels alike.
        symbols.extend(cmudict_to_espeak[part] for part in label.split(" "))
    return sep.join(symbols)

In [66]:
import re

def get_utterances_like_espeak(phone_sequence):
    """Map each utterance's text to a space-separated IPA rendering.

    Every word's phone labels are converted with ``espeakify`` and the
    per-word results are joined with single spaces; runs of two or more
    spaces (left behind by words that convert to an empty string) are
    collapsed to one. Utterances whose text both starts with "[" and ends
    with "]" are skipped.

    Args:
        phone_sequence: iterable of utterance dicts with "text" and a
            "words" list; each word has a "phones" list of dicts with "text".

    Returns:
        dict from utterance text to its rendering. Utterances with the same
        text share a key, so the last one processed is the one kept.
    """
    pairs = {}
    for utterance in phone_sequence:
        text = utterance["text"]
        if text.startswith("[") and text.endswith("]"):
            continue
        rendered_words = (
            espeakify([phone["text"] for phone in word["phones"]])
            for word in utterance["words"]
        )
        pairs[text] = re.sub("  +", " ", " ".join(rendered_words))
    return pairs

In [100]:
# Load one TextGrid into nested utterance/word/phone dicts (`ps`), then build
# the utterance-text -> IPA-like string mapping (`utts`) that the comparison
# loop prints.
# NOTE(review): the paths are absolute and machine-specific; point them at a
# local copy of the TextGrid before running.
ps = get_phone_sequences("/Users/joregan/Playing/textgrids_shared/hsi_7_0719_210_001_main.TextGrid")
# ps = get_phone_sequences("/Users/joregan/Desktop/hsi_7_0719_227_003_inter.TextGrid")
utts = get_utterances_like_espeak(ps)

In [86]:
def run_espeak(text):
    """Return espeak's en-us IPA transcription of ``text`` as one line.

    The text is sent to espeak on standard input instead of being pasted
    into a shell command line, so quotes, ``$`` and backticks in an
    utterance are passed through literally rather than being interpreted by
    the shell.

    Args:
        text: the utterance to transcribe.

    Returns:
        espeak's output lines joined with single spaces, with leading and
        trailing whitespace removed. Anything espeak writes to stderr is
        included in the output rather than raised as an error.

    Raises:
        FileNotFoundError: if no ``espeak`` executable is on the PATH.
    """
    import subprocess  # local import keeps this cell runnable on its own

    result = subprocess.run(
        ["espeak", "-v", "en-us", "--ipa", "-q"],
        input=f"{text}\n",  # trailing newline mirrors what `echo` supplied
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        encoding="utf-8",  # IPA output is non-ASCII; do not depend on locale
        check=False,
    )
    return " ".join(result.stdout.splitlines()).strip()

In [95]:
# Whole-token substitutions for the TextGrid-derived transcription:
# cmudict_fixes() replaces any space-delimited token equal to a key with the
# corresponding value. Tokens must match a key exactly to be replaced.
FIXES = {
    "kˈɑːlɚ": "kˈʌlɚ",
    "dʒɪs": "dʒˈʌs",
    "fɹɚ": "fɚ",
    "fɹə": "fɚ",
}

In [91]:
def cmudict_fixes(text):
    """Apply the FIXES table to each space-delimited token of ``text``.

    Args:
        text: transcription whose tokens are separated by single spaces.

    Returns:
        ``text`` with every token that exactly equals a FIXES key replaced
        by its value; all other tokens, and the spacing, are unchanged.
    """
    return " ".join(FIXES.get(token, token) for token in text.split(" "))

In [98]:
def assimilations(text):
    """Drop a word-final "ð" or "d" before certain word-initial consonants.

    Three literal rewrites are applied in order, each removing the consonant
    before the space and keeping the space plus the following consonant:
    "ð ð" -> " ð", "d ð" -> " ð", "d d" -> " d".

    Args:
        text: space-separated transcription.

    Returns:
        The transcription after all three rewrites.
    """
    rewrites = (
        ("ð ð", " ð"),
        ("d ð", " ð"),
        ("d d", " d"),
    )
    for pattern, replacement in rewrites:
        text = text.replace(pattern, replacement)
    return text

In [101]:
# For every utterance print three lines followed by a blank one:
#   1. the utterance text,
#   2. the TextGrid-derived transcription after FIXES and assimilations,
#   3. espeak's transcription of the same text.
for utt, aligned_ipa in utts.items():
    print(utt)
    print(assimilations(cmudict_fixes(aligned_ipa)))
    print(run_espeak(utt))
    print()

Okay, yeah, you're not uh in the middle of anything.
əkˈeɪ jˈɛ jˈʊɹ nˈɑːt ˈʌ ɪn ðə mˈɪdəl əv hˈɛniθˌɪŋ
oʊkˈeɪ  jˈɛh  jʊɹ nˌɑːt ˈʌ ɪnðə mˈɪdəl ʌv ˈɛnɪθˌɪŋ

Yeah, I mean, I have my my you know thoughts about it, because I think I didn't really catch the... the essence of the place.
jˈɛ hˈaɪ mˈiːn ˈaɪ hˈæv m mˈaɪ jˈuː nˈoʊ θˈɔːts əbˈaʊt ˈɑː vɪkəz hˈaɪ θˈɪŋk ˈaɪ dˈɪdənt ɹˈiːli kˈætʃ ðə ðə ˈɛsəns hˈʌv ðə plˈeɪs
jˈɛh  aɪ mˈiːn  aɪ hæv maɪ maɪ juː nˈoʊ θˈɔːts ɐbˈaʊt ɪt  bɪkˈʌz aɪ θˈɪŋk aɪ dˈɪdnt ɹˈiəli kˈætʃ ðə  ðɪ ˈɛsəns ʌvðə plˈeɪs

So I I was planning to do some changes, but um but do you have some uh suggestion that you think that I should uh change?
sˈoʊ ˈaɪ ˈaɪ wˈʌz plˈænɪŋ tə dˈuː sˈʌm tʃˈeɪndʒɪz bˈʌt hˈæm bˈʌt d jˈuː hˈæb sˈʌm ˈɑː səɡdʒˈɛstʃən ðət jˈuː θˈɪŋk ðˈæt ˈaɪ ʃˈʊd ˈʌ tʃˈeɪndʒ
sˌoʊ aɪ aɪ wʌz plˈænɪŋ tə dˈuː sˌʌm tʃˈeɪndʒᵻz  bˌʌt ˈʌm bˌʌt dˈuː juː hæv sˌʌm ˈʌ sədʒˈɛstʃən ðæt juː θˈɪŋk ðæt aɪ ʃˌʊd ˈʌ tʃˈeɪndʒ

No, of course I am, I am, but uh but you know, I don't know, I am a li

In [62]:
import shlex

# Write /tmp/run_ffmpeg.sh: one ffmpeg command per utterance in `ps`, each
# cutting that utterance's time span out of the source recording into a
# mono, 16 kHz, 16-bit PCM wav under /tmp/phoninput/.
# NOTE(review): `filestem` is not the file stem of the TextGrid loaded into
# `ps` in the cell above (hsi_7_0719_210_001_main) -- confirm that the
# timestamps and the audio file belong together before running the script.
# NOTE(review): the script does not create /tmp/phoninput/; presumably it
# must exist before the script is run -- confirm.
filestem = "hsi_7_0719_209_003_main"
audio_dir = "/Users/joregan/Playing/hsi/audio/"

# Paths are shell-quoted so the generated script still works if a directory
# or file stem ever contains spaces or shell metacharacters; for the current
# values quoting leaves the text unchanged.
source_wav = shlex.quote(f"{audio_dir}{filestem}.wav")

with open("/tmp/run_ffmpeg.sh", "w") as outf:
    for item in ps:
        start = item["start"]
        end = item["end"]
        dur = end - start
        clip_wav = shlex.quote(f"/tmp/phoninput/{filestem}_{start}_{end}.wav")
        outf.write(
            f"ffmpeg -i {source_wav} -acodec pcm_s16le -ac 1 -ar 16000 "
            f"-ss {start} -t {dur} {clip_wav}\n"
        )