# Extract phones from TextGrid

> Incomplete, needs work

- branch: master
- badges: false
- hidden: true
- categories: [hsi, textgrid]

The input is a TextGrid produced by the Montreal Forced Aligner (MFA), so the tiers are expected to have fixed names: `utterances`, `words` and `phones`.

In [1]:
from praatio import textgrid

def get_phone_sequences(tgfile):
    """Read a TextGrid and nest its phones inside words inside utterances.

    The tiers named "utterances", "words" and "phones" are read; intervals
    whose label is blank after stripping are ignored.  Each remaining
    interval becomes a dict with "start", "end" and "text" keys.

    Nesting is decided by end time only: walking the tiers in order, a word
    is attached to the current utterance while its end is not later than the
    utterance's end, and a phone is attached to the current word under the
    same rule.  Start times are never compared.  Words and phones left over
    after the last utterance are not returned.

    Args:
        tgfile: path of the TextGrid file, passed to praatio's openTextgrid.

    Returns:
        The list of utterance dicts, each with an added "words" list whose
        word dicts each carry a "phones" list.
    """
    # Second positional argument is presumably praatio's
    # includeEmptyIntervals flag -- TODO confirm against the installed version.
    tg = textgrid.openTextgrid(tgfile, False)

    def get_tier_list(tiername):
        """Return the non-blank intervals of one tier as plain dicts."""
        intervals = []
        for entry in tg.getTier(tiername).entries:
            label = entry.label.strip()
            if label:
                intervals.append(
                    {"start": entry.start, "end": entry.end, "text": label}
                )
        return intervals

    utterances = get_tier_list("utterances")
    words = get_tier_list("words")
    phones = get_tier_list("phones")

    # Both cursors only ever move forward, so each tier is walked once.
    word_idx = phone_idx = 0
    for utterance in utterances:
        utterance["words"] = []
        while word_idx < len(words) and words[word_idx]["end"] <= utterance["end"]:
            word = words[word_idx]
            word["phones"] = []
            while phone_idx < len(phones) and phones[phone_idx]["end"] <= word["end"]:
                word["phones"].append(phones[phone_idx])
                phone_idx += 1
            utterance["words"].append(word)
            word_idx += 1

    return utterances

In [2]:
def get_dictionary(phone_sequence):
    """Build pronunciation-dictionary lines from aligned utterances.

    Args:
        phone_sequence: iterable of utterance dicts, each with a "words" list
            whose items have "text" and a "phones" list of dicts with "text"
            (the structure returned by get_phone_sequences).

    Returns:
        A sorted list of unique lines of the form "<word>\\t<phones>", where
        the phones are joined with single spaces.  Sorting makes the result
        reproducible between runs; iteration order of the intermediate set is
        not.
    """
    entries = set()
    for utterance in phone_sequence:
        for word in utterance["words"]:
            pronunciation = " ".join(p["text"] for p in word["phones"])
            entries.add(f"{word['text']}\t{pronunciation}")
    return sorted(entries)

In [3]:
# Lookup table from CMUdict-style phone labels to eSpeak-style IPA, one
# space-separated "LABEL ipa" pair per line.  Vowel labels carry a stress
# digit: in this table 1 is rendered with a leading ˈ and 2 with a leading ˌ.
# NOTE: AH0 is listed twice; the loop that builds cmudict_to_espeak keeps the
# first value it sees for a label, so the "AH0 ɐ" row is never used.
MAPPING = """
AA0 ɑː
AA1 ˈɑː
AA2 ˌɑː
AE0 æ
AE1 ˈæ
AE2 ˌæ
AH0 ə
AH0 ɐ
AH1 ˈʌ
AH2 ˌʌ
AO0 ɔː
AO1 ˈɔː
AO2 ˌɔː
AW0 aʊ
AW1 ˈaʊ
AW2 ˌaʊ
AY0 aɪ
AY1 ˈaɪ
AY2 ˌaɪ
B b
CH tʃ
D d
DH ð
EH0 ɛ
EH1 ˈɛ
EH2 ˌɛ
ER0 ɚ
ER1 ˈɜː
ER2 ˌɜː
EY0 eɪ
EY1 ˈeɪ
EY2 ˌeɪ
F f
G ɡ
HH h
IH0 ɪ
IH1 ˈɪ
IH2 ˌɪ
IY0 i
IY1 ˈiː
IY2 ˌiː
JH dʒ
K k
L l
M m
N n
NG ŋ
OW0 oʊ
OW1 ˈoʊ
OW2 ˌoʊ
OY0 ɔɪ
OY1 ˈɔɪ
OY2 ˌɔɪ
P p
R ɹ
S s
SH ʃ
T t
TH θ
UH0 ʊ
UH1 ˈʊ
UH2 ˌʊ
UW0 uː
UW1 ˈuː
UW2 ˌuː
V v
W w
Y j
Z z
ZH ʒ
"""

In [4]:
# Switch for rows beyond the base table: when true, the DX -> ɾ pair is
# appended to MAPPING before it is parsed.
extended = True
if extended:
    MAPPING += """
DX ɾ
"""

In [5]:
# Parse MAPPING into a dict from phone label to IPA string.
cmudict_to_espeak = {}
for line in MAPPING.splitlines():
    # Split on any run of whitespace so blank, whitespace-only, tab-separated
    # and multi-space rows are all handled the same way.
    parts = line.split()
    if not parts:
        continue

    if len(parts) != 2:
        # Report rows that are not exactly "LABEL ipa" and skip them.
        print(line.strip())
        continue
    k, v = parts
    # First occurrence wins, so a repeated label does not overwrite the
    # earlier value.
    cmudict_to_espeak.setdefault(k, v)

In [6]:
def espeakify(phlist, sep=""):
    """Convert a list of phone labels to one eSpeak-style IPA string.

    Args:
        phlist: list of phone labels.  An item may hold several labels
            separated by single spaces; empty items are skipped.
        sep: string placed between the converted phones.

    Returns:
        The converted phones joined with ``sep``.  A list that is exactly
        ["spn"] or ["sil"] yields the empty string.

    Raises:
        KeyError: if a label is missing from cmudict_to_espeak.
    """
    if phlist in (["spn"], ["sil"]):
        return ""

    converted = []
    for label in phlist:
        if label == "":
            continue
        # A label without spaces splits into itself, so one path covers both
        # the single-phone and the multi-phone item.
        converted.extend(cmudict_to_espeak[part] for part in label.split(" "))
    return sep.join(converted)

In [7]:
import re

def get_utterances_like_espeak(phone_sequence):
    """Map each utterance's text to its phones rendered in eSpeak style.

    Utterances whose text starts with "[" and ends with "]" are skipped.
    For the rest, every word's phones are converted with espeakify, the
    words are joined with single spaces, and runs of spaces (left by words
    that convert to an empty string) are collapsed.

    Args:
        phone_sequence: list of utterance dicts with "text" and "words", as
            produced by get_phone_sequences.

    Returns:
        Dict from utterance text to converted string.  Utterances with
        identical text share one key, so the last one seen is kept.
    """
    result = {}
    for utt in phone_sequence:
        label = utt["text"]
        if label.startswith("[") and label.endswith("]"):
            continue
        rendered = [
            espeakify([ph["text"] for ph in entry["phones"]])
            for entry in utt["words"]
        ]
        result[label] = re.sub("  +", " ", " ".join(rendered))
    return result

In [15]:
# Align one recording: nest its TextGrid tiers, then render every utterance as
# an eSpeak-style string keyed by the utterance text.
ps = get_phone_sequences("/Users/joregan/Playing/textgrids_shared/hsi_7_0719_210_002_main.TextGrid")
# ps = get_phone_sequences("/Users/joregan/Desktop/hsi_7_0719_227_003_inter.TextGrid")
utts = get_utterances_like_espeak(ps)

In [18]:
# Utterance text -> start time.  Utterances with identical text share a key,
# so only the last one's start is kept.
starts_from_text = {x["text"]: x["start"] for x in ps}

In [20]:
# Utterance text -> end time.  Utterances with identical text share a key,
# so only the last one's end is kept.
ends_from_text = {x["text"]: x["end"] for x in ps}

In [28]:
import json

def get_phonetic_json(filename):
    """Load a JSON file of phonetic chunks and flatten their timestamps.

    The file is expected to contain an object with a "chunks" list.  Each
    chunk must have a "text" string and a two-item "timestamp" sequence.
    Every chunk is modified in place: "[UNK]" in the text is replaced with
    "ə", "timestamp" is split into "start" and "end", and the "timestamp"
    key is removed.

    Args:
        filename: path of the JSON file.

    Returns:
        The list of modified chunk dicts, or an empty list when the object
        has no "chunks" key.
    """
    # The text is IPA, so decode explicitly as UTF-8 instead of relying on
    # the platform's default encoding.
    with open(filename, encoding="utf-8") as inf:
        data = json.load(inf)
    if "chunks" not in data:
        return []
    for chunk in data["chunks"]:
        chunk["text"] = chunk["text"].replace("[UNK]", "ə")
        chunk["start"] = chunk["timestamp"][0]
        chunk["end"] = chunk["timestamp"][1]
        del chunk["timestamp"]
    return data["chunks"]

In [29]:
# Example call on one file; the notebook displays the returned chunk list.
get_phonetic_json("/Users/joregan/Playing/hsi_phone_json/hsi_7_0719_210_002_main.json")

[{'text': 'ʃəɪwʃʊɾeɪgoʊtʊtiːpoʊzəbuːɾɪfaɪæmʌwʌʧɪŋjuː',
  'start': 0.02,
  'end': 12.68},
 {'text': 'jəəwəwzaɪzəjuːwɚsθʌmɪŋðɛɝ', 'start': 12.92, 'end': 17.92},
 {'text': 'jɛskɚsɑwəaɪɑ', 'start': 18.06, 'end': 26.02},
 {'text': 'jʌɑm', 'start': 26.8, 'end': 27.26},
 {'text': 'miːɾ̃aɪθəŋkʌjuːkɪŋgiːv', 'start': 27.48, 'end': 28.78},
 {'text': 'jɚsʌm', 'start': 28.82, 'end': 29.2},
 {'text': 'juːkɪŋgəvə', 'start': 29.58, 'end': 30.06},
 {'text': 'səm', 'start': 30.1, 'end': 30.2},
 {'text': 'pɚsənl̩tʌʧəs', 'start': 30.28, 'end': 31.34},
 {'text': 'ɛmənəjuːtoʊl', 'start': 31.94, 'end': 32.66},
 {'text': 'miːbɪfɑɹ', 'start': 32.7, 'end': 33.18},
 {'text': 'də', 'start': 33.22, 'end': 33.36},
 {'text': 'juː', 'start': 33.4, 'end': 33.54},
 {'text': 'ʌwɚən', 'start': 34.02, 'end': 34.74},
 {'text': 'tuːðiːzɪk', 'start': 34.8, 'end': 35.26},
 {'text': 'lætiːk', 'start': 35.3, 'end': 35.7},
 {'text': 's', 'start': 35.76, 'end': 35.78},
 {'text': 'taɪl', 'start': 35.86, 'end': 36.14},
 {'text': 's

In [9]:
def run_espeak(text):
    """Return eSpeak's IPA transcription of ``text`` (voice en-us).

    The text is sent to ``espeak -v en-us --ipa -q`` on standard input, so
    quotes, ``$`` or backticks in a transcript are passed through literally
    instead of being interpreted by a shell.

    Args:
        text: the sentence to transcribe.

    Returns:
        The output lines joined with single spaces, with surrounding
        whitespace removed.

    Raises:
        FileNotFoundError: if the ``espeak`` executable is not on PATH.
    """
    import subprocess  # local import keeps this cell self-contained

    result = subprocess.run(
        ["espeak", "-v", "en-us", "--ipa", "-q"],
        input=text + "\n",  # trailing newline, as `echo` supplied
        stdout=subprocess.PIPE,
        # Keep diagnostics in the returned text rather than dropping them.
        stderr=subprocess.STDOUT,
        encoding="utf-8",
    )
    return " ".join(result.stdout.splitlines()).strip()

In [10]:
# Whole-word replacements used by cmudict_fixes: a space-separated token that
# equals a key is swapped for the corresponding value.
FIXES = {
    "kˈɑːlɚ": "kˈʌlɚ",
    "dʒɪs": "dʒˈʌs",
    "fɹɚ": "fɚ",
    "fɹə": "fɚ",
}

In [11]:
def cmudict_fixes(text):
    """Apply the FIXES table to a space-separated transcription.

    Args:
        text: transcription whose words are separated by single spaces.

    Returns:
        The same string with every token that is a key of FIXES replaced by
        its value; all other tokens are left unchanged.
    """
    return " ".join(FIXES.get(token, token) for token in text.split(" "))

In [12]:
def assimilations(text):
    """Drop a word-final ð or d before a following word-initial ð or d.

    Three literal substitutions are applied in order: "ð ð" -> " ð",
    "d ð" -> " ð" and "d d" -> " d".

    Args:
        text: space-separated transcription.

    Returns:
        The transcription after the substitutions.
    """
    rules = (
        ("ð ð", " ð"),
        ("d ð", " ð"),
        ("d d", " d"),
    )
    for pattern, replacement in rules:
        text = text.replace(pattern, replacement)
    return text

In [16]:
# For every utterance print three lines: the text, the aligned phones after
# the word fixes and assimilation rules, and eSpeak's own transcription,
# followed by a blank separator line.
for utt, aligned in utts.items():
    print(utt)
    print(assimilations(cmudict_fixes(aligned)))
    print(run_espeak(utt))
    print()

Should I go to t-pose?
ʃˈʊd ˈaɪ ɡˈoʊ tə tˈiːpˈoʊz
ʃˌʊd aɪ ɡˌoʊ tə tˈiːpˈoʊz

I am ah watching you, yeah.
ˈaɪ ˈæm ˈʌ wˈɑːtʃɪŋ jˈuː jˈɪ
aɪɐm ˈɑː wˈɑːtʃɪŋ juː  jˈɛh

You were standing there, yes.
jˈuː wɚ stˈɑːndɪŋ ðˈɛɹ jˈɛs
juː wɜː stˈændɪŋ ðˈɛɹ  jˈɛs

Yeah, I mean, I think uh you can give you some, you can give it some personal touches.
jˈæ ˈaɪ mn ˈaɪ θˈɪŋk hˈɛ jˈuː kən ɡˈɪv jˈuː sˈʌm jˈuː kən ɡˈɪv ˈɪt sˈʌm pˈɜːsɪnɪl tˈʌtʃɪz
jˈɛh  aɪ mˈiːn  aɪ θˈɪŋk ˈʌ juː kæn ɡˈɪv juː sˌʌm  juː kæn ɡˈɪv ɪt sˌʌm pˈɜːsənəl tˈʌtʃᵻz

I mean, eh you told me before that you uh were into this eclectic style, so I think the, the carpet gives uh really that vibe, but maybe you need some other items that can give this look.
ˈaɪ mən ˈɛ jˈuː tˈoʊld mˈiː bɪfˈɔːɹ ðət jˈuː hˈɑː wɚ ɪntˈuː ðˈɪs ɪklˈɛktɪk stˈaɪl sˈoʊ ˈaɪ θˈɪŋk ðə ðə kˈɑːɹpət ɡˈɪbz hˈɛ ɹˈiːli ðˈæt bˈaɪv bˈʌt mˈeɪvi jˈuː nˈiːd sˈʌm ˈʌðɚ ˈaɪtəmz ðət kən ɡˈɪv ðɪs lˈʊk
aɪ mˈiːn  ˈeɪ juː tˈoʊld mˌiː bɪfˌoːɹ ðæt juː ˈʌ wɜːɹ ˌɪntʊ ðɪs ɪklˈɛktɪk stˈaɪl  sˌoʊ aɪ θ

In [14]:
# Recording to cut up and the directory holding its WAV file.
# NOTE(review): this stem (209_003) differs from the TextGrid loaded into `ps`
# earlier (210_002) -- confirm which recording is intended before running.
filestem = "hsi_7_0719_209_003_main"
audio_dir = "/Users/joregan/Playing/hsi/audio/"

# Write one ffmpeg command per utterance in `ps` to /tmp/run_ffmpeg.sh.  Each
# command cuts the utterance's span out of the recording as 16 kHz mono 16-bit
# PCM and names the output after the stem plus the start and end times.
# Nothing here creates /tmp/phoninput or runs the generated script.
with open("/tmp/run_ffmpeg.sh", "w") as outf:
    for item in ps:
        start = item["start"]
        # ffmpeg's -t option takes a duration rather than an end time.
        dur = item["end"] - item["start"]
        outf.write(f"ffmpeg -i {audio_dir}{filestem}.wav -acodec pcm_s16le -ac 1 -ar 16000 -ss {start} -t {dur} /tmp/phoninput/{filestem}_{start}_{item['end']}.wav\n")