In [206]:
import json

# Read the raw lexicon records. The encoding is pinned because open() otherwise
# falls back to the locale's preferred encoding, which is not UTF-8 everywhere.
with open("/tmp/waxholm_raw_lexicon.json", encoding="utf-8") as lexjson:
    data = json.load(lexjson)

In [207]:
def simplify_stops(text):
    """Normalise stop symbols in a label string.

    The doubled forms "Kk", "Gg", "Dd", "Tt", "Bb" and "Pp" are collapsed to
    the single upper-case letter, and any remaining lower-case k/g/d/t/b/p is
    upper-cased. The sequences "kl", "pa" and "p:" are then put back in lower
    case, so they come out of this function unchanged.

    Note that the restoring step also lower-cases a "Kl", "Pa" or "P:" that
    was already upper-case in the input.

    Args:
        text: label string to rewrite.

    Returns:
        The rewritten string.
    """
    # The rewrites are applied strictly in this order: the doubled forms must
    # be collapsed before single letters are upper-cased, and the lower-case
    # sequences can only be restored after the upper-casing that clobbers them.
    rewrites = (
        ("Kk", "K"),
        ("Gg", "G"),
        ("Dd", "D"),
        ("Tt", "T"),
        ("Bb", "B"),
        ("Pp", "P"),
        ("k", "K"),
        ("Kl", "kl"),
        ("g", "G"),
        ("d", "D"),
        ("t", "T"),
        ("b", "B"),
        ("p", "P"),
        ("Pa", "pa"),
        ("P:", "p:"),
    )
    for old, new in rewrites:
        text = text.replace(old, new)
    return text


In [208]:
def simplify_phoneme(text):
    """Strip "+" characters and rewrite "hy" as "#" in a phoneme string.

    The "+" removal happens first, so an "h+y" in the input also ends up
    as "#".

    Args:
        text: phoneme string to rewrite.

    Returns:
        The rewritten string.
    """
    return text.replace("+", "").replace("hy", "#")

In [209]:
def segment_label(label, skip_pause=True):
    """Split a label string into a list of phone tokens.

    Scanning left to right, each position is handled by the first rule that
    matches:

    * one of the fixed two-character symbols becomes a token of its own;
    * "p:" is consumed, and only emitted when ``skip_pause`` is false;
    * "#" and "~" are dropped;
    * anything else is a one-character phone, optionally preceded by one
      prefix mark (', `, ", comma or 2) and optionally followed by one
      suffix mark (:, 3 or 4), all kept together as a single token.

    Args:
        label: the label string to segment.
        skip_pause: when true (the default), "p:" is left out of the result.

    Returns:
        The list of tokens in order of appearance.
    """
    two_char_symbols = ("NG", "E0", "SJ", "TJ", "kl", "sm", "pa", "ha", "öh", "Pa")
    dropped = ("#", "~")
    prefix_marks = ("'", "`", "\"", ",", "2")
    suffix_marks = (":", "3", "4")

    tokens = []
    pos = 0
    total = len(label)
    while pos < total:
        pair = label[pos:pos + 2]
        if pair in two_char_symbols:
            tokens.append(pair)
            pos += 2
            continue
        if pair == "p:":
            if not skip_pause:
                tokens.append(pair)
            pos += 2
            continue
        if label[pos] in dropped:
            pos += 1
            continue

        # Generic phone: [prefix mark] + one character + [suffix mark].
        begin = pos
        if label[pos] in prefix_marks:
            pos += 1
        stop = pos + 1
        if label[pos + 1:pos + 2] in suffix_marks:
            stop += 1
        tokens.append(label[begin:stop])
        pos = stop
    return tokens

In [210]:
def lclem(lower):
    """Lower-case a word unless it is wrapped in "X" markers.

    A token that both starts and ends with "X" (including the single
    character "X") is returned untouched; every other token is lower-cased.
    An empty token is returned as an empty string instead of raising, which
    matters because ``str.split(" ")`` yields empty tokens for empty text or
    repeated spaces.

    Args:
        lower: the word to normalise.

    Returns:
        The word unchanged if it is X-delimited, otherwise its lower-cased form.
    """
    # startswith/endswith are safe on "", unlike indexing with [0] / [-1].
    if lower.startswith("X") and lower.endswith("X"):
        return lower
    return lower.lower()


In [211]:
# Inspect the first loaded record to see which fields are available.
data[0]

{'stem': 'fp2024.1.03',
 'smp': 'fp2024/fp2024.1.03.smp',
 'text': 'tack det är bra',
 'phoneme': "T'AK D'E:T+ 'Ä3R+ BR'A:",
 'labels': "Tt'AKk Dd'E: BbR'A:",
 'labels_original': "Tt'AKk Dd'E: BbR'A:"}

In [218]:
# Build a nested index of the form
#     word -> phoneme string -> simplified label -> set of stems
# recording, for each combination, the recordings it was seen in.
entries = {}
for item in data:
    # Records without a "phoneme" field have nothing to align against.
    if "phoneme" not in item:
        continue
    phonemes = simplify_phoneme(item["phoneme"]).split(" ")
    labels = simplify_stops(item["labels"]).split(" ")
    words = [lclem(x) for x in item["text"].split(" ")]

    # Only use records where the three tiers have the same number of tokens;
    # with differing lengths zip() would pair up unrelated tokens.
    if len(phonemes) == len(labels) == len(words):
        for word, phoneme, label in zip(words, phonemes, labels):
            stems = (
                entries.setdefault(word, {})
                .setdefault(phoneme, {})
                .setdefault(label, set())
            )
            stems.add(item["stem"])

In [224]:
# JSON has no set type, so replace every innermost set with a list in place.
# Each set is converted exactly once, and sorted() gives a stable order:
# iteration order of a set of strings can differ from one run to the next.
# Assumes the collected stems are mutually comparable (strings in the sample
# record) - confirm if other types can occur.
for phoneme_map in entries.values():
    for label_map in phoneme_map.values():
        # Rebinding the value of an existing key is safe while iterating.
        for label, stems in label_map.items():
            label_map[label] = sorted(stems)

In [225]:
# Write the nested index out as JSON.
with open("/tmp/simple-aligned-entries.json", "w") as out_file:
    json.dump(entries, fp=out_file)