In [2]:
import json

# Load the raw lexicon.  The loaded value is iterated below as a sequence of
# dicts that are read through the keys "text", "phoneme", "labels" and "smp".
# The encoding is given explicitly: the entries contain non-ASCII characters
# (å, ä, ö), and without it open() falls back to the platform's locale
# encoding, which is not UTF-8 everywhere.
with open("/tmp/waxholm_raw_lexicon.json", encoding="utf-8") as lexjson:
    data = json.load(lexjson)

In [4]:
def simplify_stops(text):
    """Collapse the stop-consonant notation in ``text`` to plain capitals.

    Each two-character sequence ``Kk``, ``Gg``, ``Dd``, ``Tt``, ``Bb`` and
    ``Pp`` is first reduced to its capital letter alone; afterwards every
    remaining lower-case ``k``, ``g``, ``d``, ``t``, ``b`` or ``p`` is
    replaced by its capital.

    Returns the rewritten string.
    """
    stops = "KGDTBP"
    # Pass 1: a capital directly followed by its lower-case twin -> capital.
    for stop in stops:
        text = text.replace(stop + stop.lower(), stop)
    # Pass 2: any lower-case stop letter that is still present -> capital.
    for stop in stops:
        text = text.replace(stop.lower(), stop)
    return text


In [7]:
# Sort every lexicon entry into one of three buckets, depending on whether
# its phoneme string is identical to its stop-simplified label string.
simple_matched_words = {}      # text -> set of phoneme strings (one-part entries)
simple_matched_sentences = []  # matching entries whose text or phoneme has several parts
rest = []                      # entries whose phoneme and simplified labels differ
for item in data:
    phoneme = item["phoneme"]
    if phoneme != simplify_stops(item["labels"]):
        rest.append(item)
        continue
    # "One part" means no space in either the text or the phoneme string.
    is_single_word = len(item["text"].split(" ")) == 1 and len(phoneme.split(" ")) == 1
    if is_single_word:
        simple_matched_words.setdefault(item["text"], set()).add(phoneme)
    else:
        simple_matched_sentences.append(item)

In [10]:
simple_matched_words

{'slut': {"SL'U:T"},
 'tack': {"T'AK"},
 'idag': {"ID'A:G"},
 'ikväll': {"IKV'ÄL"},
 'härifrån': {"H'Ä3RIFRÅN"},
 'hej': {"H'EJ"},
 'hotell': {"HOT'EL"},
 'tisdag': {"T'I:SDA"},
 'fredag': {"FR'E:DA"},
 'onsdag': {"'ONSDA"},
 'ingenting': {"INGENT'ING"},
 'söndag': {"S'ÖNDA"},
 'lördag': {"L'Ö32DA"},
 'måndag': {"M'ÅNDA"},
 '5': {"F'EM"}}

In [14]:
# Harvest word/pronunciation pairs from the matching multi-part entries.
# An entry is only usable when it has exactly one phoneme token per text
# token; anything else is parked in ``not_as_simple``.
not_as_simple = []
for item in simple_matched_sentences:
    text_parts = item["text"].split(" ")
    phone_parts = item["phoneme"].split(" ")
    if len(text_parts) != len(phone_parts):
        not_as_simple.append(item)
        continue
    # Same length on both sides, so the tokens are paired by position.
    for word, phone in zip(text_parts, phone_parts):
        simple_matched_words.setdefault(word, set()).add(phone)

In [18]:
simple_matched_words

{'slut': {"SL'U:T"},
 'tack': {"T'AK"},
 'idag': {"ID'A:G"},
 'ikväll': {"IKV'ÄL"},
 'härifrån': {"H'Ä3RIFRÅN"},
 'hej': {"H'EJ"},
 'hotell': {"HOT'EL"},
 'tisdag': {"T'I:SDA"},
 'fredag': {"FR'E:DA"},
 'onsdag': {"'ONSDA"},
 'ingenting': {"INGENT'ING"},
 'söndag': {"S'ÖNDA"},
 'lördag': {"L'Ö32DA"},
 'måndag': {"M'ÅNDA"},
 '5': {"F'EM"},
 'kväll': {"KV'ÄL"},
 'stan': {"ST'A:N"}}

In [20]:
not_as_simple

[]

In [25]:
def simplify_phoneme(text):
    """Return ``text`` with every ``+`` deleted and every ``hy`` turned into ``#``.

    The ``+`` characters are removed first, so an ``hy`` that only comes
    into existence once a ``+`` is gone is rewritten as well.
    """
    return text.replace("+", "").replace("hy", "#")

In [28]:
def update_words(word, phone):
    """Record ``phone`` as one pronunciation of ``word`` in ``simple_matched_words``."""
    simple_matched_words.setdefault(word, set()).add(phone)


# Walk through the entries whose phoneme and label strings differed as a
# whole and compare them token by token.  Tokens that agree once both sides
# are simplified are added to ``simple_matched_words``; tokens that still
# differ are collected as (word, simplified phoneme, simplified label).
other_words = []
for item in rest:
    text_parts = item["text"].split(" ")
    phone_parts = item["phoneme"].split(" ")
    label_parts = item["labels"].split(" ")
    # Entries whose three token counts differ cannot be aligned by position
    # and are left out of this pass.
    if not (len(text_parts) == len(phone_parts) == len(label_parts)):
        continue
    for word, phone, label in zip(text_parts, phone_parts, label_parts):
        simple_label = simplify_stops(label)
        simple_phoneme = simplify_phoneme(phone)
        if simple_label == simple_phoneme:
            update_words(word, simple_phoneme)
        else:
            other_words.append((word, simple_phoneme, simple_label))
    

In [35]:
other_words

[('Lediga', 'L"E:D\'IGA', 'L"E:DIGA'),
 ('och', "'Å:", "'ÅK"),
 ('utvilade', '"U:T#V\'I:LADE', '"U:T#V`I:LADE0'),
 ('tittade', 'T"IT\'ADE', 'T"ITADE0'),
 ('de', "D'OM", "D'ÅM"),
 ('föreställningen', 'F"Ö3RE#ST\'ÄLNINGEN', 'F"Ö3RE#ST`ÄLNINGE0N'),
 ('timme', 'T"IM\'E', 'T"IME0'),
 ('stockholm', 'ST"ÅK#\'ÅLM', 'ST"ÅK#`ÅLM'),
 ('Lila', 'L"I:L\'A', "L'I:LA"),
 ('stolar', 'ST"O:L\'AR', 'ST"O:LAR'),
 ('salen', "S'A:LEN", "S'A:LE0N"),
 ('jag', "J'A:G", "J'A:"),
 ('åka', '"Å:K\'A', '"Å:KA'),
 ('jag', "J'A:G", "J'A:"),
 ('letar', 'L"E:T\'AR', 'L"E:TAR'),
 ('efter', "'EFTÄ4R", "'EFTE0R"),
 ('sandhamn', 'S"AND#H\'AMN', 'S"ANDv#H`AMN'),
 ('Lediga', 'L"E:D\'IGA', 'L"E:DIGA'),
 ('och', "'Å:", "'ÅK"),
 ('utvilade', '"U:T#V\'I:LADE', '"U:T#V`I:LADE0'),
 ('tittade', 'T"IT\'ADE', 'T"ITADE0'),
 ('de', "D'OM", "D'ÅM"),
 ('föreställningen', 'F"Ö3RE#ST\'ÄLNINGEN', 'F"Ö3RE#ST`ÄLNINGE0N'),
 ('timme', 'T"IM\'E', 'T"IME0'),
 ('strömkajen', 'STR"ÖM#K\'AJEN', 'STR"ÖM#K`AJE0N'),
 ('Sprakande', 'SPR"A:K\'ANDE', 'SPR

In [103]:
def segment_label(label, skip_pause=True):
    """Split a transcription string into a list of symbols.

    The string is scanned left to right:

    * the two-character units ``NG``, ``E0``, ``SJ``, ``TJ``, ``kl``, ``sm``,
      ``pa``, ``ha``, ``öh`` and ``Pa`` are emitted as one symbol each;
    * the pause marker, ``p:`` or ``P:``, is dropped when ``skip_pause`` is
      true and emitted as written otherwise;
    * ``#`` and ``~`` are always dropped;
    * any other character forms a symbol of its own, except that a leading
      ``'``, `````, ``"``, ``,`` or ``2`` stays attached to the character
      after it, and a following ``:``, ``3`` or ``4`` stays attached to the
      character before it.

    Args:
        label: the string to split.
        skip_pause: whether pause markers are left out of the result.

    Returns:
        A list of symbol strings in their order of appearance.
    """
    digraphs = ("NG", "E0", "SJ", "TJ", "kl", "sm", "pa", "ha", "öh", "Pa")
    # simplify_stops() upper-cases every "p", so strings that went through it
    # carry the pause marker as "P:"; both spellings have to be recognised
    # for skip_pause to take effect on them.
    pauses = ("p:", "P:")
    boundaries = ("#", "~")
    prefixes = ("'", "`", "\"", ",", "2")
    suffixes = (":", "3", "4")
    # NOTE(review): "kl" would likewise arrive as "Kl" after simplify_stops();
    # whether that form needs handling could not be determined from here.

    phones = []
    i = 0
    while i < len(label):
        pair = label[i:i + 2]
        if pair in digraphs:
            phones.append(pair)
            i += 2
        elif pair in pauses:
            if not skip_pause:
                phones.append(pair)
            i += 2
        elif label[i] in boundaries:
            i += 1
        else:
            # ``end`` is the index of the last character of this symbol.
            end = i
            if label[i] in prefixes:
                end += 1
            # Tuple membership, so the empty slice at the end of the string
            # never counts as a suffix.
            if label[end + 1:end + 2] in suffixes:
                end += 1
            phones.append(label[i:end + 1])
            i = end + 1
    return phones

In [105]:
def lclem(lower):
    """Lower-case a word unless it is written as an ``X...X`` marker.

    Strings that both begin and end with a capital ``X`` (for example
    ``XinandX``) are returned unchanged; every other string is returned
    lower-cased.  An empty string is returned as-is rather than raising
    ``IndexError``; empty tokens can arise from ``str.split(" ")`` when the
    text contains consecutive spaces.
    """
    if lower.startswith("X") and lower.endswith("X"):
        return lower
    return lower.lower()


In [106]:
# Write one "word<TAB>space-separated symbols" line per recorded
# pronunciation.  The encoding is explicit because the words contain
# non-ASCII characters and the default depends on the platform locale.  Each
# pronunciation set is sorted because set iteration order for strings can
# change from one interpreter run to the next.
with open("/tmp/simple-words.tsv", "w", encoding="utf-8") as simpler:
    for sword, pronunciations in simple_matched_words.items():
        lower = lclem(sword)
        for item in sorted(pronunciations):
            simpler.write(f"{lower}\t{' '.join(segment_label(item))}\n")

In [107]:
# For every disagreeing token write two lines with the same word: first the
# segmented phoneme form, then the segmented label form.  The encoding is
# explicit because the output contains non-ASCII characters and the default
# depends on the platform locale.
with open("/tmp/other-words.tsv", "w", encoding="utf-8") as simpler:
    for word, simple_phoneme, simple_label in other_words:
        lower = lclem(word)
        simpler.write(f"{lower}\t{' '.join(segment_label(simple_phoneme))}\n")
        simpler.write(f"{lower}\t{' '.join(segment_label(simple_label))}\n")

In [113]:
# De-duplicate the (word, phonemes, labels) triples and dump them as a list
# of dicts.  The set is sorted before use: set iteration order for strings
# can differ between interpreter runs, which would otherwise reshuffle the
# file on every run.
dictified = [
    {"word": word, "phonemes": phonemes, "labels": labels}
    for word, phonemes, labels in sorted(set(other_words))
]
with open("/tmp/other-words.json", "w", encoding="utf-8") as simpler:
    json.dump(dictified, simpler)
    

In [112]:
dictified

[{'word': 'var', 'phonemes': "V'A:R", 'labels': "'A:R"},
 {'word': 'Öppna', 'phonemes': '"ÖPN\'A', 'labels': '"ÖPNA'},
 {'word': 'eftermiddagen',
  'phonemes': '"EFTÄ4R#M\'IDAN',
  'labels': '"EFTE0#M`IDAN'},
 {'word': 'jag', 'phonemes': "J'A:", 'labels': "J'A:G"},
 {'word': 'Sprakande', 'phonemes': 'SPR"A:K\'ANDE', 'labels': 'SPR"A:KANE0'},
 {'word': 'stockholm', 'phonemes': 'ST"ÅK#\'ÅLM', 'labels': 'ST"ÅKP:#H`ÅLM'},
 {'word': 'innan', 'phonemes': '"IN\'AN', 'labels': '"INA'},
 {'word': 'efter', 'phonemes': "'EFTÄ4R", 'labels': "'EFTE0"},
 {'word': 'en', 'phonemes': "'EN", 'labels': "'E"},
 {'word': 'vaxholmsbåt',
  'phonemes': 'V"AKS#HÅLMS#B\'Å:T',
  'labels': 'V"AKS#HÅLMS#B`Å:T'},
 {'word': 'betyder', 'phonemes': "BET'Y:DÄ4R", 'labels': "BET'Y:DE0R"},
 {'word': 'ifrån', 'phonemes': "IFR'Å:N", 'labels': "IFR'Å:"},
 {'word': 'dags', 'phonemes': "D'AGS", 'labels': "2D'AGS"},
 {'word': 'åker', 'phonemes': "'Å:KÄ4R", 'labels': "'Å:KE0"},
 {'word': 'stockholm', 'phonemes': 'ST"ÅK#\'ÅLM', 

In [116]:
# Build a token-level view of every entry in ``rest``: the "smp" value is
# copied over, and the text, the simplified phoneme string and the simplified
# label string are each split on single spaces.
split = []
for entry in rest:
    split.append({
        "smp": entry["smp"],
        "words": entry["text"].split(" "),
        "phonemes": simplify_phoneme(entry["phoneme"]).split(" "),
        "labels": simplify_stops(entry["labels"]).split(" "),
    })

In [119]:
def filter_labels_list(labels):
    """Return a new list holding the entries of ``labels`` other than ``"Pa"``, ``"P:"`` and ``"sm"``.

    Only exact matches are removed; the order of the remaining entries is
    kept.
    """
    dropped = ("Pa", "P:", "sm")
    return [label for label in labels if label not in dropped]

In [127]:
def check_filler(words):
    """Return the indices of the ``X...X`` entries in ``words``.

    An entry qualifies when it is longer than one character and both its
    first and its last character are a capital ``X``.  The indices are
    returned in ascending order; the list is empty when nothing qualifies.
    """
    return [
        index
        for index, word in enumerate(words)
        if len(word) > 1 and word[0] == "X" and word[-1] == "X"
    ]

In [122]:
# Example entry with the same four keys as the dicts collected in ``split``.
# It has six words but only five phonemes; the first word, 'XinandX', is
# written in the X...X form that check_filler looks for.
sample = {'smp': 'fp2007/fp2007.pr.09.smp', 'words': ['XinandX', 'Sprakande', 'fyrverkeripjäser', 'exploderade', 'över', 'oss'], 'phonemes': ['SPR"A:K\'ANDE', 'F"Y:R#VÄ4RKÄ4RI#PJ\'Ä:SÄ4R', "EKSPLÅD'E:RADE", "'Ö:VÄ4R", "'ÅS"], 'labels': ['Pa', 'P:', 'SPR"A:KANDE0', 'FY:VÄ4RKÄ4R"I:#PJ`Ä:SE0R', "EKSPLÅD'E:RADE0", "'Ö:VR", "'ÅS"]}

In [128]:
# Try check_filler on the example entry's word list; the cell displays the
# indices it reports.
sample_words = sample["words"]
check_filler(sample_words)

[0]

In [129]:
def filter_words(wordlist):
    """Return ``wordlist`` without the entries that ``check_filler`` reports.

    When ``check_filler`` finds nothing, the very same list object is handed
    back; otherwise a new list is built from the remaining entries, in their
    original order.
    """
    fillers = check_filler(wordlist)
    if not fillers:
        return wordlist
    return [word for position, word in enumerate(wordlist) if position not in fillers]

['Sprakande', 'fyrverkeripjäser', 'exploderade', 'över', 'oss']

In [137]:
# Collect (word, phoneme) pairs from the entries whose word and phoneme
# counts only line up once the X...X entries have been taken out of the word
# list.  Entries that still do not line up are printed for inspection.
# NOTE(review): entries whose counts already agree add nothing to
# ``allpaired`` here -- confirm that this is intended.
allpaired = []
for item in split:
    if len(item["words"]) == len(item["phonemes"]):
        continue
    filtered_words = filter_words(item["words"])
    if len(filtered_words) != len(item["phonemes"]):
        print("Error", item)
        continue
    allpaired.extend(zip(filtered_words, item["phonemes"]))

Error {'smp': 'fp2024/fp2024.5.00.smp', 'words': ['går', 'det', 'någon', 'båt', 'till', 'vegabryggan', 'från', 'vaxholm', 'runt', 'klockan', '1', 'idag'], 'phonemes': ["G'Å:R", "D'E:T", 'N"Å:G\'ÅN', "B'Å:T", "T'IL", 'V"E:GA#BR\'YGAN'], 'labels': ["G'Å:R", "D'E:", 'N"Å:GÅN', "B'Å:T", "T'IL", 'V"E:GA#BR`YGAN', 'P:', "FR'Å:N", 'V"AKS#H`ÅLMv', 'PaP:', "R'UNT", 'KL"ÅKAN', 'P:', "'ET", "ID'A:G"]}
Error {'smp': 'fp2023/fp2023.7.06.smp', 'words': ['tack', 'för', 'mig', 'för', 'den', 'här', 'gången', 'vi', 'ses', 'igen', 'hej', 'då'], 'phonemes': ["T'AK", "IJ'EN", "F'Ö3R", "M'EJ", "F'Ö3R", "D'EN", "H'Ä3R", "G'ÅNGEN", "V'I:", "S'E:S", "IJ'EN", "H'EJ", "D'Å:"], 'labels': ["T'AK", "F'Ö3", "M'EJ", "F'Ö3R", "'EN", "H'Ä3", "G'ÅNGE0N", 'PaP:', "V'I:", "S'E:S", "IJ'EN", 'PaP:', "H'EJ", "D'Å:"]}
Error {'smp': 'fp2023/fp2023.7.04.smp', 'words': ['vill', 'du', 'ge', 'en', 'lista', 'på', 'vandrarhemmen', 'i', 'skärgården'], 'phonemes': ["V'IL", "D'U:", "J'E:", "'EN", 'L"IST\'A', 'XavbrordX', "P'Å:", 'V"AND

In [140]:
# Write every distinct (word, phoneme) pair as "word<TAB>phoneme".  The
# encoding is explicit because the pairs contain non-ASCII characters and the
# default depends on the platform locale.  The pairs are sorted because set
# iteration order for strings can differ between interpreter runs.
with open("/tmp/eyeball.tsv", "w", encoding="utf-8") as pairf:
    for word, phoneme in sorted(set(allpaired)):
        pairf.write(f"{word}\t{phoneme}\n")