# Swedish pronunciation comparison

> "Between English Wiktionary and phoneme recognition output"

- toc: false
- branch: master
- comments: false
- categories: [swedish, phonemes, wiktionary]

In [1]:
from pathlib import Path

# Locations of the two TSV pronunciation lists (one "broad", one "narrow")
# under the local wikipron checkout.
# NOTE(review): BASE is an absolute, machine-specific path; adjust it before
# running this notebook on another machine.
BASE = Path("/Users/joregan/Playing")
TSV_DIR = BASE.joinpath("wikipron", "data", "scrape", "tsv")
BROAD = TSV_DIR.joinpath("swe_latn_broad.tsv")
NARROW = TSV_DIR.joinpath("swe_latn_narrow.tsv")

In [5]:
def clean_pron(word):
    """Return ``word`` with every ``²``, ``¹`` and ``‿`` character removed.

    All other characters are left untouched, so a string containing none of
    the three markers is returned unchanged.
    """
    for marker in ("²", "¹", "‿"):
        word = word.replace(marker, "")
    return word

In [16]:
def read_tsv(filename):
    """Load a word/pronunciation list into a dictionary.

    Each non-blank line holds a word followed by its pronunciation. The two
    fields are normally separated by a tab; a line without a tab is split on
    whitespace instead, with the first token taken as the word and the
    remaining tokens as the pronunciation. Spaces inside the pronunciation
    are removed and the result is passed through ``clean_pron``.

    Args:
        filename: Path of the file to read.

    Returns:
        dict mapping each word to the list of its cleaned pronunciations, in
        file order (repeated entries are kept).

    Raises:
        ValueError: If a tab-separated line does not have exactly two fields.
    """
    wordlist = {}
    # Decode explicitly instead of relying on the platform default, since the
    # pronunciations are IPA text. Assumes the files are UTF-8 -- TODO confirm.
    with open(filename, encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            if not line:
                # Blank lines carry no entry (and would break the fallback below).
                continue
            if "\t" in line:
                word, pron = line.split("\t")
            else:
                # Fallback for whitespace-separated rows: index the token list,
                # not the raw line (which would yield single characters).
                parts = line.split()
                word = parts[0]
                pron = " ".join(parts[1:])
            wordlist.setdefault(word, []).append(clean_pron(pron.replace(" ", "")))
    return wordlist

In [17]:
# Read both pronunciation lists (broad first, then narrow) into
# word -> [pronunciations] dictionaries.
broad, narrow = (read_tsv(str(tsv_path)) for tsv_path in (BROAD, NARROW))


In [21]:
def check_exact(word, pron, broad_lexicon=None, narrow_lexicon=None):
    """Look for ``pron`` among the listed pronunciations of ``word``.

    The broad lexicon is consulted first. Because matching is by exact string
    equality, a pronunciation found in both lexicons is the same string in
    each, so such a hit is reported as "broad".

    Args:
        word: Key to look up in the lexicons.
        pron: Pronunciation string that must equal a listed pronunciation.
        broad_lexicon: Mapping of word -> list of pronunciations. Defaults to
            the module-level ``broad`` dictionary.
        narrow_lexicon: Mapping of word -> list of pronunciations. Defaults
            to the module-level ``narrow`` dictionary.

    Returns:
        A ``(pronunciation, label)`` tuple: ``(pron, "broad")`` when the
        broad lexicon lists it, ``(pron, "narrow")`` when only the narrow
        lexicon does, and ``("", "")`` when neither does.
    """
    # Fall back to the notebook-level lexicons so existing two-argument
    # calls keep working unchanged.
    if broad_lexicon is None:
        broad_lexicon = broad
    if narrow_lexicon is None:
        narrow_lexicon = narrow

    if pron in broad_lexicon.get(word, ()):
        return pron, "broad"
    if pron in narrow_lexicon.get(word, ()):
        return pron, "narrow"
    return "", ""

# Report every (count, word, pronunciation) row of the input whose
# pronunciation exactly matches a listed pronunciation for that word.
# Each non-blank input line must hold three whitespace-separated fields:
# count, word, pronunciation.
# The file is decoded explicitly because the pronunciations are IPA text;
# assumes UTF-8 -- TODO confirm.
with open("/tmp/all-sort-uniq", encoding="utf-8") as uniq:
    for line in uniq:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing the three-way unpack below.
            continue
        count, word, pron = fields
        # Lexicon lookups use the lower-cased word.
        word = word.lower()
        pron, label = check_exact(word, pron)
        if pron != "":
            print(f"{count}\t{word}\t{pron}\t{label}")
        

2651	å	oː	broad
23	å	ɔ	broad
3	ä	ɛ	broad
69	ä	ɛː	broad
1	åda	oːda	broad
1	adhd	ɑːdeːhoːdeː	broad
54	advokat	advʊkɑːt	broad
360	affär	afæːr	narrow
1	affär	afɛːr	broad
14	afton	aftɔn	broad
887	äga	ɛːɡa	broad
2	agna	aŋna	broad
408	agneta	aŋneːta	broad
1	aktuell	aktɵɛl	broad
1016	akut	akʉːt	broad
170	ål	oːl	broad
95	al	ɑːl	broad
110	åland	oːland	broad
6	albert	albæʈ	broad
17	albin	albɪn	broad
3	alert	alæʈ	broad
4	alfabet	alfabeːt	broad
236	alkohol	alkʊhoːl	broad
78632	alla	ala	broad
49	allihop	alɪhuːp	broad
143	allihopa	alɪhuːpa	broad
19	alltmer	altmeːr	broad
1961	alltså	altsɔ	broad
2	allvarlig	alvɑːrlɪɡ	broad
6	älska	ɛlska	broad
7	älta	ɛlta	broad
1	aluminium	alɵmiːnɪɵm	broad
1	älv	ɛlv	broad
1	älva	ɛlva	broad
461	amanda	amanda	broad
40	amerika	ameːrɪka	broad
20	amma	ama	broad
118	ana	ɑːna	broad
5	and	and	broad
53	ända	ɛnda	broad
537	ändå	ɛndoː	broad
6	ändå	ɛndɔ	broad
4	andlig	andlɪɡ	broad
68478	andra	andra	broad
7	andré	andreː	broad
282	ange	anjeː	broad
6	anka	aŋka	broad
2276	anna	ana	broa