# Convert Folkets Swedish pronunciations to IPA

> Mapping to soundfiles

- toc: false
- badges: true
- branch: master
- categories: [folkets, swedish, pronunciation, icu]

Based on [the NST lexicon conversion notebook]({% post_url 2022-01-12-convert-nst-lexicon %}) and [the earlier Folkets notebook]({% post_url 2024-10-12-folkets %}).

In [1]:
!pip install pyicu

Collecting pyicu
  Downloading PyICU-2.14.tar.gz (263 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/263.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m143.4/263.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.9/263.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pyicu
  Building wheel for pyicu (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyicu: filename=PyICU-2.14-cp311-cp311-linux_x86_64.whl size=1825014 sha256=74fea2289ae3636983ec52308d636b2a1333b2ce7bc91667a14f4c6bfa891534
  Stored in directory: /root/.cache/pip/wheels/61/f8/dc/c70316d1bb1cae6cf19f17c673d2d198fadc78a6ee2b05091b
Successfu

> Get data

In [2]:
# URL of the public Folkets lexicon XML file (sv→en, per the file name) that
# the following cells download and parse.
DICT="https://folkets-lexikon.csc.kth.se/folkets/folkets_sv_en_public.xml"

In [3]:
import requests

# Download the lexicon. requests applies no timeout by default, so without one
# this cell would hang indefinitely if the server stopped responding.
req = requests.get(DICT, timeout=60)

In [4]:
# Stop here, with a readable message, if the download did not succeed.
assert req.status_code == 200, f"Unexpected HTTP status {req.status_code} for {req.url}"

In [5]:
import xml.etree.ElementTree as ET

# Parse the raw bytes rather than req.text: the XML parser then decodes the
# document from its own encoding declaration (UTF-8 when none is given)
# instead of relying on the charset requests guessed from the HTTP headers,
# which can garble non-ASCII letters such as å, ä and ö.
tree = ET.fromstring(req.content)

In [6]:
# Flatten every <word> element into a plain dict: the headword, whichever of
# the optional attributes are present, and the data of the first <phonetic>
# child (if there is one).
words = []
for word_elem in tree.findall("word"):
    attrs = word_elem.attrib
    word = {"word": attrs["value"]}
    word.update({key: attrs[key] for key in ("comment", "lang", "class") if key in attrs})
    phon = word_elem.find("phonetic")
    if phon is not None:
        sound_file = phon.attrib.get("soundFile")
        if sound_file is not None:
            word["soundfile"] = sound_file
        # Indexed directly: a <phonetic> without "value" raises KeyError.
        word["transcription"] = phon.attrib["value"]
    words.append(word)

In [7]:
# Replacement table for special characters.
# Each accented letter maps to its Unicode code point written as a
# zero-prefixed octal string (e.g. "à" is U+00E0 == 0o340); the HTML entity
# "&#39;" maps to a plain apostrophe.
# NOTE(review): presumably used to derive sound-file names from headwords; it
# is not referenced in the visible cells — confirm against the code using it.
CHAR_REPLACE = {
    "à": "0340",
    "é": "0351",
    "Ö": "0326",
    "ö": "0366",
    "Ä": "0304",
    "ä": "0344",
    "Å": "0305",
    "å": "0345",
    "ê": "0352",
    "&#39;": "'",
}

> Set up transliterator

In [36]:
# ICU transliteration rules that turn the Folkets transcription into IPA.
#
# This is a raw string: the backslashes escape characters for the ICU rule
# parser, not for Python. Written as an ordinary string, sequences such as
# "\+" or "\:" are invalid Python escapes (a SyntaxWarning on Python 3.12+).
# The runtime value is unchanged, which is why:
#   * the tie character ⁀ (U+2040) is written literally, since Python does not
#     expand \u escapes inside raw strings;
#   * the "s \'" rule carries a single backslash (previously spelled \\\').
#
# More specific patterns (e.g. "r \+ t \:") are listed before their shorter
# prefixes ("r \+ t").
TRANSLIT = r"""
r \+ n → ɳ ;
r \+ s → ʂ ;
r \+ l → ɭ ;
r \+ t \: → ʈ ;
r \+ t → ʈ ;
r \+ d → ɖ ;

A → ˈ a ;
U → ˈ u ;
Ä \: → ˈ ɛ ;
Ä → ˈ ɛ ;
ä → ɛ ;
Å → ˈ ɔ ;
å → ɔ ;
I → ɪ ;
E \* U → e ⁀ ʊ ;


Y → ʏ ;
2 → ø ;
9 → ø ;
u 0 → ɵ ;
\@ → ŋ ;
'"' → ˈ ;
\% → ˌ ;
\: → ː ;
g → ɡ ;
s \' → ɕ ;
\$ → ɧ ;
\* → ⁀ ;
"""

In [9]:
import icu
def transliterator_from_rules(name, rules):
    """Compile ``rules`` into an ICU transliterator registered as ``name``.

    The rule-based transliterator is created, registered with ICU, and the
    instance obtained by looking ``name`` up again is returned.
    """
    compiled = icu.Transliterator.createFromRules(name, rules)
    icu.Transliterator.registerInstance(compiled)
    return icu.Transliterator.createInstance(name)

In [37]:
# Build the transliterator (from the TRANSLIT rules) that the checks and the
# conversion code further down call.
swelex_trans = transliterator_from_rules("swelex_trans", TRANSLIT)

In [38]:
# Sanity check: "r+n" becomes ɳ and "k:" becomes kː, while "Ä:" loses its colon.
assert swelex_trans.transliterate('²vÄ:r+nplik:t') == "²vˈɛɳplikːt"

# "r+t:" collapses to a single ʈ with no length mark.
assert swelex_trans.transliterate('alakAr+t:') == "alakˈaʈ"
# Last expression of the cell: shown as its output for visual inspection.
swelex_trans.transliterate('²pÅ:vär:kan')


'²pˈɔːvɛrːkan'

In [None]:
def collapse_available_fields(data):
    """Merge the numbered ``available_field1`` … ``available_field9`` entries.

    Every numbered key is removed from ``data``; the values that are not the
    empty string are collected, in numeric order, into a list stored under
    ``"available_fields"``. ``data`` is modified in place and also returned.
    A missing numbered key raises ``KeyError``.
    """
    numbered_keys = [f"available_field{n}" for n in range(1, 10)]
    values = [data.pop(key) for key in numbered_keys]
    data["available_fields"] = [value for value in values if value != ""]
    return data

In [None]:
def collapse_transliterations(data):
    """Fold the four numbered transliteration slots of ``data`` into one list.

    For each slot 1–4 whose ``transliteration{n}`` value is not the empty
    string, a dict is built holding the original transliteration, its IPA
    rendering (via ``swelex_trans``), and the slot's certainty, status and
    language code. All numbered keys are then removed and the collected dicts
    are stored under ``"transliterations"``. ``data`` is modified in place and
    also returned.
    """
    collapsed = []
    for slot in range(1, 5):
        translit_key = f"transliteration{slot}"
        certainty_key = f"certainty_trans_{slot}"
        status_key = f"status_trans_{slot}"
        language_key = f"language_code_trans_{slot}"

        source = data[translit_key]
        if source != "":
            collapsed.append({
                "transliteration": source,
                "ipa": swelex_trans.transliterate(source),
                "certainty": data[certainty_key],
                "status": data[status_key],
                "language_code": data[language_key],
            })

        # The numbered keys are dropped whether or not the slot was filled.
        for stale_key in (translit_key, certainty_key, status_key, language_key):
            del data[stale_key]
    data["transliterations"] = collapsed
    return data

In [None]:
import csv
import io
import json

# Convert the semicolon-separated lexicon text into JSON Lines: one JSON
# object per input row, written to svlex.json.
# NOTE(review): `prondata` (the text handed to StringIO) and `field_names`
# (the column names handed to DictReader) are not defined in any visible cell;
# they must exist before this cell runs — confirm where they come from.
with open("svlex.json", "w") as outf:
    swelexf = io.StringIO(prondata)
    # QUOTE_NONE: quote characters in the data are ordinary text, not CSV quoting.
    swelex = csv.DictReader(swelexf, delimiter=';', fieldnames=field_names, quoting=csv.QUOTE_NONE)
    for row in swelex:
        # "decomp" is split on "+" and empty segments are dropped.
        row["decomp"] = [f for f in row["decomp"].split("+") if f != ""]
        row = collapse_available_fields(row)
        row = collapse_transliterations(row)
        jsonstr = json.dumps(row)
        outf.write(jsonstr + "\n")