# Convert NST pronunciation lexicons to JSON

> Converting the pronunciation to IPA along the way

- toc: false
- badges: true
- branch: master
- categories: [nst, swedish, danish, norwegian, pronunciation, icu]

Based on [this]({% post_url 2022-01-12-convert-nst-lexicon %})

> Set up field reading

In [59]:
import csv

In [60]:
!pip install pyicu



In [61]:
field_names = [
    "orthography",
    "extended_pos",
    "morphology",
    "decomp",
    "decpos",
    "source",
    "language_code",
    "garbage",
    "domain",
    "abbr_acr",
    "expansion",
    "transliteration1",
    "certainty_trans_1",
    "status_trans_1",
    "language_code_trans_1",
    "transliteration2",
    "certainty_trans_2",
    "status_trans_2",
    "language_code_trans_2",
    "transliteration3",
    "certainty_trans_3",
    "status_trans_3",
    "language_code_trans_3",
    "transliteration4",
    "certainty_trans_4",
    "status_trans_4",
    "language_code_trans_4",
    "auto_gen_variants",
    "set_id",
    "set_name",
    "style_status",
    "inflector_role",
    "lemma",
    "inflection_rule",
    "morph_label",
    "compounder_code",
    "semantic_info",
    "available_field1",
    "available_field2",
    "available_field3",
    "available_field4",
    "available_field5",
    "available_field6",
    "available_field7",
    "available_field8",
    "available_field9",
    "frequency",
    "original_orthography",
    "comment_field",
    "update_info",
    "unique_id"
]

> Get data

1.   [Swedish](https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-22/)
2.   [Danish](https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-26/)
3.   [Norwegian](https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-23/) (Bokmål)



In [63]:
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/sv.leksikon.tar.gz -O /tmp/sv.leksikon.tar.gz
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/da_leksikon.tar.gz -O /tmp/da_leksikon.tar.gz
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/no.leksikon.tar.gz -O /tmp/no.leksikon.tar.gz

--2025-10-24 16:51:27--  https://www.nb.no/sbfil/leksikalske_databaser/leksikon/sv.leksikon.tar.gz
Resolving www.nb.no (www.nb.no)... 158.39.129.53, 2001:700:f01:1071::53
Connecting to www.nb.no (www.nb.no)|158.39.129.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22041470 (21M) [application/octet-stream]
Saving to: ‘/tmp/sv.leksikon.tar.gz’


2025-10-24 16:51:30 (9.38 MB/s) - ‘/tmp/sv.leksikon.tar.gz’ saved [22041470/22041470]

--2025-10-24 16:51:30--  https://www.nb.no/sbfil/leksikalske_databaser/leksikon/da_leksikon.tar.gz
Resolving www.nb.no (www.nb.no)... 158.39.129.53, 2001:700:f01:1071::53
Connecting to www.nb.no (www.nb.no)|158.39.129.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5731447 (5.5M) [application/octet-stream]
Saving to: ‘/tmp/da_leksikon.tar.gz’


2025-10-24 16:51:32 (4.57 MB/s) - ‘/tmp/da_leksikon.tar.gz’ saved [5731447/5731447]

--2025-10-24 16:51:32--  https://www.nb.no/sbfil/leksikalske_databaser/leksi

In [64]:
import tarfile

data = {}

with tarfile.open("/tmp/sv.leksikon.tar.gz") as tar:
    f = tar.extractfile("NST svensk leksikon/swe030224NST.pron/swe030224NST.pron")
    prondata = f.read()
    data["sv"] = prondata.decode('latin1')
with tarfile.open("/tmp/no.leksikon.tar.gz") as tar:
    f = tar.extractfile("NSTs norske leksikon/nor030224NST.pron/nor030224NST.pron")
    prondata = f.read()
    data["no"] = prondata.decode('latin1')
with tarfile.open("/tmp/da_leksikon.tar.gz") as tar:
    f = tar.extractfile("dan030224NST.pron/dan030224NST.pron")
    prondata = f.read()
    data["da"] = prondata.decode('latin1')

> Set up transliterator

In [65]:
TRANSLIT_SV = """
n\` → ɳ ;
s\` → ʂ ;
l\` → ɭ ;
t\` → ʈ ;
d\` → ɖ ;
A → ɑ ;
O → ɔ ;
I → ɪ ;
E \* U → e \u2040 ʊ ;
E → ɛ ;
U → ʊ ;
Y → ʏ ;
2 → ø ;
9 → ø ;
u 0 → ɵ ;
N → ŋ ;
'""' → ² ;
'"' → ˈ ;
\% → ˌ ;
\: → ː ;
\$ → \. ;
g → ɡ ;
s \\\' → ɕ ;
x \\\\ → ɧ ;
\* → \u2040 ;
"""

  n\` → ɳ ;


In [66]:
NST_TRANSLIT = r"""
::XSampa-IPA;

\$ → \. ;
? → ˀ;
\* → \u2040 ;
"""

In [67]:
import icu
def transliterator_from_rules(name, rules):
    fromrules = icu.Transliterator.createFromRules(name, rules)
    icu.Transliterator.registerInstance(fromrules)
    return icu.Transliterator.createInstance(name)

In [68]:
swelex_trans = transliterator_from_rules("swelex_trans", TRANSLIT_SV)

In [69]:
nstlex_trans = transliterator_from_rules("nst_trans", NST_TRANSLIT)

In [44]:
assert swelex_trans.transliterate('""bA:n`s`$%ma$man') == "²bɑːɳʂ.ˌma.man"
assert swelex_trans.transliterate('"b9r$mIN$ham') == "ˈbør.mɪŋ.ham"
assert swelex_trans.transliterate('"bI$rU') == "ˈbɪ.rʊ"
assert swelex_trans.transliterate('""bIsp$%go:$d`en') == "²bɪsp.ˌɡoː.ɖen"

assert swelex_trans.transliterate('"x\\A:l') == "ˈɧɑːl"
assert swelex_trans.transliterate("\"s'u:$lens") == "ˈɕuː.lens"
assert swelex_trans.transliterate('a$"lE*U$te$n`a') == 'a.ˈle⁀ʊ.te.ɳa'
assert swelex_trans.transliterate('"fu0l') == 'ˈfɵl'

In [70]:
def collapse_available_fields(data):
    output = []
    for i in range(1, 10):
        if data[f"available_field{i}"] != "":
            output.append(data[f"available_field{i}"])
        del data[f"available_field{i}"]
    data["available_fields"] = output
    return data

In [71]:
def collapse_transliterations(data, transliterator):
    output = []
    for i in range(1, 5):
        if data[f"transliteration{i}"] != "":
            tmp = {}
            tmp["transliteration"] = data[f"transliteration{i}"]
            tmp["ipa"] = transliterator.transliterate(data[f"transliteration{i}"])
            tmp["certainty"] = data[f"certainty_trans_{i}"]
            tmp["status"] = data[f"status_trans_{i}"]
            tmp["language_code"] = data[f"language_code_trans_{i}"]
            output.append(tmp)
        del data[f"transliteration{i}"]
        del data[f"certainty_trans_{i}"]
        del data[f"status_trans_{i}"]
        del data[f"language_code_trans_{i}"]
    data["transliterations"] = output
    return data

In [56]:
import json
import io
with open("svlex.json", "w") as outf:
    swelexf = io.StringIO(data["sv"])
    swelex = csv.DictReader(swelexf, delimiter=';', fieldnames=field_names, quoting=csv.QUOTE_NONE)
    for row in swelex:
        row["decomp"] = [f for f in row["decomp"].split("+") if f != ""]
        row = collapse_available_fields(row)
        row = collapse_transliterations(row, swelex_trans)
        jsonstr = json.dumps(row)
        outf.write(jsonstr + "\n")

In [58]:
for lang in ["no", "da"]:
    with open(f"{lang}lex.json", "w", newline='') as outf:
        swelexf = io.StringIO(data[lang].strip())
        swelex = csv.DictReader(swelexf, delimiter=';', fieldnames=field_names, quoting=csv.QUOTE_NONE)
        for row in swelex:
            row["decomp"] = [f for f in row["decomp"].split("+") if f != ""]
            row = collapse_available_fields(row)
            row = collapse_transliterations(row, nstlex_trans)
            jsonstr = json.dumps(row)
            outf.write(jsonstr + "\n")

In [74]:
!tar zxvf /tmp/da_leksikon.tar.gz

dan030224NST.pron/
dan030224NST.pron/dan030224NST.pron_inspect.OUT
dan030224NST.pron/inspect_lex.pl
dan030224NST.pron/dan030224NST.pron


In [73]:
!ls /tmp

da_leksikon.tar.gz
dap_multiplexer.687d0263e489.root.log.INFO.20251024-145908.88
dap_multiplexer.INFO
debugger_yff5l2mct
initgoogle_syslog_dir.0
language_service.687d0263e489.root.log.ERROR.20251024-150829.1078
language_service.687d0263e489.root.log.INFO.20251024-150032.455
language_service.687d0263e489.root.log.INFO.20251024-150228.1060
language_service.687d0263e489.root.log.INFO.20251024-150228.1078
language_service.687d0263e489.root.log.INFO.20251024-164934.26685
language_service.687d0263e489.root.log.INFO.20251024-165054.27028
language_service.687d0263e489.root.log.INFO.20251024-165054.27046
language_service.ERROR
language_service.INFO
no.leksikon.tar.gz
pyright-1084-DjcjSoDsooOQ
pyright-26691-aOz2ngT1MmRO
pyright-27055-dxVMrKKXyn7I
pyright-27055-Yazr5YIAzMAT
pyright-462-C1t8ymrk3KDn
python-languageserver-cancellation
sv.leksikon.tar.gz
