# Convert NST pronunciation lexicons to JSON

> Converting the pronunciation to IPA along the way

- toc: false
- badges: true
- branch: master
- categories: [nst, swedish, danish, norwegian, pronunciation, icu]

Based on [this]({% post_url 2022-01-12-convert-nst-lexicon %})

> Set up field reading

In [1]:
import csv

In [2]:
!pip install pyicu

Collecting pyicu
  Downloading pyicu-2.15.3.tar.gz (267 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/267.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m266.2/267.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.6/267.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pyicu
  Building wheel for pyicu (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyicu: filename=pyicu-2.15.3-cp312-cp312-linux_x86_64.whl size=2720028 sha256=b231b9f44f3d4ba70ffc531143d89b6982bb5b0cdaf66027e87635f754e73654
  Stored in directory: /root/.cache/pip/wheels/46/11/aa/9777ed706b79bd6fbe41492e7a8dbbf0699e91e0173f7be151
Successfully bui

In [3]:
field_names = [
    "orthography",
    "extended_pos",
    "morphology",
    "decomp",
    "decpos",
    "source",
    "language_code",
    "garbage",
    "domain",
    "abbr_acr",
    "expansion",
    "transliteration1",
    "certainty_trans_1",
    "status_trans_1",
    "language_code_trans_1",
    "transliteration2",
    "certainty_trans_2",
    "status_trans_2",
    "language_code_trans_2",
    "transliteration3",
    "certainty_trans_3",
    "status_trans_3",
    "language_code_trans_3",
    "transliteration4",
    "certainty_trans_4",
    "status_trans_4",
    "language_code_trans_4",
    "auto_gen_variants",
    "set_id",
    "set_name",
    "style_status",
    "inflector_role",
    "lemma",
    "inflection_rule",
    "morph_label",
    "compounder_code",
    "semantic_info",
    "available_field1",
    "available_field2",
    "available_field3",
    "available_field4",
    "available_field5",
    "available_field6",
    "available_field7",
    "available_field8",
    "available_field9",
    "frequency",
    "original_orthography",
    "comment_field",
    "update_info",
    "unique_id"
]

> Get data

1.   [Swedish](https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-22/)
2.   [Danish](https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-26/)
3.   [Norwegian](https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-23/) (Bokmål)



In [4]:
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/sv.leksikon.tar.gz -O /tmp/sv.leksikon.tar.gz
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/da_leksikon.tar.gz -O /tmp/da_leksikon.tar.gz
!wget https://www.nb.no/sbfil/leksikalske_databaser/leksikon/no.leksikon.tar.gz -O /tmp/no.leksikon.tar.gz

--2025-10-24 15:07:18--  https://www.nb.no/sbfil/leksikalske_databaser/leksikon/sv.leksikon.tar.gz
Resolving www.nb.no (www.nb.no)... 158.39.129.53, 2001:700:f01:1071::53
Connecting to www.nb.no (www.nb.no)|158.39.129.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22041470 (21M) [application/octet-stream]
Saving to: ‘/tmp/sv.leksikon.tar.gz’


2025-10-24 15:07:22 (8.60 MB/s) - ‘/tmp/sv.leksikon.tar.gz’ saved [22041470/22041470]

--2025-10-24 15:07:22--  https://www.nb.no/sbfil/leksikalske_databaser/leksikon/da_leksikon.tar.gz
Resolving www.nb.no (www.nb.no)... 158.39.129.53, 2001:700:f01:1071::53
Connecting to www.nb.no (www.nb.no)|158.39.129.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5731447 (5.5M) [application/octet-stream]
Saving to: ‘/tmp/da_leksikon.tar.gz’


2025-10-24 15:07:24 (3.91 MB/s) - ‘/tmp/da_leksikon.tar.gz’ saved [5731447/5731447]

--2025-10-24 15:07:25--  https://www.nb.no/sbfil/leksikalske_databaser/leksi

In [6]:
import tarfile

data = {}

with tarfile.open("/tmp/sv.leksikon.tar.gz") as tar:
    f = tar.extractfile("NST svensk leksikon/swe030224NST.pron/swe030224NST.pron")
    prondata = f.read()
    data["sv"] = prondata.decode('latin1')
with tarfile.open("/tmp/no.leksikon.tar.gz") as tar:
    f = tar.extractfile("NSTs norske leksikon/nor030224NST.pron/nor030224NST.pron")
    prondata = f.read()
    data["no"] = prondata.decode('latin1')
with tarfile.open("/tmp/da_leksikon.tar.gz") as tar:
    f = tar.extractfile("dan030224NST.pron/dan030224NST.pron")
    prondata = f.read()
    data["da"] = prondata.decode('latin1')


> Set up transliterator

In [40]:
TRANSLIT_SV = """
n\` → ɳ ;
s\` → ʂ ;
l\` → ɭ ;
t\` → ʈ ;
d\` → ɖ ;
A → ɑ ;
O → ɔ ;
I → ɪ ;
E \* U → e \u2040 ʊ ;
E → ɛ ;
U → ʊ ;
Y → ʏ ;
2 → ø ;
9 → ø ;
u 0 → ɵ ;
N → ŋ ;
'""' → ² ;
'"' → ˈ ;
\% → ˌ ;
\: → ː ;
\$ → \. ;
g → ɡ ;
s \\\' → ɕ ;
x \\\\ → ɧ ;
\* → \u2040 ;
"""

  n\` → ɳ ;


In [8]:
NST_TRANSLIT = r"""
::XSampa-IPA;

\$ → \. ;

"""

In [9]:
import icu
def transliterator_from_rules(name, rules):
    fromrules = icu.Transliterator.createFromRules(name, rules)
    icu.Transliterator.registerInstance(fromrules)
    return icu.Transliterator.createInstance(name)

In [41]:
swelex_trans = transliterator_from_rules("swelex_trans", TRANSLIT_SV)

In [51]:
nstlex_trans = transliterator_from_rules("nst_trans", NST_TRANSLIT)

In [44]:
assert swelex_trans.transliterate('""bA:n`s`$%ma$man') == "²bɑːɳʂ.ˌma.man"
assert swelex_trans.transliterate('"b9r$mIN$ham') == "ˈbør.mɪŋ.ham"
assert swelex_trans.transliterate('"bI$rU') == "ˈbɪ.rʊ"
assert swelex_trans.transliterate('""bIsp$%go:$d`en') == "²bɪsp.ˌɡoː.ɖen"

assert swelex_trans.transliterate('"x\\A:l') == "ˈɧɑːl"
assert swelex_trans.transliterate("\"s'u:$lens") == "ˈɕuː.lens"
assert swelex_trans.transliterate('a$"lE*U$te$n`a') == 'a.ˈle⁀ʊ.te.ɳa'
assert swelex_trans.transliterate('"fu0l') == 'ˈfɵl'

In [45]:
def collapse_available_fields(data):
    output = []
    for i in range(1, 10):
        if data[f"available_field{i}"] != "":
            output.append(data[f"available_field{i}"])
        del data[f"available_field{i}"]
    data["available_fields"] = output
    return data

In [49]:
def collapse_transliterations(data, transliterator):
    output = []
    for i in range(1, 5):
        if data[f"transliteration{i}"] != "":
            tmp = {}
            tmp["transliteration"] = data[f"transliteration{i}"]
            tmp["ipa"] = transliterator.transliterate(data[f"transliteration{i}"])
            tmp["certainty"] = data[f"certainty_trans_{i}"]
            tmp["status"] = data[f"status_trans_{i}"]
            tmp["language_code"] = data[f"language_code_trans_{i}"]
            output.append(tmp)
        del data[f"transliteration{i}"]
        del data[f"certainty_trans_{i}"]
        del data[f"status_trans_{i}"]
        del data[f"language_code_trans_{i}"]
    data["transliterations"] = output
    return data

In [50]:
import json
import io
with open("svlex.json", "w") as outf:
    swelexf = io.StringIO(data["sv"])
    swelex = csv.DictReader(swelexf, delimiter=';', fieldnames=field_names, quoting=csv.QUOTE_NONE)
    for row in swelex:
        row["decomp"] = [f for f in row["decomp"].split("+") if f != ""]
        row = collapse_available_fields(row)
        row = collapse_transliterations(row, swelex_trans)
        jsonstr = json.dumps(row)
        outf.write(jsonstr + "\n")

In [53]:
for lang in ["da", "no"]:
    with open(f"{lang}lex.json", "w") as outf:
        swelexf = io.StringIO(data[lang])
        swelex = csv.DictReader(swelexf, delimiter=';', fieldnames=field_names, quoting=csv.QUOTE_NONE)
        for row in swelex:
            row["decomp"] = [f for f in row["decomp"].split("+") if f != ""]
            row = collapse_available_fields(row)
            row = collapse_transliterations(row, nstlex_trans)
            jsonstr = json.dumps(row)
            outf.write(jsonstr + "\n")

Error: new-line character seen in unquoted field - do you need to open the file with newline=''?