In [1]:
# general imports
import json

#### Dictionary analysis

In [2]:
with open("../data/dictionaries/wordnet_sense_id.json") as f: # load wordnet sense id
    wn_sense_id = json.load(f)

with open("../data/dictionaries/sw_dict_sense_id.json") as f: # load sw sense id
    sw_sense_id = json.load(f)

In [3]:
print("Swedish Dictionary")

headwords = []
for h in sw_sense_id:
    headwords.append(h)
print(f"headwords: {len(headwords)}")

multi_sense_headwords = []
multi_example_senses = []
senses = []
s_gloss = []
gloss = []
s_examples = []
examples = []
for h in headwords:
    senses.extend(s['identifier'] for s in h['entries'])
    if len(h['entries']) > 1:
        multi_sense_headwords.append(h)
    for s in h['entries']:
        if s['sense'] != "":
            s_gloss.append(s['identifier'])
            gloss.append(s['sense'])
        if len(s['examples']) > 1:
            multi_example_senses.append(s['identifier'])
        for e in s['examples']:
            s_examples.append(s['identifier'])
            examples.extend(s['examples'])

senses = list(set(senses))
s_gloss = list(set(s_gloss))
s_examples = list(set(s_examples))

print(f"senses: {len(senses)}")
print(f"avg senses per headword: {len(senses)/len(headwords)}")
print(f"avg senses per headword with multiple senses: {(len(senses) - (len(headwords) - len(multi_sense_headwords)))/len(multi_sense_headwords)}")
print(f"proportion of senses with gloss: {len(s_gloss)/len(senses)}")
print(f"avg length of gloss: {sum([len(g) for g in gloss])/len(gloss)}")
print(f"proportion of senes with examples: {len(s_examples)/len(senses)}")
print(f"avg number of examples per sense: {len(examples)/len(senses)}")
print(f"avg examples per sense with examples: {len(examples)/len(s_examples)}")
print(f"avg length of example: {sum([len(e) for e in examples])/len(examples)}")


Swedish Dictionary
headwords: 41597
senses: 68086
avg senses per headword: 1.6368007308219343
avg senses per headword with multiple senses: 2.9106318522792844
proportion of senses with gloss: 0.7896924477866228
avg length of gloss: 34.28279427901873
proportion of senes with examples: 0.9999559380783127
avg number of examples per sense: 3.366771436124901
avg examples per sense with examples: 3.3669197890809746
avg length of example: 32.44161758932077


In [4]:
print("Wordnet")

headwords = []
gloss_senses = []
example_senes = []
multi_sense_headwords = []
senses = []
glosses = []
examples = []

for lemma in wn_sense_id:
    headwords.append(lemma["key"])
    if len(lemma["entries"]) > 1:
        multi_sense_headwords.append(lemma["key"])
    for s in lemma["entries"]:
        senses.append(s["identifier"])
        if s["sense"] != "":
            gloss_senses.append(s["identifier"])
            glosses.append(s["sense"])
        for e in s["examples"]:
            example_senes.append(s["identifier"])
            examples.append(e)

entries_with_gloss = len(set(gloss_senses))

example_senes = list(set(example_senes))


print(f"headwords: {len(headwords)}")
print(f"avg senses per headword: {len(senses)/len(headwords)}")
print(f"avg senses per headword with multiple senses: {len(senses)/len(multi_sense_headwords)} ({len(multi_sense_headwords)})")
print(f"proportion of senses with gloss: {entries_with_gloss/len(senses)}")
print(f"avg length of gloss: {sum([len(g) for g in glosses])/len(glosses)}")
print(f"proportion of senes with examples: {len(example_senes)} ({len(example_senes)/len(senses)*100 :.2f}%)")
print(f"avg number of examples per sense: {len(examples)/len(senses)}")
print(f"avg number of examples per sense with examples: {len(examples)/len(example_senes)}")
print(f"avg length of example: {sum([len(e) for e in examples])/len(examples)}")


Wordnet
headwords: 86555
avg senses per headword: 1.3593553232048987
avg senses per headword with multiple senses: 7.526322522868291 (15633)
proportion of senses with gloss: 1.0
avg length of gloss: 59.520674151573616
proportion of senes with examples: 32923 (27.98%)
avg number of examples per sense: 0.4108397997603243
avg number of examples per sense with examples: 1.468244084682441
avg length of example: 34.169324975692504
