### Generate gold splits
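This notebook randomly marks senses as known or unknown, flags each filtered human-annotated usage as assigned or not assigned accordingly, and distributes the usages at random over cross-validation folds.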

### Usage
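Set the file paths and `number_of_folds` in the second code cell and run the notebook to create the split (five folds by default).
The third code cell draws and prints a new random seed on every run, so rerunning the notebook creates a different split; to reproduce an earlier split, replace the random draw in that cell with the printed seed.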

In [11]:
import json 
import random
import ast
from itertools import groupby

import numpy as np
from random import randrange

In [12]:
# --- Inputs -----------------------------------------------------------------
# The usage-embedding file to split and the sense dictionary file.
usage_file = "../data/outputs/FILTERED_english_assigned_usage_embeddings.json"
dictionary_file = "../data/dictionaries/wordnet_sense_id.json"
# Alternative dictionary file:
#dictionary_file = "../data/dictionaries/sw_dict_sense_id.json"

# --- Split settings ---------------------------------------------------------
number_of_folds = 5

# --- Output -----------------------------------------------------------------
# The output is named after the usage file: its last path component up to the
# first dot, followed by "_1.json".
usage_basename = usage_file.rsplit('/', 1)[-1].partition('.')[0]
output_file = f"../data/outputs/cross_validation/{usage_basename}_1.json"

In [13]:
# Draw a fresh seed in [0, 100000), seed Python's global `random` generator
# with it, and print it so this particular split can be reproduced later.
seed = random.randrange(100000)
random.seed(seed)
print(f"seed: {seed}")

seed: 88110


In [14]:
# Load the usage-embedding file into `usage_embeddings`.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open(usage_file, encoding="utf-8") as f:
    usage_embeddings = json.load(f)

print(f"number of usage embeddings: {len(usage_embeddings['data'])}")

number of usage embeddings: 326


In [15]:
# Read the sense dictionary and collect, without duplicates, the identifier of
# every sense whose "examples" list is not empty, together with the key of the
# lemma that sense belongs to.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open(dictionary_file, "r", encoding="utf-8") as f:
    dictionary = json.load(f)

lemma_keys = set()
sense_ids = set()
for lemma_entry in dictionary:
    for sense_entry in lemma_entry["entries"]:
        # Compared against [] on purpose (not truthiness), as before: only an
        # empty list excludes a sense.
        if sense_entry["examples"] != []:
            sense_ids.add(sense_entry["identifier"])
            lemma_keys.add(lemma_entry["key"])

# Exposed as lists; the element order is arbitrary.
dictionary_lemmas = list(lemma_keys)
dictionary_senses = list(sense_ids)

In [16]:
# Pull the three sense lists out of the loaded file.
# They are copied rather than aliased: a later cell appends to
# `mandatory_known` in place, and without the copy that would also modify
# `usage_embeddings["mandatory_known"]`, so re-running this cell would not
# restore the lists as loaded.
mandatory_known = list(usage_embeddings["mandatory_known"])
print(f"number of mandatory known: {len(mandatory_known)}")
mandatory_unknown = list(usage_embeddings["mandatory_unknown"])
free_senses = list(usage_embeddings["free"])

number of mandatory known: 38


In [17]:
# For every lemma that has free senses, move one randomly chosen sense into
# `mandatory_known`; the senses that are not chosen stay in `free_senses`.
# The lemma of a sense id is the part before its first ".". Sorting the ids
# first makes all ids of one lemma adjacent, which groupby() requires.
free_senses_by_lemma = [
    list(group)
    for _, group in groupby(sorted(free_senses), key=lambda sense_id: sense_id.split(".")[0])
]
print(len(free_senses))

for lemma_senses in free_senses_by_lemma:
    picked_index = random.randrange(len(lemma_senses))
    # Appends in place, so `mandatory_known` grows by one sense per lemma.
    mandatory_known.append(lemma_senses.pop(picked_index))

# Drop lemmas whose only free sense was just promoted, then flatten the rest.
free_senses_by_lemma = [lemma_senses for lemma_senses in free_senses_by_lemma if lemma_senses]
free_senses = [sense_id for lemma_senses in free_senses_by_lemma for sense_id in lemma_senses]

print(f"known: {len(mandatory_known)}")
print(f"unknown: {len(mandatory_unknown)}")
print(f"free: {len(free_senses)}")

162
known: 141
unknown: 41
free: 59


In [18]:
# Every sense that is still free at this point is treated as unknown: build a
# new list holding the mandatory unknown senses followed by the free ones.
mandatory_unknown = [*mandatory_unknown, *free_senses]

print(f"known: {len(mandatory_known)}")
print(f"unknown: {len(mandatory_unknown)}")

known: 141
unknown: 100


In [19]:
# Flag each usage: "assigned" is True when at least one of its gold senses is
# not in `mandatory_unknown`, and False when all of them are.
# NOTE(review): a usage with an empty "gold_senses" list ends up with
# assigned = False — confirm that this is intended.
assigned = 0
for usage in usage_embeddings["data"]:
    has_sense_outside_unknown = any(
        sense not in mandatory_unknown for sense in usage["gold_senses"]
    )
    usage["assigned"] = has_sense_outside_unknown
    if has_sense_outside_unknown:
        assigned += 1

not_assigned = len(usage_embeddings['data']) - assigned
print(f"assigned: {assigned}")
print(f"not assigned: {not_assigned}")

assigned: 262
not assigned: 64


In [20]:
# Shuffle the usages in place (the shuffled order is what gets written out
# later) and cut them into `number_of_folds` nearly equal parts.
random.shuffle(usage_embeddings["data"])
folds = np.array_split(usage_embeddings["data"], number_of_folds)

# One counter per fold, derived from `number_of_folds` so that the cell also
# works when the configured number of folds is not 5.
unknowns = {fold_index: 0 for fold_index in range(number_of_folds)}

# Tag every usage with its fold index and count the unassigned usages per fold.
# This relies on the fold elements being the same dict objects as in
# usage_embeddings["data"], so that the "fold" key is present in the data
# that is written out later.
for fold_index, fold in enumerate(folds):
    for usage in fold:
        usage["fold"] = fold_index
        if not usage["assigned"]:
            unknowns[fold_index] += 1

# Share of unassigned usages per fold; an empty fold (more folds than usages)
# is reported as nan instead of raising ZeroDivisionError.
for fold_index, fold in enumerate(folds):
    unknown_share = unknowns[fold_index] / len(fold) if len(fold) else float("nan")
    print(f"fold {fold_index}: {unknown_share:.8f}")

fold 0: 0.15151515
fold 1: 0.16923077
fold 2: 0.26153846
fold 3: 0.21538462
fold 4: 0.18461538


In [21]:
# Write the split to `output_file` as JSON.
# The payload is built before the file is opened, so an error while building
# it cannot leave a truncated output file behind.
split_payload = {
    # Lemma = part of a sense id before its first "."; sorted so that the same
    # seed always produces the same file (a plain set has no stable order).
    "assigned_lemmas": sorted({sense.split('.')[0] for sense in mandatory_known + mandatory_unknown}),
    "unknown_senses": mandatory_unknown,
    "seed": seed,
    "data": usage_embeddings["data"],
}

# ensure_ascii=False emits non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly; the platform's locale encoding may be unable to
# encode them.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(split_payload, f, indent=4, ensure_ascii=False)