### Sort training data
This notebook filters out all usages from the first human annotation that cannot be used for model tuning.

### Usage
Set the according paths in the third cell:
- `usages_csv`: Path to the CSV file containing majority votes for each usage
- `usage_embeddings_file`: Path to the json file containing the embeddings for each usage
- `dictionary_file`: Path to the dictionary

In [4]:
import json 
import random
import ast
import pandas as pd
from random import randrange

In [5]:
# generate random seed
seed = randrange(100000)
print(f"seed: {seed}")
random.seed(seed)

seed: 45246


In [6]:
usages_csv = "../data/annotation_results/phase1/english_assigned.csv"
#usages_csv = "../data/annotation_results/phase1/swedish_assigned.csv"

## English
usage_embeddings_file = "../data/outputs/usage_embeddings/english_assigned_usage_embeddings.json"
#usage_embeddings_file = "../data/outputs/usage_embeddings/english_assigned_usage_embeddings[SUB].json"

## Swedish
#usage_embeddings_file = "../data/outputs/usage_embeddings/swedish_assigned_usage_embeddings.json"
#usage_embeddings_file = "../data/outputs/usage_embeddings/swedish_assigned_usage_embeddings[SUB].json"

dictionary_file = "../data/dictionaries/wordnet_sense_id.json"
#dictionary_file = "../data/dictionaries/sw_dict_sense_id.json"
output_file = f"../data/outputs/FILTERED_{usage_embeddings_file.split('/')[-1].split('.')[0]}.json"

In [7]:
# load all senses represented in the usages
usages_df = pd.read_csv(usages_csv, sep='\t', index_col=0)
senses = list(set(sum([ast.literal_eval(s) for s in usages_df["sense"].unique().tolist()], [])))

print(len(senses))

318


In [8]:
with open(dictionary_file, "r") as f:
    dictionary = json.load(f)

In [9]:
# remove senses where no sense of the lemma has an example or gloss
for lemma in dictionary:
    examples = []
    glosses = []
    for sense in lemma["entries"]:
        examples.extend(sense["examples"]) # all examples of the lemma
        glosses.append(sense["sense"]) # all glosses of the lemma
    if len(examples) == 0 or len(glosses) == 0:
       for sense in lemma["entries"]:
           if sense["identifier"] in senses:
               senses.remove(sense["identifier"])

print(len(senses))            

241


In [10]:
mandatory_known = []
mandatory_unknown = []
free = []

for lemma in dictionary:
    for sense in lemma["entries"]:
        if sense["identifier"] in senses:
            if sense["sense"] == "" or sense["examples"] == []: # if no gloss is given, the sense has to be unknown
                mandatory_unknown.append(sense["identifier"])

            elif len([s for s in lemma["entries"] if s["sense"] != ""]) > 1 and len([s for s in lemma["entries"] if s["examples"] != []]): # if there are multiple senses with gloss, the sense is free
                free.append(sense["identifier"])
            else:
                mandatory_known.append(sense["identifier"]) # if there are no other senses with examples, the sense is mandatory known

mandatory_known = list(set(mandatory_known))
mandatory_unknown = list(set(mandatory_unknown))
free = list(set(free))

print(f"mandatory known: {len(mandatory_known)}")
print(f"mandatory unknown: {len(mandatory_unknown)}")
print(f"free: {len(free)}")

mandatory known: 38
mandatory unknown: 41
free: 162


In [12]:
results = []
#senses = list(set(senses) - set(removed))
with open(usage_embeddings_file, "r") as f:
    usage = json.load(f)

    for u in usage:
        if any(s in senses for s in u["gold_senses"]):
            
            results.append(u)

print(len(results))

326


In [14]:
with open(output_file, "w") as f:
    json.dump({
        "mandatory_known": mandatory_known, 
        "mandatory_unknown": mandatory_unknown, 
        "free": free,
        "data": results        
        }, f, indent=4, ensure_ascii=False)