### Generate WSBest
This notebook generates all data needed for the WSBest annotation on PhiTag:
- `senses.tsv` all senses that appear in the sample
- `usages.tsv` all usages that appear in the sample
- `instances.tsv` all combinations of a usage with each sense of its lemma
- `config.json` the seed and sample size of the sample the data was generated from

### Usage
In the second cell, the variables `dictionary_file` and `sample_usage_file` need to be set to the respective files.

For `sample_usage_file`, the reduced sample file should be used so that the sample is appropriate for the WSBest annotation. The reduced sample file can be created with the notebook `reduce_sample.ipynb`.

Since the notebook creates several files, the output for each type of data is written to its own subfolder of `../data/outputs/annotation_phase_1/`. For this, the variable `target_dir` needs to be set to the name of the desired target directory.

In [1]:
import csv
import json
import os

In [4]:
# Sense dictionary (JSON) that the senses are extracted from.
# Activate exactly one of the following lines.
dictionary_file = "../data/dictionaries/wordnet_sense_id.json"
#dictionary_file = "../data/dictionaries/sw_dict_sense_id.json"

# Reduced sample of usages (JSON); the commented-out lines are alternative inputs.
sample_usage_file = "../data/outputs/REDUCED_SAMPLE_eng_news_2020_1M-sentences.json"
#sample_usage_file = "../data/outputs/REDUCED_SAMPLE_ccoha1.json"

#sample_usage_file = "../data/outputs/REDUCED_SAMPLE_swe_news_2022_1M-sentences.json"
#sample_usage_file = "../data/outputs/REDUCED_SAMPLE_kubhist2a.json"

# Name of the subfolder of ../data/outputs/annotation_phase_1/ that receives
# the generated files.
# NOTE(review): presumably dictionary, sample and target_dir have to be switched
# together so that language and corpus match -- confirm.
target_dir = "ws_best_en_modern"
#target_dir = "ws_best_en_historical"
#target_dir = "ws_best_sw_modern"
#target_dir = "ws_best_sw_historical"

In [5]:
# load sample, extract lemmas, usages and config data
# The file is decoded explicitly as UTF-8 so that non-ASCII sentences are read
# identically on every platform instead of depending on the locale's default.
with open(sample_usage_file, "r", encoding="utf-8") as f:
    sample_usage = json.load(f)

# parameters the sample was created with; reused by later cells
seed = sample_usage["seed"]
sample_size = sample_usage["sample_size"]
max_senses = sample_usage["max_senses"]

usages = []  # one record per sampled usage
lemmas = []  # lemma of every usage, in sample order (may contain repeats)

# build one usage record per sample item: dataID, context, indices_target_token,
# indices_target_sentence and lemma
for use in sample_usage["sample"]:
    context = use["sentence"]
    lemma = use["lemma"]

    lemmas.append(lemma)
    usages.append({"dataID": use["identifier"],
                   "context": context,
                   "indices_target_token": use["character_index_sentence"],
                   # span covering the entire context string
                   "indices_target_sentence": f"0:{len(context)}",
                   "lemma": lemma})

In [6]:
# load dictionary and extract senses
# Explicit UTF-8 keeps non-ASCII definitions independent of the platform's
# default encoding.
with open(dictionary_file, "r", encoding="utf-8") as f:
    dictionary = json.load(f)

# `lemmas` holds one item per usage (with repeats); a set gives constant-time
# membership checks instead of scanning the list for every dictionary entry.
# Assumes lemmas are strings (hashable) as loaded from the sample JSON.
sample_lemmas = set(lemmas)

senses = []  # one record per usable sense: senseID, definition, lemma

for entry in dictionary:
    headword = entry["key"]
    # keep only headwords that occur in the sample; as a sanity check, skip
    # headwords that have more than max_senses senses
    if headword not in sample_lemmas or len(entry["entries"]) > max_senses:
        continue

    for sense in entry["entries"]:
        if sense["sense"] == "":  # skip senses with empty definition
            continue
        senses.append({"senseID": sense["identifier"],
                       "definition": sense["sense"],
                       "lemma": headword})

In [7]:
# create instances
# Group the sense IDs by lemma once, so that every usage only visits the senses
# of its own lemma instead of scanning the complete sense list. The order of
# the senses within a lemma is preserved, so the resulting instance order is
# the same as with a nested scan.
sense_ids_by_lemma = {}
for sense in senses:
    sense_ids_by_lemma.setdefault(sense["lemma"], []).append(sense["senseID"])

label_set = '0,1'  # identical for every instance

instances = []  # one record per (usage, sense of the usage's lemma) pair

for use in usages:
    dataID = use["dataID"]

    # usages whose lemma has no extracted sense contribute no instances
    for senseID in sense_ids_by_lemma.get(use["lemma"], []):
        instances.append({"instanceID": f"{dataID}-{senseID}",
                          "dataIDs": f"{dataID},{senseID}",
                          "label_set": label_set,
                          "non_label": "-"})

In [8]:
# write usages as single file
output_dir = f"../data/outputs/annotation_phase_1/{target_dir}"
os.makedirs(output_dir, exist_ok=True)  # the target subfolder may not exist yet

# Columns are listed explicitly (same names and order as the usage records) so
# that the header can be written even when there are no usages.
usage_columns = ["dataID", "context", "indices_target_token",
                 "indices_target_sentence", "lemma"]

# newline="" is required by the csv module (otherwise line endings get
# translated a second time on some platforms); UTF-8 keeps non-ASCII contexts
# independent of the platform's default encoding.
with open(f"{output_dir}/usages.tsv", "w", newline="", encoding="utf-8") as f:
    # NOTE(review): the backslash quotechar is kept as is; presumably it was
    # chosen so that double quotes inside sentences are written verbatim --
    # confirm against the format PhiTag expects.
    writer = csv.DictWriter(f, fieldnames=usage_columns,
                            delimiter='\t', quotechar='\\')
    writer.writeheader()
    writer.writerows(usages)

In [10]:
# write senses as single file
output_dir = f"../data/outputs/annotation_phase_1/{target_dir}"
os.makedirs(output_dir, exist_ok=True)  # the target subfolder may not exist yet

# Columns are listed explicitly (same names and order as the sense records) so
# that the header can be written even when there are no senses.
sense_columns = ["senseID", "definition", "lemma"]

# newline="" is required by the csv module; UTF-8 keeps non-ASCII definitions
# independent of the platform's default encoding.
with open(f"{output_dir}/senses.tsv", "w", newline="", encoding="utf-8") as f:
    # NOTE(review): this writer uses the csv default quotechar ("), so a
    # definition containing a double quote or a tab is written quoted --
    # confirm that this is what PhiTag expects.
    writer = csv.DictWriter(f, fieldnames=sense_columns, delimiter='\t')
    writer.writeheader()
    writer.writerows(senses)

In [11]:
# write instances as single file
output_dir = f"../data/outputs/annotation_phase_1/{target_dir}"
os.makedirs(output_dir, exist_ok=True)  # the target subfolder may not exist yet

# Columns are listed explicitly (same names and order as the instance records)
# so that the header can be written even when no instance was generated, e.g.
# because no sense matched any usage.
instance_columns = ["instanceID", "dataIDs", "label_set", "non_label"]

# newline="" is required by the csv module; the encoding is fixed so the file
# does not depend on the platform's default.
with open(f"{output_dir}/instances.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=instance_columns, delimiter='\t')
    writer.writeheader()
    writer.writerows(instances)

In [12]:
# generate config.json
output_dir = f"../data/outputs/annotation_phase_1/{target_dir}"
os.makedirs(output_dir, exist_ok=True)  # the target subfolder may not exist yet

# parameters of the sample that the generated files are based on
# NOTE(review): max_senses is also read from the sample and used to filter the
# dictionary, but it is not recorded here -- confirm whether it should be.
config = {
    "seed": seed,
    "sample_size": sample_size
}

with open(f"{output_dir}/config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)