### Build annotation data
This notebook generates all data needed for the WSBest annotation on PhiTag:
- `senses.tsv` all senses that appear in the sample
- `usages.tsv` all usages that appear in the sample
- `instances.tsv` all possible combinations of senses and usages

The data is generated from the model predictions.

### Usage
Set the correct path to the model predictions

In [1]:
import csv
from spacy.symbols import IS_PUNCT
import json
from nltk.corpus import wordnet as wn
import pandas as pd

In [2]:
model_result_file = "../data/outputs/annotation_phase_2/eng_combined_data.json"
#model_result_file = "../data_sampling_2/data/results/100k_unassigned_prediction_sorted_reevaluated.json"
#model_result_file = "../data_sampling_2/data/annotation_data/sp_f3_150k_unassigned_prediction_sorted_reevaluated[700].json"
#model_result_file = "../data_sampling_2/data/annotation_data/combined_data.json"
#model_result_file = "../data_sampling_2/data/annotation_data/swe_combined_data.json"

dictionary_file = "../data/dictionaries/wordnet_sense_id.json"
#dictionary_file = "../data_analysis/data/dictionaries/sw_dict_sense_id.json"

sub_dir = "/ws_best_en/"

# Output
instances_file = f"../data/outputs/annotation_phase_2/{sub_dir}instances.tsv"
senses_file = f"../data/outputs/annotation_phase_2/{sub_dir}senses.tsv"
usages_file = f"../data/outputs/annotation_phase_2/{sub_dir}usages.tsv"

#instances_file = f"../data_sampling_2/data/annotation_data/swe_{model_result_file.split('/')[-1].split('.')[0]}_instances.tsv"
#senses_file = f"../data_sampling_2/data/annotation_data/swe_{model_result_file.split('/')[-1].split('.')[0]}_senses.tsv"
#usages_file = f"../data_sampling_2/data/annotation_data/swe_{model_result_file.split('/')[-1].split('.')[0]}_usages.tsv"

In [3]:
with open(model_result_file, "r") as f:
    data = json.load(f)

with open(dictionary_file, "r") as f:
    dictionary = json.load(f)

    global_corpus_id_modern = 0
    global_corpus_id_historical = 0

    instances = []
    senses = []
    usages = []

    # sort usages by similarity score
    data = sorted(data, key=lambda k: k['highest_similarity'])
    display(data[0:10])

    for usage in data:

        for entry in dictionary:
            if entry['key'] == usage['lemma']:
                dictionary_entry = entry
                break

        # add senses

        sense_ids = []

        for sense in dictionary_entry['entries']:
            if sense['sense'] != '':
                gloss = sense['sense']
                if gloss == '':
                    gloss = sense['senseSecondary']
                
                sense_ids.append(sense['identifier'])
                
                senses.append({
                    "senseID": sense['identifier'],
                    "definition": gloss,
                    "lemma": usage['lemma']
                })  

        # add usages
        if usage['corpus_id'] == "leipzig_swe_news":
            index = global_corpus_id_modern
            global_corpus_id_modern += 1
        else:
            index = global_corpus_id_historical
            global_corpus_id_historical += 1

        target = f"{usage['target'][0]}:{usage['target'][1]}"

        id = f"combined_sample_{usage['corpus_id']}-{usage['lemma']}-{index}-{target}"

        usages.append({
        "dataID": id,
        "context": usage['sentence'],
        "indices_target_token": target,
        "indices_target_sentence": f"0:{len(usage['sentence'])}",
        "lemma": usage['lemma']
        })

        if len(instances) > 1400:
            break

        # skip if more than 5 usages of that lemma are already in the dataset
        if len([u for u in usages if u['lemma'] == usage['lemma']]) > 3:
            #print(f"Skipping {usage['lemma']}, already in dataset")
            continue        

        for s in sense_ids:
            instances.append({
                "instanceID": f"{id}-{s}",
                "dataIDs": f"{id},{s}",
                "label_set": "0,1",
                "non_label": "-"
            })

[{'lemma': 'qualification',
  'sentence': "Slovan Bratislava's Champions League qualification tie against Faroe Islands side KI Klaksvik has been cancelled after a player from the Slovakian club tested positive for COVID19, European soccer's governing body UEFA said on Saturday.",
  'target': [37, 50],
  'highest_similarity': 0.173049004,
  'corpus_id': 'leipzig_eng_news'},
 {'lemma': 'qualification',
  'sentence': 'Streichs team is currently eighth in the 18team Bundesliga, two points behind Hoffenheim in the last place for European qualification.',
  'target': [120, 133],
  'highest_similarity': 0.1888046389,
  'corpus_id': 'leipzig_eng_news'},
 {'lemma': 'qualification',
  'sentence': 'But they have fallen away badly, winning only five times in the league in 2020 and they are in real danger of missing out on a Champions League qualification spot.',
  'target': [144, 157],
  'highest_similarity': 0.1922517948,
  'corpus_id': 'leipzig_eng_news'},
 {'lemma': 'qualification',
  'sentenc

In [4]:
# correct target
for u in usages:
    target = list(map(int, u["indices_target_token"].split(":")))
    context = u["context"]

    #print(context[target[0]:target[1]])
    if target[1] < len(context):
        while context[target[1]].isalpha():
            target[1] += 1
            if target[1] == len(context):
                break
    
    # print(context[target[0]:target[1]])

    u["indices_target_token"] = f"{target[0]}:{target[1]}"

In [5]:
import ast

print(f"Number of instances: {len(instances)}")
print(f"Number of senses: {len(senses)}")
print(f"Number of usages: {len(usages)}")

print(f"Number of unique lemmas: {len(set([u['lemma'] for u in usages]))}")
# reduce usages to maxium of 5 ranodm per lemma, less if there are less
usages = pd.DataFrame(usages)
usages = usages.groupby("lemma").apply(lambda x: x.sample(min(len(x), 5))).reset_index(drop=True)
print(f"Number of usages after reduction: {len(usages)}")

Number of instances: 495
Number of senses: 960
Number of usages: 424
Number of unique lemmas: 115
Number of usages after reduction: 247


In [6]:
# control for duplicates

# change istanceIDs (split - and move [1] to front)
instances = pd.DataFrame(instances)
instances = instances.drop_duplicates(subset=['instanceID'])
instances['instanceID'] = instances['instanceID'].apply(lambda x: x.split("-")[1] + "-" + x.replace(x.split("-")[1] + '-', ""))
# alphabetically sort instanceIDs
instances = instances.sort_values(by=['instanceID'])
display(instances.head(10))


instances = instances.drop_duplicates(subset=['instanceID'])
instances.to_csv(instances_file, sep='\t', index=False)


senses = pd.DataFrame(senses)
senses = senses.drop_duplicates(subset=['senseID'])
senses.to_csv(senses_file, sep='\t', index=False)

usages = pd.DataFrame(usages)
usages = usages.drop_duplicates(subset=['dataID'])
usages.to_csv(usages_file, sep='\t', index=False)

Unnamed: 0,instanceID,dataIDs,label_set,non_label
296,attend-combined_sample_ccoha1-272-46:52-attend...,"combined_sample_ccoha1-attend-272-46:52,attend...",1,-
297,attend-combined_sample_ccoha1-272-46:52-attend...,"combined_sample_ccoha1-attend-272-46:52,attend...",1,-
298,attend-combined_sample_ccoha1-272-46:52-attend...,"combined_sample_ccoha1-attend-272-46:52,attend...",1,-
299,attend-combined_sample_ccoha1-272-46:52-attend...,"combined_sample_ccoha1-attend-272-46:52,attend...",1,-
418,attend-combined_sample_ccoha1-368-38:44-attend...,"combined_sample_ccoha1-attend-368-38:44,attend...",1,-
419,attend-combined_sample_ccoha1-368-38:44-attend...,"combined_sample_ccoha1-attend-368-38:44,attend...",1,-
420,attend-combined_sample_ccoha1-368-38:44-attend...,"combined_sample_ccoha1-attend-368-38:44,attend...",1,-
421,attend-combined_sample_ccoha1-368-38:44-attend...,"combined_sample_ccoha1-attend-368-38:44,attend...",1,-
423,attend-combined_sample_ccoha1-373-62:68-attend...,"combined_sample_ccoha1-attend-373-62:68,attend...",1,-
424,attend-combined_sample_ccoha1-373-62:68-attend...,"combined_sample_ccoha1-attend-373-62:68,attend...",1,-


In [7]:
print(f"Instances: {len(instances)}")
# remove duplicate instances
instances = instances.drop_duplicates(subset=['instanceID'])
print(f"Instances: {len(instances)}")


Instances: 495
Instances: 495
