### Model prediction
This notebook calculates the similarity between each sampled word usage and all eligible senses of its lemma.
A usage is predicted to be unassigned (not represented by any sense) when the similarity to its closest sense falls below a threshold.

### Usage
Adjust the variables in the second code cell (the configuration cell):
Set the file paths to the sampled word usage file from the `sample_data.ipynb` notebook, the sense embeddings of the desired model, and the sense dictionary.
Set `threshold` to the tuned value and the `sim_measure` variable to either `cosine` or `spearmanr`.
Set the spaCy model to the language model that matches the language of the corpus.

In [None]:
import numpy as np
import pandas as pd
import itertools
import ast
import json
import sys
import spacy
import re
import scipy

# Put the WordTransformer source directory on the module search path so the
# InputExample and WordTransformer modules imported below can be found
# (assumes the xl-lexeme repository is checked out next to this project — TODO confirm).
sys.path.insert(1, '../xl-lexeme/WordTransformer')

from InputExample import InputExample
from WordTransformer import WordTransformer

# Created once here; the prediction loop further down calls model.encode(...) on it.
model = WordTransformer('pierluigic/xl-lexeme') # load model

In [None]:
# --- input files ---
# sampled word usages (from sample_data.ipynb), sense embeddings of the model, sense dictionary
sentence_file = "../data/outputs/annotation_phase_2/SAMPLE_eng_news_2020_1M-sentences[150000].csv"
model_embeddings = "../data/outputs/sense_embeddings/english/wordnet_sense_id_gloss_[3]_embeddings.json"
dictionary_file = "../data/dictionaries/wordnet_sense_id.json"

# --- prediction settings ---
sim_measure = "cosine"  # either "cosine" or "spearmanr"
threshold = 0.411       # tuned value; usages scoring below it are treated as unassigned

# --- spaCy pipeline ---
# for Swedish data load "sv_core_news_sm" instead
nlp = spacy.load("en_core_web_sm")

# --- output file ---
# name pattern: <corpus file name up to the first '['>-<embedding file name up to the first '.'>
corpus_type = sentence_file.rsplit('/', 1)[-1].partition('[')[0]
model_name = model_embeddings.rsplit('/', 1)[-1].partition('.')[0]
output_file = "../data/outputs/model_predictions/" + corpus_type + "-" + model_name + "_predictions.csv"

In [None]:
# load sentence sample
SAMPLE_SIZE = 100000  # upper bound on the number of sentences that are processed
SAMPLE_SEED = 42      # fixed seed so every run draws the same sentences

df = pd.read_csv(sentence_file, sep="\t")
# DataFrame.sample raises ValueError when more rows are requested than exist
# (without replacement), so cap the request at the size of the file
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
df["sentence"] = df["sentence"].apply(lambda x: re.sub(r"\s+", " ", x).strip())  # collapse whitespace runs

# unique lemmas over all sampled sentences; each "lemmas" cell is a string that
# ast.literal_eval turns into an iterable of lemmas
lemmas = list({lemma for raw in df["lemmas"] for lemma in ast.literal_eval(raw)})
print("Number of sentences: ", len(df))
print("Number of lemmas: ", len(lemmas))
display(df.head(3))
print(lemmas[:10])

In [None]:
# read the sense embeddings (a JSON object keyed by lemma)
with open(model_embeddings, "r") as embedding_file:
    embeddings = json.load(embedding_file)

# only lemmas that occur in the sample AND have sense embeddings can be predicted
sampled_lemma_set = set(lemmas)
embedded_lemma_set = set(embeddings.keys())
possible_lemmas = list(sampled_lemma_set.intersection(embedded_lemma_set))

print("possible lemmas ", len(possible_lemmas))
print(possible_lemmas[:10])

In [None]:
# remove all lemmas from the lists that are not possible to predict
possible_lemma_set = set(possible_lemmas)  # set lookup instead of scanning the list for every lemma


def _keep_predictable(raw_lemmas):
    """Return the lemmas of one sentence that have sense embeddings.

    Accepts either the string form read from the CSV or an already parsed
    list, so that re-running this cell does not fail in ast.literal_eval.
    """
    parsed = ast.literal_eval(raw_lemmas) if isinstance(raw_lemmas, str) else raw_lemmas
    return [lemma for lemma in parsed if lemma in possible_lemma_set]


df["lemmas"] = df["lemmas"].apply(_keep_predictable)
# drop sentences that have no predictable lemma left; copy so that later column
# assignments operate on an independent frame
df = df[df["lemmas"].apply(len) > 0].copy()
print("Number of sentences after removing lemmas not in embeddings: ", len(df))
display(df.head(3))

In [None]:
def similarity(a, b, measure=None):
    """Return a similarity score between two vectors (higher means more alike).

    Parameters
    ----------
    a, b : sequences of numbers of equal length
    measure : "cosine" or "spearmanr"; when None, the notebook-level
        ``sim_measure`` setting is used.

    Returns
    -------
    The cosine similarity, or the Spearman rank correlation coefficient.

    Raises
    ------
    ValueError
        If the measure is neither "cosine" nor "spearmanr".
    """
    # import the submodules explicitly; on older SciPy versions a bare
    # `import scipy` does not make scipy.spatial / scipy.stats available
    from scipy.spatial.distance import cosine as cosine_distance
    from scipy.stats import spearmanr

    if measure is None:
        measure = sim_measure
    if measure == "cosine":
        # SciPy returns the cosine *distance* (1 - similarity); convert it so that
        # larger values mean "closer", consistent with the spearmanr branch
        return 1.0 - cosine_distance(a, b)
    if measure == "spearmanr":
        return spearmanr(a, b)[0]
    raise ValueError(f"unknown sim_measure: {measure!r}")

In [None]:
def get_similarities(lemma, embedding):
    """Find the sense of `lemma` that scores highest against a usage embedding.

    Only senses with a non-empty "usages" list are considered. Scores are
    computed with the notebook-level `similarity` function.

    Parameters
    ----------
    lemma : key into the notebook-level `embeddings` mapping
    embedding : vector of the word usage to compare against each sense

    Returns
    -------
    (closest_sense, closest_similarity, closest_pos)
        closest_sense is None and closest_similarity is 0 when no sense scores
        above 0. closest_pos is the spaCy POS tag of the first token of the
        lemma, or None if spaCy yields no token.
    """
    candidate_senses = [s for s in embeddings[lemma] if s["usages"] != []]
    closest_similarity = 0
    closest_sense = None

    for sense in candidate_senses:
        # the local must not be named `similarity`: that would make the name local
        # to the whole function and raise UnboundLocalError on this very call
        score = similarity(sense["embedding"], embedding)
        if score > closest_similarity:
            closest_sense = sense["sense"]
            closest_similarity = score

    pos_tags = [t.pos_ for t in nlp(lemma)]
    closest_pos = pos_tags[0] if pos_tags else None
    return (closest_sense, closest_similarity, closest_pos)

In [None]:
# collect the keys of all dictionary lemmas that list more than 10 entries
high_sense_lemmas = []
with open(dictionary_file, "r") as dictionary_handle:
    dict_sense_id = json.load(dictionary_handle)

    for lemma in dict_sense_id:
        entry_count = len(list(lemma["entries"]))
        if entry_count <= 10:
            continue
        high_sense_lemmas.append(lemma["key"])

print("Number of lemmas with more than 10 senses: ", len(high_sense_lemmas))
display(high_sense_lemmas[:3])
print(lemma["key"])  # key of the last dictionary item that was iterated

In [None]:
# pre fill the unassigned usages dict with all lemmas and senses
all_lemmas = list({lemma for sentence_lemmas in df["lemmas"] for lemma in sentence_lemmas})
print(len(all_lemmas))
print(all_lemmas[:10])

# sets make the membership tests below constant-time instead of one list scan per dictionary item
all_lemma_set = set(all_lemmas)
high_sense_lemma_set = set(high_sense_lemmas)

unassigned_usages = {}
for lemma in dict_sense_id:
    key = lemma["key"]
    # keep only lemmas that occur in the sample and do not have more than 10 senses
    if key not in all_lemma_set or key in high_sense_lemma_set:
        continue
    # NOTE(review): the "sense:" key carries a trailing colon; it is kept unchanged
    # because it ends up in the written JSON — confirm whether "sense" was intended
    senses = [{"sense:": s["identifier"], "definition": s["sense"]} for s in lemma["entries"]]
    unassigned_usages[key] = {"senses": senses, "usages": []}

# show a small preview instead of dumping the whole dictionary into the cell output
print("Number of lemmas prepared: ", len(unassigned_usages))
display(dict(itertools.islice(unassigned_usages.items(), 3)))

In [None]:
MAX_USAGES_PER_LEMMA = 500  # stop collecting usages for a lemma once this many are stored

instances = df[["sentence", "lemmas"]].apply(tuple, axis=1)
#display(instances.head(3))
inst = sum(len(sentence_lemmas) for _, sentence_lemmas in instances)
print("Number of instances: ", inst)
n_sentences = len(instances)
skipped_lemmas = set(high_sense_lemmas)  # lemmas with more than 10 senses

# The loop variable is `sentence_lemmas` (not `lemmas`) so that the notebook-level
# list of unique lemmas is not overwritten as a side effect of this cell.
for i, (sentence, sentence_lemmas) in enumerate(instances):  # for all sentence lemma pairs
    doc = nlp(sentence)
    for lemma in sentence_lemmas:
        if lemma == ' ' or lemma in skipped_lemmas:  # empty lemma / more than 10 senses
            continue
        bucket = unassigned_usages.get(lemma)
        if bucket is None:
            # no entry was prepared for this lemma, so a prediction could not be
            # stored anyway; skip before paying for the embedding
            continue
        if len(bucket["usages"]) >= MAX_USAGES_PER_LEMMA:
            continue

        # first token whose lemma matches; use the token's own character offset
        # rather than searching the sentence for the token text, which can hit an
        # earlier substring of another word
        token = next((t for t in doc if t.lemma_ == lemma), None)
        if token is None:
            continue
        target = [token.idx, token.idx + len(token.text)]

        embedding = model.encode(InputExample(texts=sentence, positions=target))
        sims = get_similarities(lemma, embedding)
        if 0 < sims[1] < threshold:  # tuned threshold
            bucket["usages"].append({
                "sentence": sentence,
                "target": target,
                # plain float so that json.dump can serialise the score
                "closest": (sims[0], float(sims[1])),
            })

    # progress is counted in sentences, so it is reported relative to the number of sentences
    if i % 100 == 0:
        print(f"{i / n_sentences * 100:.2f}% done", end="\r")

# remove lemmas with no unassigned usages
unassigned_usages = {k: v for k, v in unassigned_usages.items() if len(v["usages"]) > 0}

print("Number of lemmas with unassigned usages: ", len(unassigned_usages))
with open("../data/results/swe_hist_sp_f3_100k_unassigned_prediction.json", "w") as f:
    json.dump(unassigned_usages, f, indent=4, ensure_ascii=False)

In [None]:
# read the stored unassigned-usage predictions back from disk
predictions_path = "../data/results/swe_hist_sp_f3_100k_unassigned_prediction.json"
with open(predictions_path) as predictions_file:
    unassigned_usages = json.load(predictions_file)

In [None]:
complete_covered = []

for lemma in dict_sense_id:
    entries = lemma["entries"]
    with_definition = [g for g in entries if g["sense"] != ""]
    # 1 when at least one entry has an empty "sense" text, 0 when every entry has one
    missing_flag = int(len(with_definition) != len(entries))
    complete_covered.append((lemma['key'], missing_flag))
    if lemma['key'] == "avsnitt":
        # inspection output for one specific lemma
        print(complete_covered[-1])
        print(lemma)
        print(with_definition)

# stable ascending sort on the flag: lemmas with flag 0 come first
complete_covered.sort(key=lambda pair: pair[1])
print(complete_covered[1150:1155])

In [None]:
# list the lemmas that have unassigned usages in the order of complete_covered,
# attaching the missing-sense flag to each of them
sorted_unassigned_usages = [
    {
        "lemma": lemma_key,
        "missing_synsets": missing_flag,
        "unassigned_usages": unassigned_usages[lemma_key],
    }
    for lemma_key, missing_flag in complete_covered
    if lemma_key in unassigned_usages
]
display(sorted_unassigned_usages[6:7])

In [None]:
def _closest_similarity(usage):
    """Return the score stored next to the closest sense of a usage."""
    return usage["closest"][1]


# within every lemma, order the usages from highest to lowest score
for entry in sorted_unassigned_usages:
    bucket = entry["unassigned_usages"]
    bucket["usages"] = sorted(bucket["usages"], key=_closest_similarity, reverse=True)

In [None]:
# write the sorted predictions to the configured output path
# NOTE(review): JSON is written here although the configured output_file name ends in ".csv" — confirm the intended extension
with open(output_file, "w") as out_handle:
    json.dump(
        sorted_unassigned_usages,
        out_handle,
        indent=4,
        ensure_ascii=False,
    )