### Extract context
Extract context from a dictionary for embedding creation. If the context does not contain the targeted headword, different options are available to add it:

### Options
|option|description|example|
|---|---|---|
| 0 | leaves the examples as they are | a poor salary |
| 1 | HEADWORD: CONTEXT | inadequate: a poor salary |
| 2 | CONTEXT (HEADWORD) | a poor salary (inadequate) |
| 3 | CONTEXT, i.e., HEADWORD | a poor salary, i.e., inadequate |
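| 4 | replace the synonym of the headword found in the context with the HEADWORD (falls back to option 2 if no synonym is found) | a inadequate salary |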

### Usage
Set the desired dictionary file in the second cell of the notebook in the variable `dictionary_file`.
Choose the desired option for context replacement with the variable `transformation`.

### Disclaimer
This notebook was changed for each data type. The current version is for English gloss data only.
The imported helper function `get_indieces` does not work for Swedish contexts.
Furthermore, the fourth code cell extracts the glosses from the dictionary file; for the extraction of examples, this cell was adjusted accordingly.

In [1]:
import json
import os
import random
from nltk.corpus import wordnet as wn
from lemminflect import *
from random import randrange

import sys
sys.path.insert(1, '../helper_scripts')

from index_finder import *

In [2]:
# Dictionary (JSON) whose glosses are transformed; swap the comment to use the Swedish dictionary.
dictionary_file = "../data/dictionaries/wordnet_sense_id.json"
#dictionary_file = "../data/dictionaries/sw_dict_sense_id.json"

# Context transformation option (0-4), see the options table at the top of the notebook.
transformation = 3

# The output folder is named after the dictionary file: its base name up to the first dot.
output_file = "../data/outputs/dictionary_context/{}/gloss[{}].json".format(
    dictionary_file.rpartition("/")[2].partition(".")[0],
    transformation,
)

In [3]:
# transforms the context
def transform_context(lemma, context, sense_id):
    lemma = lemma.replace(" ", "_")
    match transformation:
        case 0:
            return context
        case 1:
            return f"{lemma.replace('_', ' ')}: {context}"
        case 2:
            return f"{context} ({lemma.replace('_', ' ')})"
        case 3:
            return f"{context}, i.e., {lemma.replace('_', ' ')}"
        case 4:
            synsets = []
            # get all synsets of the lemma
            for s in wn.synsets(lemma):
                if s.lemmas()[0].name().lower() == lemma:
                    synsets.append(s)

            target_synset = synsets[sense_id].lemma_names() # get headwords of the synsets
            target_synset = set(target_synset) # remove duplicates

            
            for l in target_synset:
                target = get_indieces(context, l)["word"] # search for the headword in the context
                if target != [-1, -1]:
                    word = context[target[0]:target[1]]
                    return context.replace(word, lemma)
                    
            return f"{context} ({lemma.replace('_', ' ')})" # if no headword is found, apply strategy 2

In [4]:
# extract gloss from dictionary entry
with open(dictionary_file) as f:
    dictionary = json.load(f)

    results = {}

    # keep track of statistics
    total_gloss = 0
    gloss_without_target = 0
    gloss_without_target_post_transformation = 0

    for entry in dictionary: # can be run on a subset of the dictionary to test functionality with low runtime "dictionary[:100]"
        lemma = entry["key"]
        results[lemma] = []
        for sense in entry["entries"]:
            sense_id = int(sense["identifier"].split(".")[-1].split('-')[-1]) # last digit of identifier is the position of the sense in the dictionary entry
            gloss = sense["sense"]
            total_gloss += 1
            gloss = transform_context(lemma, gloss, sense_id) # transform the gloss
            try:
                match transformation:
                    case 0: # no transformation
                        target = get_indieces(gloss, lemma)["word"]
                    case 1: # add lemma at the beginning
                        pos1 = 0
                        pos2 = len(lemma.replace("_", " "))
                        target = [pos1, pos2]
                    case 2: # add lemma at the end with brackets
                        pos1 = len(gloss) - len(lemma.replace("_", " ")) - 2 # -2 because of the brackets
                        pos2 = pos1 + len(lemma.replace("_", " ")) 
                        target = [pos1, pos2]
                    case 3: # add lemma at the end with i.e.,
                        pos1 = gloss.index(", i.e.,") + 8 # +8 because of the ', i.e.,'
                        pos2 = pos1 + len(lemma.replace("_", " "))
                        target = [pos1, pos2]
                    case 4: # replace the headword of the sense with the lemma
                        target = get_indieces(gloss, lemma)["word"]
            except ValueError:
                print(f"lemma: {lemma} not found in transformed gloss: {gloss}")
                target = [-1, -1]
            
            results[lemma].append({
                "sense": sense["identifier"],
                "usages":{
                    "usage": gloss,
                    "target": target
                    } 
                })
            
            # print progress
            if len(results) % 100 == 0 and len(results) > 0:
                print(f"{round(100 * len(results) / len(dictionary), 2)}%", end="\r")
            
    print(f"Total gloss: {total_gloss}")
    print(f"Gloss without target: {gloss_without_target}")
    print(f"Gloss without target after transformation: {gloss_without_target_post_transformation}")

Total gloss: 117659
Gloss without target: 0
Gloss without target after transformation: 0


In [5]:
# The output path contains a per-dictionary sub-folder that may not exist yet.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is fixed to UTF-8 instead of depending on the platform default.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)