### Make the mapping synset --> babelnet!

In [1]:
import csv
import json

# Build the lookup used by the later cells to translate gold keys.
# Every TSV row must hold exactly two tab-separated fields: the SECOND field
# becomes the dictionary key and the FIRST field becomes the stored value.
# A row with any other number of fields raises ValueError on unpacking.
synset2babel = {}
# newline="" lets the csv module do its own line-ending handling (as its
# documentation requires); the explicit encoding keeps the read independent
# of the platform's locale default.
with open("babelnet/sk2bn_id.tsv", "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
        key, value = row
        synset2babel[value] = key

# Number of distinct keys in the mapping (repeated keys keep the last row seen).
print(len(synset2babel))
with open("babelnet/synset2babel.json", "w", encoding="utf-8") as file:
    json.dump(synset2babel, file, indent=4)

212405


### Step I: convert the original ALLamended dataset to our format, filling in only the information we already have.

In [2]:
import json
import xml.etree.ElementTree as ET


gold_key_path = "xml/ALLamended.gold.key.txt"
xml_data_path = "xml/ALLamended.data.xml"
output_json_path = "ALLamended_preprocessed.json"

# Load the gold key file. Each usable line is whitespace-separated: the first
# field is the instance id and the remaining fields are its gold keys.
# Lines with fewer than two fields are skipped.
gold_key_mapping = {}
with open(gold_key_path, 'r', encoding="utf-8") as file:
    for line in file:
        parts = line.split()
        if len(parts) > 1:
            instance_id, senses = parts[0], parts[1:]
            gold_key_mapping[instance_id] = senses

# Parse the XML data
tree = ET.parse(xml_data_path)
root = tree.getroot()

preprocessed_data = []
for text in root.findall(".//text"):
    for sentence in text.findall(".//sentence"):
        # The context is the text of the sentence's direct <wf> and <instance>
        # children, in document order, joined by single spaces.
        tokens = [elem.text for elem in sentence if elem.tag in ("wf", "instance")]
        context = " ".join(tokens)

        # Emit one entry per direct <instance> child of the sentence.
        for instance in sentence.findall("instance"):
            instance_id = instance.get("id")
            word = instance.text
            lemma = instance.get("lemma")
            pos = instance.get("pos")
            # Translate the gold keys through synset2babel (built in the first
            # cell). An instance without gold keys gets an empty list; a gold
            # key missing from synset2babel raises KeyError.
            senses = [synset2babel[elem] for elem in gold_key_mapping.get(instance_id, [])]

            entry = {
                "id": instance_id,
                "text": context,
                "word": word,
                "lemma": lemma,
                "pos": pos,
                "gold": senses,
                # The three fields below are placeholders left empty here.
                "gold_definitions": [],
                "candidates": [],
                "definitions": []
            }
            preprocessed_data.append(entry)

# Save the data to JSON, using the path declared at the top of the cell so the
# declared output location and the file actually written cannot drift apart.
with open(output_json_path, 'w', encoding="utf-8") as outfile:
    json.dump(preprocessed_data, outfile, indent=4)


### Step II: fill "candidates" and "definitions" fields

In [3]:
# Load the entries written by Step I (a single JSON document holding a list).
with open("ALLamended_preprocessed.json", "r", encoding="utf-8") as file:
    all_amended_data = json.load(file)

# Read babelnet/output.jsonl: one JSON object per line.
output_entries = []
with open("babelnet/output.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        entry = json.loads(line)
        output_entries.append(entry)

# Build a lookup from (lemma, pos) to the list of {"id", "main_gloss"} dicts
# taken from the entry's "synsets".
lookup = {}
for entry in output_entries:
    key = (entry.get("lemma"), entry.get("pos"))
    # Assign a fresh list instead of extending an existing one: output.jsonl
    # contains duplicate (lemma, pos) entries, so the last one read wins.
    lookup[key] = [
        {
            "id": elem.get("id"),
            "main_gloss": elem.get("main_gloss")
        }
        for elem in entry.get("synsets")
    ]

# Update the "candidates" and "definitions" fields of every entry
for item in all_amended_data:
    key = (item.get("lemma"), item.get("pos"))
    matches = lookup.get(key, [])
    # It should never happen that a (lemma, pos) pair has no candidates.
    # Check truthiness: `matches is not []` compares identity against a brand
    # new list and is therefore always true, so it could never fail.
    assert matches, f"no candidates found for {key!r} (instance {item.get('id')!r})"
    item["candidates"] = [match["id"] for match in matches]
    item["definitions"] = [match["main_gloss"] for match in matches]

# Save the updated data back to the same JSON file
updated_file_path = "ALLamended_preprocessed.json"
with open(updated_file_path, "w", encoding="utf-8") as file:
    json.dump(all_amended_data, file, indent=4)
print(f"Updated file saved to: {updated_file_path}")


Updated file saved to: ALLamended_preprocessed.json


### Step III: add the gold definitions

In [4]:
# Reload the entries whose "candidates" and "definitions" are already filled.
with open("ALLamended_preprocessed.json", "r") as source:
    all_amended_data = json.load(source)

# For each gold id, collect the definition stored at the same index as every
# candidate equal to that gold id.
for record in all_amended_data:
    record["gold_definitions"] = [
        record["definitions"][position]
        for gold_id in record["gold"]
        for position, candidate_id in enumerate(record["candidates"])
        if candidate_id == gold_id
    ]

# Sanity check: every gold id must appear among the entry's candidates.
assert all(
    gold_id in record["candidates"]
    for record in all_amended_data
    for gold_id in record["gold"]
)

# Sanity check: no candidate id is listed twice within an entry.
for record in all_amended_data:
    candidate_ids = record["candidates"]
    assert len(set(candidate_ids)) == len(candidate_ids)

# Write the result back over the same file.
updated_file_path = "ALLamended_preprocessed.json"
with open(updated_file_path, "w") as sink:
    json.dump(all_amended_data, sink, indent=4)
print(f"Updated file saved to: {updated_file_path}")


Updated file saved to: ALLamended_preprocessed.json
