# Homonymy Disambiguation (HD) with Large Language Models

## **Creation of the resource** <br> make *test* dataset compatible with LLMantics framework

In [1]:
# new dataset for HD with LLMs
import json
data_path = "data/homonyms/cluster2fine_map.json"
with open(data_path, "r") as json_file:
    d = json.load(json_file)

sense_key2definition = {}
for v in d.values():
    for e in v:
        sense_key2definition[e[0]] = e[1]
        
cluster_sense_key2definition = {}       
for k,v in d.items():
    new_value = [e[1] for e in v]
    cluster_sense_key2definition[k] = new_value

data_path = f"data/homonyms/test.json"
with open(data_path, "r") as json_file:
    gold_data = json.load(json_file)
ris = []
for id_,item in list(gold_data.items()):
    if type(item["instance_ids"]) == dict:
        text = " ".join(item["words"])
        for instance_idx, instance_id in item["instance_ids"].items():
            id_ = instance_id
            word = item["words"][int(instance_idx)]
            pos = item["pos_tags"][int(instance_idx)]
            lemma = item["lemmas"][int(instance_idx)]
            
            gold = item["senses"][instance_idx]
            candidates = item["wn_candidates"][instance_idx]
            gold_definitions = [sense_key2definition[g] for g in gold]
            definitions = [sense_key2definition[c] for c in candidates]
            
            cluster_gold = item["gold_clusters"][instance_idx]
            cluster_candidates = item["candidate_clusters"][instance_idx]
            cluster_gold_definitions = [cluster_sense_key2definition[cg] for cg in cluster_gold]
            cluster_definitions = [cluster_sense_key2definition[cc] for cc in cluster_candidates]
            
            ris.append({"id":id_, "text":text, "word":word, "lemma":lemma, "pos":pos, \
                    "gold":gold, "gold_definitions":gold_definitions, "candidates":candidates, "definitions":definitions, \
                    "cluster_gold":cluster_gold, "cluster_gold_definitions":cluster_gold_definitions, "cluster_candidates":cluster_candidates, "cluster_definitions":cluster_definitions
                    })
    else:
        text = " ".join(item["example_tokens"])
        word = item["example_tokens"][item["instance_ids"][0]]
        pos = item["pos"]
        lemma = item["lemma"]
        
        gold = [item["synset_name"]]
        candidates = item["wn_candidates"]
        gold_definitions = [sense_key2definition[g] for g in gold]
        definitions = [sense_key2definition[c] for c in candidates]
        
        cluster_gold = [item["cluster_name"]]
        cluster_candidates = item["candidate_clusters"]
        cluster_gold_definitions = [cluster_sense_key2definition[cg] for cg in cluster_gold]
        cluster_definitions = [cluster_sense_key2definition[cc] for cc in cluster_candidates]
        
        ris.append({"id":id_, "text":text, "word":word, "lemma":lemma, "pos":pos, \
                "gold":gold, "gold_definitions":gold_definitions, "candidates":candidates, "definitions":definitions, \
                "cluster_gold":cluster_gold, "cluster_gold_definitions":cluster_gold_definitions, "cluster_candidates":cluster_candidates, "cluster_definitions":cluster_definitions
                })

with open(f"data/LLMantics/test.json", 'w') as json_file:
    json.dump(ris, json_file, indent=4)
    
print(len(ris))

17206


**Creation of INSTANCE IDs list for test data subsets**

In [4]:
for test_data in ["test_FGA", "test_HA", "test_HA_p"]:
    data_path = f"data/homonyms/subsets/{test_data}.json"
    with open(data_path, "r") as json_file:
        gold_data = json.load(json_file)
    instance_ids_list = []
    for id_,item in list(gold_data.items()):
        if type(item["instance_ids"]) == dict:
            text = " ".join(item["words"])
            for instance_idx, instance_id in item["instance_ids"].items():
                instance_ids_list.append(instance_id)
        else: 
            instance_ids_list.append(id_)
    with open(f"data/LLMantics/subsets/{test_data}_ids.txt", "w") as file:
        for instance_id in instance_ids_list:
            file.write(instance_id + "\n")

## **Homonyms** vs **LLMantics** <br> test datasets

* Understand the origin of the two datasets.
* Do they intersect and by which amount?
* Do they have the same statistics? (*numbers of instances*, *average sense cardinality*,  *MFS and random baseline scores*)

### **LLMantics** train: *SemCor* | test: *ALL by Raganato et al.*

In [5]:
import json

llmantics_ids = []
data_path = "data/LLMantics/ALL_preprocessed.json"
with open(data_path, "r") as json_file:
    llmantics_dict = json.load(json_file)

for elem in llmantics_dict:
    llmantics_ids.append(elem["id"])

print(f"Total instances of ALL dataset: {len(llmantics_ids)}")
print("(that is the dataset used by LLMantics benchmark)\n")

print(f"The dataset is subdivided in the following way:")
subsets = {"senseval2" : 0, "senseval3" : 0, "semeval2007" : 0, "semeval2013" : 0, "semeval2015" : 0}
for elem in llmantics_ids:
    words = elem.split(".")[0].split("_")
    if "senseval2" in words:
        subsets["senseval2"] += 1
    elif "senseval3" in words:
        subsets["senseval3"] += 1
    elif "semeval2007" in words:
        subsets["semeval2007"] += 1
    elif "semeval2013" in words:
        subsets["semeval2013"] += 1
    elif "semeval2015" in words:
        subsets["semeval2015"] += 1
print(subsets)

Total instances of ALL dataset: 7253
(that is the dataset used by LLMantics benchmark)

The dataset is subdivided in the following way:
{'senseval2': 2282, 'senseval3': 1850, 'semeval2007': 455, 'semeval2013': 1644, 'semeval2015': 1022}


### **Homonyms** train: 253.276 | test: 17.206 | dev: 8.195

In [6]:
import json

print("Data statistics for the dataset used in our homonyms paper:\n")
homonymy_ids = {"train" : [], "test" : [], "dev" : []}
subsets_dict = {"train" : [], "test" : [], "dev" : []}
for split in ["train", "test", "dev"]:

    data_path = f"data/homonyms/{split}.json"
    with open(data_path, "r") as json_file:
        d = json.load(json_file)
        
    subsets = {"semcor" : 0, "senseval2" : 0, "senseval3" : 0, "semeval2007" : 0, "semeval2013" : 0, "semeval2015" : 0, "wn_examples" : 0}
    for k,v in d.items():
        if type(v["instance_ids"]) == dict:
            subset_name = k.split(".")[0].split("_")
            for idx, elem in v["instance_ids"].items():
                if "semcor" in subset_name:
                    subsets["semcor"] += 1
                    homonymy_ids[split].append(f"semcor_{elem}")
                    continue 
                elif "senseval2" in subset_name:
                    subsets["senseval2"] += 1
                elif "senseval3" in subset_name:
                    subsets["senseval3"] += 1
                elif "semeval" in subset_name:
                    subsets["semeval2007"] += 1
                    homonymy_ids[split].append(f"semeval2007.{elem}")
                    continue 
                elif "semeval2013" in subset_name:
                    subsets["semeval2013"] += 1
                elif "semeval2015" in subset_name:
                    subsets["semeval2015"] += 1
                homonymy_ids[split].append(elem)
        else:
            subsets["wn_examples"] += 1
            homonymy_ids[split].append(k)
    subsets_dict[split] = subsets
    tot = 0
    for v in subsets.values(): tot += v
    assert len(homonymy_ids[split]) == tot
    
    print(f"[{split}]")
    print(f"Total instances: {len(homonymy_ids[split])}")
    print(subsets_dict[split])
    print()
    
print("The four subsets that concatenated form the final dataset (train+test+dev):")
ris = {"SemCor" : 0, "WordNet examples" : 0, "ALL_new" : 0, "SemEval-2007" : 0}
for split in ["train", "test", "dev"]:
    for k,v in subsets_dict[split].items():
        if k == "semcor": ris["SemCor"] +=v
        elif k=="wn_examples": ris["WordNet examples"] +=v
        elif k=="senseval2" or k=="senseval3" or k=="semeval2013" or k=="semeval2015": ris["ALL_new"] +=v
        elif k=="semeval2007": ris["SemEval-2007"] +=v
print(ris)

Data statistics for the dataset used in our homonyms paper:

[train]
Total instances: 253276
{'semcor': 201819, 'senseval2': 1372, 'senseval3': 1173, 'semeval2007': 423, 'semeval2013': 999, 'semeval2015': 688, 'wn_examples': 46802}

[test]
Total instances: 17206
{'semcor': 16419, 'senseval2': 184, 'senseval3': 142, 'semeval2007': 32, 'semeval2013': 91, 'semeval2015': 30, 'wn_examples': 308}

[dev]
Total instances: 8195
{'semcor': 7798, 'senseval2': 89, 'senseval3': 60, 'semeval2007': 0, 'semeval2013': 62, 'semeval2015': 27, 'wn_examples': 159}

The four subsets that concatenated form the final dataset (train+test+dev):
{'SemCor': 226036, 'WordNet examples': 47269, 'ALL_new': 4917, 'SemEval-2007': 455}


> To sum up: we have $ALL_{new}$ dataset that has $4917$ instances (without including *SemEval-2007* that has $455$). $ALL$ dataset ($7253$ instances) has been reduced by $1881$ items (we simply acknowledge that). <br> $7253(-455) - 4917 = 1881$

#### **HA (Homonymy Ambiguous)**<br> test subset with only instances that have more than one candidate homonymy cluster

**Generate *test_HA.json***

In [7]:
import json
data_path = "data/homonyms/test.json"
with open(data_path, "r") as json_file:
    gold_data = json.load(json_file)

test_HA = {}
for id_,value in gold_data.items():
    if type(value["instance_ids"]) == dict:
        instance_ids = {}
        senses = {}
        wn_candidates = {}
        gold_clusters = {}
        candidate_clusters = {}
        for instance_idx, instance_id in value["instance_ids"].items():
            if len(value["candidate_clusters"][instance_idx]) == 1: continue
            instance_ids[instance_idx] = value["instance_ids"][instance_idx]
            senses[instance_idx] = value["senses"][instance_idx]
            wn_candidates[instance_idx] = value["wn_candidates"][instance_idx]
            gold_clusters[instance_idx] = value["gold_clusters"][instance_idx]
            candidate_clusters[instance_idx] = value["candidate_clusters"][instance_idx]
        if len(instance_ids.keys()) == 0: continue
        test_HA[id_] = {"instance_ids" : instance_ids, "lemmas" : value["lemmas"], "pos_tags" : value["pos_tags"], \
                        "senses" : senses, "words" : value["words"], "wn_candidates" : wn_candidates, \
                        "gold_clusters" : gold_clusters, "candidate_clusters" : candidate_clusters
                       }
    else:
        if value["candidate_clusters"] == 1: continue
        test_HA[id_] = value
with open("data/homonyms/subsets/test_HA.json", 'w') as json_file:
    json.dump(test_HA, json_file, indent=4)

# number of instances
data_path = "data/homonyms/subsets/test_HA.json"
with open(data_path, "r") as json_file:
    gold_data = json.load(json_file)
ris = 0
for id_,value in gold_data.items():
    if type(value["instance_ids"]) == dict:
        ris+=len(value["instance_ids"].keys())
    else:
        ris+=1
print(f"Test_HA number of instances: {ris}")


Test_HA number of instances: 2265


#### **HA_p** <br> only the instances whose candidate homonymy clusters contain at least one sense that has a vector representation in training set

> This subset is of 609 instances and is well defined in *test_dictionary_lemma.pkl* file. In the following there's the code to specify such dataset! Basically we need to keep only the test instances where each candidate homonyms cluster is present at least once as gold cluster in the training set.

In [8]:
# we first generate a list with all unique (lemma, pos, gold_cluster) tuple present in training set.
# This will help us in checking the presence of a particular candidate test cluster in training set.

import json
train_data_path = f"data/homonyms/train.json"
with open(train_data_path, "r") as json_file:
    d = json.load(json_file)
    
pos2pos = {"v":"VERB", "n":"NOUN", "a":"ADJ", "r":"ADV"}
gold_clusters_list = []
for k,v in d.items():
    if type(v["instance_ids"]) == dict:
        for idx, elem_list in v["gold_clusters"].items():
            for elem in elem_list:
                gold_clusters_list.append((v["lemmas"][int(idx)],v["pos_tags"][int(idx)],elem))
    else:
        gold_clusters_list.append((v["lemma"], pos2pos[v["pos"]], v["cluster_name"]))

gold_clusters_list = list(set(gold_clusters_list))
print(len(gold_clusters_list))
print(gold_clusters_list[:-10])

35286


In [9]:
# now I select the instances that satisfy the condition just described.

import json
test_data_path = f"data/homonyms/test.json"
with open(test_data_path, "r") as json_file:
    d = json.load(json_file)

ris = {}
for k,v in d.items():
    if type(v["instance_ids"]) == dict:
        l = []
        for idx, elem_list in v["candidate_clusters"].items():
            if len(elem_list) == 1: continue
            b = True
            for elem in elem_list:
                if (v["lemmas"][int(idx)],v["pos_tags"][int(idx)],elem) not in gold_clusters_list: b=False
            if b == True: l.append(v["instance_ids"][idx])
        if l!=[]: ris[k] = l
    else:
        if len(v["candidate_clusters"]) == 1: continue
        b=True
        for candidate in v["candidate_clusters"]:
            if (v["lemma"], pos2pos[v["pos"]], candidate) not in gold_clusters_list: b=False
        if b == True: ris[k] = []

In [10]:
import pickle
from collections import Counter
file_path = 'data/homonyms/test_dictionary_lemma.pkl'
with open(file_path, 'rb') as file:
    data_dict = pickle.load(file)
ris_original, lengths = [], []
for k,v in data_dict.items():
    lengths.append(len(v))
    if len(v) == 0:
        ris_original.append(k)
    for elem in v:
        ris_original.append(elem)
print(Counter(lengths))
print(len(ris_original))

data_dict = ris
ris, lengths = [], []
for k,v in data_dict.items():
    lengths.append(len(v))
    if len(v) == 0:
        ris.append(k)
    for elem in v:
        ris.append(elem)
print(Counter(lengths))
print(len(ris))

Counter({1: 456, 0: 82, 2: 28, 3: 5})
609
Counter({1: 464, 0: 87, 2: 28, 3: 5})
622


> There's a mismatch of 13 between *test_dictionary_lemma.pkl* and instances selected by me (we acknowledge that).

**Generate *test_HA_p.json***

In [11]:
import json


file_path = 'data/homonyms/test_dictionary_lemma.pkl'
with open(file_path, 'rb') as file:
    data_dict = pickle.load(file)
        
data_path = "data/homonyms/subsets/test_HA.json"
with open(data_path, "r") as json_file:
    gold_data = json.load(json_file)

test_HA_p = {}
for id_,value in gold_data.items():
    if type(value["instance_ids"]) == dict:
        if id_ not in data_dict.keys(): continue
        instance_ids = {}
        senses = {}
        wn_candidates = {}
        gold_clusters = {}
        candidate_clusters = {}
        instances_to_keep = data_dict[id_]
        for instance_idx, instance_id in value["instance_ids"].items():
            if instance_id not in instances_to_keep: continue
            instance_ids[instance_idx] = value["instance_ids"][instance_idx]
            senses[instance_idx] = value["senses"][instance_idx]
            wn_candidates[instance_idx] = value["wn_candidates"][instance_idx]
            gold_clusters[instance_idx] = value["gold_clusters"][instance_idx]
            candidate_clusters[instance_idx] = value["candidate_clusters"][instance_idx]
        if len(instance_ids.keys()) == 0: continue
        test_HA_p[id_] = {"instance_ids" : instance_ids, "lemmas" : value["lemmas"], "pos_tags" : value["pos_tags"], \
                        "senses" : senses, "words" : value["words"], "wn_candidates" : wn_candidates, \
                        "gold_clusters" : gold_clusters, "candidate_clusters" : candidate_clusters
                       }
    else:
        if id_ not in data_dict.keys(): continue
        test_HA_p[id_] = value
with open("data/homonyms/subsets/test_HA_p.json", 'w') as json_file:
    json.dump(test_HA_p, json_file, indent=4)

# number of instances
data_path = "data/homonyms/subsets/test_HA_p.json"
with open(data_path, "r") as json_file:
    gold_data = json.load(json_file)
ris = 0
for id_,value in gold_data.items():
    if type(value["instance_ids"]) == dict:
        ris+=len(value["instance_ids"].keys())
    else:
        ris+=1
print(f"Test_HA_p number of instances: {ris}")

Test_HA_p number of instances: 609


#### **FGA (Fine Grained Ambiguous)**<br> test subset with only instances that have more than one fine-grained candidate

In [12]:
import json
data_path = "data/homonyms/test.json"
with open(data_path, "r") as json_file:
    gold_data = json.load(json_file)

test_FGA = {}
for id_,value in gold_data.items():
    if type(value["instance_ids"]) == dict:
        instance_ids = {}
        senses = {}
        wn_candidates = {}
        gold_clusters = {}
        candidate_clusters = {}
        for instance_idx, instance_id in value["instance_ids"].items():
            if len(value["wn_candidates"][instance_idx]) == 1: continue
            instance_ids[instance_idx] = value["instance_ids"][instance_idx]
            senses[instance_idx] = value["senses"][instance_idx]
            wn_candidates[instance_idx] = value["wn_candidates"][instance_idx]
            gold_clusters[instance_idx] = value["gold_clusters"][instance_idx]
            candidate_clusters[instance_idx] = value["candidate_clusters"][instance_idx]
        if len(instance_ids.keys()) == 0: continue
        test_FGA[id_] = {"instance_ids" : instance_ids, "lemmas" : value["lemmas"], "pos_tags" : value["pos_tags"], \
                        "senses" : senses, "words" : value["words"], "wn_candidates" : wn_candidates, \
                        "gold_clusters" : gold_clusters, "candidate_clusters" : candidate_clusters
                       }
    else:
        if value["wn_candidates"] == 1: continue
        test_FGA[id_] = value
with open("data/homonyms/subsets/test_FGA.json", 'w') as json_file:
    json.dump(test_FGA, json_file, indent=4)

# number of instances
data_path = "data/homonyms/subsets/test_FGA.json"
with open(data_path, "r") as json_file:
    gold_data = json.load(json_file)
ris = 0
for id_,value in gold_data.items():
    if type(value["instance_ids"]) == dict:
        ris+=len(value["instance_ids"].keys())
    else:
        ris+=1
print(f"Test_FGA number of instances: {ris}")

Test_FGA number of instances: 14172


### **Intersection** between *test* and *ALL*

In [13]:
# the only test instances that intersect with 'ALL_preprocessed.json' are senseval2, senseval3, semeval2007, semeval2013 and semeval2015
# Indeed, a total of 479 instances.

intersect_count = 0
for id_ in llmantics_ids:
    if id_ in homonymy_ids["test"]:
        intersect_count += 1
print(f"Test instances included in ALL: {intersect_count}")

Test instances included in ALL: 479


### **Data Statistics**

#### ALL

In [14]:
############################
# average ambiguity degree #
############################
import json
import numpy as np
with open("data/LLMantics/ALL_preprocessed.json", 'r') as file:
    data = json.load(file)
avd_amb_deg_list = []
for instance in data:
    avd_amb_deg_list.append(len(instance["candidates"]))
print(f"Average fine-grained ambiguity degree: {np.asarray(avd_amb_deg_list).mean()}")

############################
# MFS and RANDOM baselines #
############################
from sklearn.metrics import f1_score
import random
random.seed(99)
with open("data/LLMantics/ALL_preprocessed.json", 'r') as file:
    gold_data = json.load(file)

NUM_INSTANCES = len(gold_data)
true_labels = [1 for _ in range(NUM_INSTANCES)]
random_predicted_labels = [1 for _ in range(NUM_INSTANCES)]
mfs_predicted_labels = [1 for _ in range(NUM_INSTANCES)]
random_correct, random_wrong = 0,0
mfs_correct, mfs_wrong = 0,0
global_idx = 0
for instance_gold in gold_data:
    
    # adds n) before all candidate and gold definitions
    for idx, definition in enumerate(instance_gold["definitions"]):
        for idx_, gold_definition in enumerate(instance_gold["gold_definitions"]):
            if definition == gold_definition:
                instance_gold["gold_definitions"][idx_] = f"{idx}) {instance_gold['gold_definitions'][idx_]}"
    for idx, definition in enumerate(instance_gold["definitions"]):
        instance_gold["definitions"][idx] = f"{idx}) {definition}"
    
    random_index = random.randint(0, len(instance_gold["definitions"])-1)
    random_definition = instance_gold["definitions"][random_index]
    mfs_definition = instance_gold["definitions"][0]
    
    # random
    if random_definition in instance_gold["gold_definitions"]: random_correct += 1
    else: random_predicted_labels[global_idx] = 0; random_wrong += 1
    # mfs
    if mfs_definition in instance_gold["gold_definitions"]: mfs_correct += 1
    else: mfs_predicted_labels[global_idx] = 0; mfs_wrong += 1
    global_idx += 1
    
assert random_correct+random_wrong == NUM_INSTANCES
assert mfs_correct+mfs_wrong == NUM_INSTANCES
random_f1 = f1_score(true_labels, random_predicted_labels, average='micro')
mfs_f1 = f1_score(true_labels, mfs_predicted_labels, average='micro')

print()
print("-----")
print()
print("Total number of instances:", len(gold_data))
print("Number of RANDOM correctly classified instances:", random_correct)
print("Number of RANDOM incorrectly classified instances:", random_wrong)
print("RANDOM F1 Score (average=micro):", random_f1)
print()
print("-----")
print()
print("Number of MFS correctly classified instances:", mfs_correct)
print("Number of MFS incorrectly classified instances:", mfs_wrong)
print("MFS F1 Score (average=micro):", mfs_f1)

Average fine-grained ambiguity degree: 5.876464911071281





-----

Total number of instances: 7253
Number of RANDOM correctly classified instances: 2792
Number of RANDOM incorrectly classified instances: 4461
RANDOM F1 Score (average=micro): 0.38494416103681234

-----

Number of MFS correctly classified instances: 4728
Number of MFS incorrectly classified instances: 2525
MFS F1 Score (average=micro): 0.6518681924720805


#### test

In [15]:
def compute_mfs_and_random_baselines(data_path, num_instances, compute_fine_grained=True, compute_homonyms=True):
    NUM_INSTANCES = num_instances
    if compute_fine_grained:
        ###########################################
        # MFS and RANDOM baselines (fine-grained) #
        ###########################################
        from sklearn.metrics import f1_score
        import random
        random.seed(99)
        with open(data_path, 'r') as file:
            gold_data = json.load(file)

        true_labels = [1 for _ in range(NUM_INSTANCES)]
        random_predicted_labels = [1 for _ in range(NUM_INSTANCES)]
        mfs_predicted_labels = [1 for _ in range(NUM_INSTANCES)]
        random_correct, random_wrong = 0,0
        mfs_correct, mfs_wrong = 0,0
        global_idx = 0
        for id_,value in data.items():
            if type(value["instance_ids"]) == dict:
                for key, candidates_list in value["wn_candidates"].items():
                    random_index = random.randint(0, len(candidates_list)-1)
                    random_definition = candidates_list[random_index]
                    mfs_definition = candidates_list[0]
                    # random
                    if random_definition in value["senses"][key]: random_correct += 1
                    else: random_predicted_labels[global_idx] = 0; random_wrong += 1
                    # mfs
                    if mfs_definition in value["senses"][key]: mfs_correct += 1
                    else: mfs_predicted_labels[global_idx] = 0; mfs_wrong += 1
                    global_idx += 1
            else:
                random_index = random.randint(0, len(value["wn_candidates"])-1)
                random_definition = value["wn_candidates"][random_index]
                mfs_definition = value["wn_candidates"][0]
                # random
                if random_definition == value["synset_name"]: random_correct += 1
                else: random_predicted_labels[global_idx] = 0; random_wrong += 1
                # mfs
                if mfs_definition == value["synset_name"]: mfs_correct += 1
                else: mfs_predicted_labels[global_idx] = 0; mfs_wrong += 1
                global_idx += 1
                
        assert random_correct+random_wrong == NUM_INSTANCES
        assert mfs_correct+mfs_wrong == NUM_INSTANCES
        random_f1 = f1_score(true_labels, random_predicted_labels, average='micro')
        mfs_f1 = f1_score(true_labels, mfs_predicted_labels, average='micro')

        print()
        print("-----")
        print("fine-grained")
        print()
        print("Total number of instances:", len(gold_data))
        print("Number of RANDOM correctly classified instances:", random_correct)
        print("Number of RANDOM incorrectly classified instances:", random_wrong)
        print("RANDOM F1 Score (average=micro):", random_f1)
        print()
        print("Number of MFS correctly classified instances:", mfs_correct)
        print("Number of MFS incorrectly classified instances:", mfs_wrong)
        print("MFS F1 Score (average=micro):", mfs_f1)

    if compute_homonyms:
        #######################################
        # MFS and RANDOM baselines (homonyms) #
        #######################################
        from sklearn.metrics import f1_score
        import random
        random.seed(99)
        with open(data_path, 'r') as file:
            gold_data = json.load(file)

        true_labels = [1 for _ in range(NUM_INSTANCES)]
        random_predicted_labels = [1 for _ in range(NUM_INSTANCES)]
        mfs_predicted_labels = [1 for _ in range(NUM_INSTANCES)]
        random_correct, random_wrong = 0,0
        mfs_correct, mfs_wrong = 0,0
        global_idx = 0
        for id_,value in data.items():
            if type(value["instance_ids"]) == dict:
                for key, candidates_list in value["candidate_clusters"].items():
                    random_index = random.randint(0, len(candidates_list)-1)
                    random_definition = candidates_list[random_index]
                    mfs_definition = candidates_list[0]
                    # random
                    if random_definition in value["gold_clusters"][key]: random_correct += 1
                    else: random_predicted_labels[global_idx] = 0; random_wrong += 1
                    # mfs
                    if mfs_definition in value["gold_clusters"][key]: mfs_correct += 1
                    else: mfs_predicted_labels[global_idx] = 0; mfs_wrong += 1
                    global_idx += 1
            else:
                random_index = random.randint(0, len(value["candidate_clusters"])-1)
                random_definition = value["candidate_clusters"][random_index]
                mfs_definition = value["candidate_clusters"][0]
                # random
                if random_definition == value["cluster_name"]: random_correct += 1
                else: random_predicted_labels[global_idx] = 0; random_wrong += 1
                # mfs
                if mfs_definition == value["cluster_name"]: mfs_correct += 1
                else: mfs_predicted_labels[global_idx] = 0; mfs_wrong += 1
                global_idx += 1
                
        assert random_correct+random_wrong == NUM_INSTANCES
        assert mfs_correct+mfs_wrong == NUM_INSTANCES
        random_f1 = f1_score(true_labels, random_predicted_labels, average='micro')
        mfs_f1 = f1_score(true_labels, mfs_predicted_labels, average='micro')

        print()
        print("-----")
        print("homonyms")
        print()
        print("Total number of instances:", len(gold_data))
        print("Number of RANDOM correctly classified instances:", random_correct)
        print("Number of RANDOM incorrectly classified instances:", random_wrong)
        print("RANDOM F1 Score (average=micro):", random_f1)
        print()
        print("Number of MFS correctly classified instances:", mfs_correct)
        print("Number of MFS incorrectly classified instances:", mfs_wrong)
        print("MFS F1 Score (average=micro):", mfs_f1)

In [16]:
# average ambiguity degree
import json
import numpy as np
with open("data/homonyms/test.json", 'r') as file:
    data = json.load(file)
avg_FG_amb_deg_list, avg_H_amb_deg_list = [], []
for id_,value in data.items():
    if type(value["instance_ids"]) == dict:
        for candidates_list in value["wn_candidates"].values():
            avg_FG_amb_deg_list.append(len(candidates_list))
        for candidates_list in value["candidate_clusters"].values():
            avg_H_amb_deg_list.append(len(candidates_list))
    else:
        avg_FG_amb_deg_list.append(len(value["wn_candidates"]))
        avg_H_amb_deg_list.append(len(value["candidate_clusters"]))
print(f"Average fine-grained ambiguity degree: {np.asarray(avg_FG_amb_deg_list).mean()}")
print(f"Average homonyms ambiguity degree: {np.asarray(avg_H_amb_deg_list).mean()}")

############################
# MFS and RANDOM baselines #
############################
compute_mfs_and_random_baselines("data/homonyms/test.json", num_instances=17206)

Average fine-grained ambiguity degree: 6.054109031733116
Average homonyms ambiguity degree: 1.1806346623270951

-----
fine-grained

Total number of instances: 1800
Number of RANDOM correctly classified instances: 6309
Number of RANDOM incorrectly classified instances: 10897
RANDOM F1 Score (average=micro): 0.36667441590142974

Number of MFS correctly classified instances: 12828
Number of MFS incorrectly classified instances: 4378
MFS F1 Score (average=micro): 0.7455538765546902

-----
homonyms

Total number of instances: 1800
Number of RANDOM correctly classified instances: 15978
Number of RANDOM incorrectly classified instances: 1228
RANDOM F1 Score (average=micro): 0.9286295478321516

Number of MFS correctly classified instances: 16947
Number of MFS incorrectly classified instances: 259
MFS F1 Score (average=micro): 0.984947111472742


#### test_FGA

In [17]:
# average ambiguity degree
import json
import numpy as np
with open("data/homonyms/subsets/test_FGA.json", 'r') as file:
    data = json.load(file)
avg_amb_deg_list = []
for id_,value in data.items():
    if type(value["instance_ids"]) == dict:
        for candidates_list in value["wn_candidates"].values():
            avg_amb_deg_list.append(len(candidates_list))
    else:
        avg_amb_deg_list.append(len(value["wn_candidates"]))
print(f"Average fine-grained ambiguity degree: {np.asarray(avg_amb_deg_list).mean()}")

############################
# MFS and RANDOM baselines #
############################
compute_mfs_and_random_baselines("data/homonyms/subsets/test_FGA.json", num_instances=14172, compute_homonyms=False)

Average fine-grained ambiguity degree: 7.136113463166808

-----
fine-grained

Total number of instances: 1800
Number of RANDOM correctly classified instances: 3247
Number of RANDOM incorrectly classified instances: 10925
RANDOM F1 Score (average=micro): 0.2291137454134914

Number of MFS correctly classified instances: 9794
Number of MFS incorrectly classified instances: 4378
MFS F1 Score (average=micro): 0.6910810047981937


#### test_HA

In [18]:
# average ambiguity degree
import json
import numpy as np
with open("data/homonyms/subsets/test_HA.json", 'r') as file:
    data = json.load(file)
avg_amb_deg_list = []
for id_,value in data.items():
    if type(value["instance_ids"]) == dict:
        for candidates_list in value["candidate_clusters"].values():
            avg_amb_deg_list.append(len(candidates_list))
    else:
        avg_amb_deg_list.append(len(value["candidate_clusters"]))
print(f"Average homonyms ambiguity degree: {np.asarray(avg_amb_deg_list).mean()}")

############################
# MFS and RANDOM baselines #
############################
compute_mfs_and_random_baselines("data/homonyms/subsets/test_HA.json", num_instances=2265, compute_fine_grained=False)

Average homonyms ambiguity degree: 2.3721854304635763

-----
homonyms

Total number of instances: 1800
Number of RANDOM correctly classified instances: 1071
Number of RANDOM incorrectly classified instances: 1194
RANDOM F1 Score (average=micro): 0.4728476821192053

Number of MFS correctly classified instances: 2006
Number of MFS incorrectly classified instances: 259
MFS F1 Score (average=micro): 0.8856512141280353


#### test_HA_p

In [19]:
# average ambiguity degree
import json
import numpy as np
with open("data/homonyms/subsets/test_HA_p.json", 'r') as file:
    data = json.load(file)
avg_amb_deg_list = []
for id_,value in data.items():
    if type(value["instance_ids"]) == dict:
        for candidates_list in value["candidate_clusters"].values():
            avg_amb_deg_list.append(len(candidates_list))
    else:
        avg_amb_deg_list.append(len(value["candidate_clusters"]))
print(f"Average homonyms ambiguity degree: {np.asarray(avg_amb_deg_list).mean()}")

############################
# MFS and RANDOM baselines #
############################
compute_mfs_and_random_baselines("data/homonyms/subsets/test_HA_p.json", num_instances=609, compute_fine_grained=False)

Average homonyms ambiguity degree: 2.0623973727422005

-----
homonyms

Total number of instances: 571
Number of RANDOM correctly classified instances: 295
Number of RANDOM incorrectly classified instances: 314
RANDOM F1 Score (average=micro): 0.48440065681444994

Number of MFS correctly classified instances: 494
Number of MFS incorrectly classified instances: 115
MFS F1 Score (average=micro): 0.8111658456486043
