# **WSD** - paper

## Imports

In [1]:
# import stuffs
from data_module import WSD_DataModule
from hyperparameters import Hparams
from train import train_model
from model import WSD_Model
from evaluation import base_evaluation, fine2cluster_evaluation, cluster_filter_evaluation

import torch
import random
from tqdm import tqdm
import numpy as np
import json
import wandb
from dataclasses import asdict
import pytorch_lightning as pl

# to have a better workflow using python notebooks
%load_ext autoreload
%autoreload 2

# setting the seed
def set_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy, PyTorch (CPU and all CUDA devices) and
    PyTorch Lightning, and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed shared by all the libraries.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select a different convolution algorithm on each run,
    # so it is turned off together with enabling deterministic mode.
    torch.backends.cudnn.benchmark = False
    _ = pl.seed_everything(seed)
set_seed(99)

  warn(f"Failed to load image Python extension: {e}")
Global seed set to 99


## Look at the data 
[simple checks about data properties]

In [7]:
# TOTAL NUMBER OF SENSES for coarse-grained WSD
hparams = asdict(Hparams()) # instantiate the hyperparameters and convert them to a plain dict
# A context manager closes the file handle as soon as the JSON has been parsed.
with open("data/mapping/cluster2fine_map.json", "r") as mapping_file:
    d = json.load(mapping_file)
all_senses_list = list(d.keys()) # the keys of the map are the coarse-grained senses (clusters)
print(f"Length of sense inventory for coarse-grained WSD is {len(all_senses_list)}") # with the old data was 2158

Length of sense inventory for coarse-grained WSD is 106553


In [8]:
# Since we are dealing with neural networks we need to encode the sense inventory:
# build a two-way mapping between coarse-grained senses (clusters) and integer indices.

# The cluster inventory is reloaded here instead of reusing `all_senses_list`:
# a later cell rebinds that name to the fine-grained senses, so re-running this
# cell out of order would write fine-grained senses into the cluster files.
with open("data/mapping/cluster2fine_map.json", "r") as mapping_file:
    cluster_senses = list(json.load(mapping_file).keys())

# let's build sense2id and id2sense map for coarse-grained senses
sense2id = {sense: idx for idx, sense in enumerate(cluster_senses)}
id2sense = {idx: sense for idx, sense in enumerate(cluster_senses)}

# "<UNK>" gets the first index after the real senses.
unk_id = len(cluster_senses)
sense2id["<UNK>"] = unk_id
id2sense[unk_id] = "<UNK>"

# NOTE: JSON object keys are always strings, so the integer ids of `id2sense`
# come back as strings when the file is loaded again.
with open("data/mapping/cluster_sense2id.json", "w") as out_file:
    json.dump(sense2id, out_file)
with open("data/mapping/cluster_id2sense.json", "w") as out_file:
    json.dump(id2sense, out_file)

In [9]:
# let's build sense2id and id2sense map for fine-grained senses
with open("data/mapping/cluster2fine_map.json", "r") as mapping_file:
    d = json.load(mapping_file)

# Every cluster maps to a list of entries; the first element of each entry is
# collected as the fine-grained sense.
all_senses_list = [fine_s[0] for fine_senses in d.values() for fine_s in fine_senses]
print(f"Length of sense inventory for fine-grained WSD is {len(all_senses_list)}") # with the old data was 4476

# A fine sense can be present in multiple clusters, so duplicates must be removed.
# dict.fromkeys keeps the first-seen order, which makes the sense -> id assignment
# identical on every run. list(set(...)) orders strings differently on each kernel
# start, so a regenerated mapping would not match one produced by an earlier run.
all_senses_list = list(dict.fromkeys(all_senses_list))
print(f"Length of sense inventory for fine-grained WSD (with no duplicates) is {len(all_senses_list)}")

sense2id = {sense: idx for idx, sense in enumerate(all_senses_list)}
id2sense = {idx: sense for idx, sense in enumerate(all_senses_list)}

# "<UNK>" gets the first index after the real senses.
unk_id = len(all_senses_list)
sense2id["<UNK>"] = unk_id
id2sense[unk_id] = "<UNK>"

# NOTE: JSON object keys are always strings, so the integer ids of `id2sense`
# come back as strings when the file is loaded again.
with open("data/mapping/fine_sense2id.json", "w") as out_file:
    json.dump(sense2id, out_file)
with open("data/mapping/fine_id2sense.json", "w") as out_file:
    json.dump(id2sense, out_file)

Length of sense inventory for fine-grained WSD is 154440
Length of sense inventory for fine-grained WSD (with no duplicates) is 117659


In [None]:
# Some approaches developed later need a direct mapping from fine-grained
# to coarse-grained senses (the opposite mapping already exists).

# d = json.load(open("data/mapping/cluster2fine_map.json", "r"))
# fine2coarse = {}
# for k in d.keys():
#     for fine_s in d[k]:
#         fine2coarse[list(fine_s.keys())[0]] = k

# json.dump(fine2coarse, open("data/map/fine2coarse.json", "w"))

In [None]:
# let's see how many <UNK> tokens we generate without any particular type of preprocessing!
# data = WSD_DataModule(hparams)
# data.setup()

# tot_tokens = 0
# tot_unk = 0
# for batch in tqdm(data.train_dataloader()):
#     for input in batch["inputs"]["input_ids"]:
#         for e in input:
#             if e.item() == 0: # we reached <PAD> tokens
#                 break
#             tot_tokens+=1
#             if e.item() == 100: # is the <UNK> token
#                 tot_unk+=1
# print(f"We have a total of {tot_tokens} tokens")
# print(f"with {tot_unk} <UNK> tokens!")

**Playground**

In [2]:
# Percentage of training instances with exactly one cluster candidate
# (for those instances the model cannot make a wrong prediction).
hparams = asdict(Hparams())
data = WSD_DataModule(hparams)
data.setup()

tot = 0
c = 0
for batch in tqdm(data.train_dataloader()):
    # number of cluster candidates of every instance in the batch
    candidate_counts = [len(candidates) for candidates in batch["cluster_candidates"]]
    tot += len(candidate_counts)
    c += candidate_counts.count(1)

print(round((c/tot)*100, 2)) # 97.09% is high!

  0%|          | 0/7064 [00:00<?, ?it/s]Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_token

97.09





In [None]:
# to see the average length of fine candidates
hparams = asdict(Hparams())
data = WSD_DataModule(hparams)
data.setup()

tot, tot_length = 0, 0
for b in tqdm(data.train_dataloader()):
    for fine_candidates in b["fine_candidates"]:
        tot+=1
        tot_length+=len(fine_candidates)

# Keep two decimals: round() without ndigits returns an integer, which would
# print 7 instead of the expected 6.82.
print(round(tot_length/tot, 2)) # 6.82

### Preprocessing
[do not necessarily use it]

#### Clean tokens

Compared with the first homework, the *cleaning* operations are very basic and not "aggressive" (also thanks to the power of the *BERT Tokenizer*).

> 🔸 The function I implemented is "*clean_tokens*" from the *data_module.py* file. Of course, this function is applied to all the dataset splits (*train/val/test*).

#### Filter sentences

Another important step before finishing the preprocessing is to filter the *training* sentences. This must be done only at training time, because the test/val datasets must not be altered in this way. <br> Let's first look at the histogram of sentence lengths.

## Training

COARSE

In [None]:
# SECURITY: never paste the W&B API key into the notebook. wandb.login() reads it
# from the WANDB_API_KEY environment variable (or the stored credentials) and
# prompts interactively otherwise. The key that used to be written here is exposed
# in the file history and should be revoked.
wandb.login()

version_name = "coarse"
# NOTE(review): apart from the run name, nothing in this cell selects the
# coarse-grained setting; presumably it is configured inside Hparams — confirm
# before launching the run.
with wandb.init(entity="lavallone", project="homonyms", name=version_name, mode="online"):
    hparams = asdict(Hparams())

    data = WSD_DataModule(hparams)
    model = WSD_Model(hparams)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    
    # Monitor the validation loss (lower is better) for at most 100 epochs.
    train_model(data, model, experiment_name=version_name, metric_to_monitor="val_loss", mode="min", epochs=100, precision=hparams["precision"])

# The context manager should already close the run; kept as an explicit safety net.
wandb.finish()

FINE

In [None]:
# SECURITY: never paste the W&B API key into the notebook. wandb.login() reads it
# from the WANDB_API_KEY environment variable (or the stored credentials) and
# prompts interactively otherwise. The key that used to be written here is exposed
# in the file history and should be revoked.
wandb.login()

version_name = "fine"
# NOTE(review): apart from the run name, nothing in this cell selects the
# fine-grained setting; presumably it is configured inside Hparams — confirm
# before launching the run.
with wandb.init(entity="lavallone", project="homonyms", name=version_name, mode="online"):
    hparams = asdict(Hparams())

    data = WSD_DataModule(hparams)
    model = WSD_Model(hparams)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    
    # Monitor the validation loss (lower is better) for at most 100 epochs.
    train_model(data, model, experiment_name=version_name, metric_to_monitor="val_loss", mode="min", epochs=100, precision=hparams["precision"])

# The context manager should already close the run; kept as an explicit safety net.
wandb.finish()

### Hparams tuning
[if needed]

In [None]:
# def training_pipeline(config=None):
#     hparams_tuning = False
#     version_name = "BASELINE"
#     with wandb.init(entity="lavallone", project="NLP", name=version_name, mode="online", config=config):
#         seed = wandb.config.seed if hparams_tuning else 1999
#         set_seed(seed)
#         hparams = asdict(Hparams())
#         # when doing the hparams search, this is how each run we change them to search for the best combinations!
#         if hparams_tuning:
#             hparams["batch_size"] = wandb.config.batch_size
#             hparams["dropout"] = wandb.config.dropout
#             hparams["lr"] = wandb.config.lr
#             hparams["hidden_dim"] = wandb.config.hidden_dim

#         data = WSD_DataModule(hparams)
#         model = WSD_Model(hparams,
#                           json.load(open(hparams["prefix_path"]+"model/files/fine2coarse.json", "r")),
#                           json.load(open(hparams["prefix_path"]+"model/files/coarse_fine_defs_map.json", "r")),
#                           json.load(open(hparams["prefix_path"]+"model/files/fine_id2sense.json", "r")),
#                           json.load(open(hparams["prefix_path"]+"model/files/fine_sense2id.json", "r")),
#                           json.load(open(hparams["prefix_path"]+"model/files/coarse_sense2id.json", "r")),
#                           json.load(open(hparams["prefix_path"]+"model/files/coarse_id2sense.json", "r"))
#                           )
#         device = "cuda" if torch.cuda.is_available() else "cpu"
#         model.to(device)
        
#         train_model(data, model, experiment_name=version_name, patience=5, metric_to_monitor="val_loss", mode="min", epochs=100, precision=hparams["precision"])

In [None]:
# wandb.login() # the API key is read from the WANDB_API_KEY environment variable; never paste it into the notebook

# sweep_config = {'method': 'random',
#                 'metric': {'goal': 'maximize', 'name': 'val_micro_f1', 'target' : 0.89},
#                 'parameters': {
#                                 'batch_size': {'values': [64, 128, 256, 512]},
#                                 'dropout': {'distribution': 'uniform', 'min': 0.3, 'max': 0.5},
#                                 'lr': {'distribution': 'uniform', 'min': 1e-5, 'max': 1e-2},
#                                 'hidden_dim': {'distribution': 'int_uniform', 'min': 200, 'max': 600},
#                             }
#                }

# sweep_id = wandb.sweep(sweep=sweep_config, project="NLP", entity="lavallone")
# wandb.agent(sweep_id, function=training_pipeline, count=20)
# wandb.finish()

## Evaluation

COARSE

In [None]:
# Evaluate the coarse-grained checkpoint with the base evaluation routine.
best_coarse_ckpt = "checkpoints/coarse.ckpt"
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): `device` is forwarded as an extra keyword argument; Lightning's own
# placement argument is `map_location` — confirm that WSD_Model consumes `device`.
model = WSD_Model.load_from_checkpoint(
    best_coarse_ckpt,
    strict=False,
    device=device,
)
# The data module is rebuilt from the hyperparameters stored with the checkpoint.
data = WSD_DataModule(model.hparams)
data.setup()

base_evaluation(model, data)

FINE

In [None]:
# Evaluate the fine-grained checkpoint with the base evaluation routine.
best_fine_ckpt = "checkpoints/fine.ckpt"
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): `device` is forwarded as an extra keyword argument; Lightning's own
# placement argument is `map_location` — confirm that WSD_Model consumes `device`.
model = WSD_Model.load_from_checkpoint(
    best_fine_ckpt,
    strict=False,
    device=device,
)
# The data module is rebuilt from the hyperparameters stored with the checkpoint.
data = WSD_DataModule(model.hparams)
data.setup()

base_evaluation(model, data)

FINE2CLUSTER

In [None]:
# Evaluate on homonym clusters using the fine-grained model.
best_fine_ckpt = "checkpoints/fine.ckpt"
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): `device` is forwarded as an extra keyword argument; Lightning's own
# placement argument is `map_location` — confirm that WSD_Model consumes `device`.
model = WSD_Model.load_from_checkpoint(
    best_fine_ckpt,
    strict=False,
    device=device,
)
# The data module is rebuilt from the hyperparameters stored with the checkpoint.
data = WSD_DataModule(model.hparams)
data.setup()

fine2cluster_evaluation(model, data)

CLUSTER FILTERING

In [None]:
# Evaluate on fine senses, using the coarse model to filter out candidates.
best_coarse_ckpt = "checkpoints/coarse.ckpt"
best_fine_ckpt = "checkpoints/fine.ckpt"
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): `device` is forwarded as an extra keyword argument; Lightning's own
# placement argument is `map_location` — confirm that WSD_Model consumes `device`.
coarse_model = WSD_Model.load_from_checkpoint(
    best_coarse_ckpt,
    strict=False,
    device=device,
)
fine_model = WSD_Model.load_from_checkpoint(
    best_fine_ckpt,
    strict=False,
    device=device,
)
# The data module is built from the hyperparameters of the coarse checkpoint.
data = WSD_DataModule(coarse_model.hparams)
data.setup()

cluster_filter_evaluation(coarse_model, fine_model, data, oracle_or_not=False)