#### 0.0 Load libreries

In [1]:
import nltk 
import re
import math
import random
import csv
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
from scipy.stats import spearmanr, pearsonr
from nltk.corpus import semcor



print("Libreries imported successfully ✓")

Libreries imported successfully ✓


#### 0.1 definition of functions for preprocessing

In [2]:
def remove_punctuation(sentence):
    return re.sub(r'[^\w\s]', '', str(sentence)).strip()

def get_stopwords():
    stopwords_file = open(f"./resources/utils/stop_words__frakes_baeza-yates.txt", "r")
    stopwords_list = []
    for word in stopwords_file:
        stopwords_list.append(word.replace('\n', ''))
    stopwords_file.close()

    stopwords_file = open(f"./resources/utils/stop_words_1.txt", "r")
    stopwords_list = []
    for word in stopwords_file:
        stopwords_list.append(word.replace('\n', ''))
    stopwords_file.close()

    stopwords_file = open(f"./resources/utils/stop_words_FULL.txt", "r")
    stopwords_list = []
    for word in stopwords_file:
        stopwords_list.append(word.replace('\n', ''))
    stopwords_file.close()
    
    stopwords_list = list(set(stopwords_list))

    return stopwords_list
     
def remove_stopwords(sentence: str, stopwords_list: list):
    return [word for word in sentence.split() if word not in stopwords_list]



### Esercitazione su Word Sense Disambiguation: 

#### INPUT : 
l’input per questa esercitazione è costituito da coppie di termini contenute nel file WordSim353 (disponibile nei
formati .tsv e .csv)
- Il file contiene 353 coppie di termini utilizzati come testset in varie competizioni internazionali
- A ciascuna coppia è attribuito un valore numerico [0,10], che rappresenta la similarità fra gli elementi della coppia.

In [3]:
sim_file = f"resources/WordSim353/WordSim353.csv"

pairs = []
with open(sim_file, 'r', encoding = "utf-8") as f:
    reader = csv.reader(f)
    reader.__next__()
    for lines in reader:
        pairs.append(lines)

print("WordSim353 loaded correctly ✓")


WordSim353 loaded correctly ✓


#### Prima parte

Implementare tre misure di similarità basate su WordNet. Per ciascuna delle misure di similarità, calcolare:
- gli indici di correlazione di Spearman e
- gli indici di correlazione di Pearson fra i risultati ottenuti e quelli ‘target’ presenti nel file annotato.

Le misure da implementare sono le seguenti: 
- Wu & Palmer: 
$$cs(s_1, s_2) = \frac{2 \cdot depth(LCS)}{depth(s_1) + depth(s_2)}$$

in cui $LCS$ rappresenta il primo antenato comune (Lowest Common Subsumer) e depth è la funzione che misura la distanza fra la radice di WordNet e il sysnet $x$. 

***L’obiettivo è implementare la misura di similarità di Wu & Palmer! Non dobbiamo fare ciò che viene già fatto nell’implementazione di nltk o altre librerie!***



In [4]:
def get_LCS(synset1: Synset, synset2: Synset) -> Synset:
    """
    Return the lowest common subsumer (LCS) of two synsets.
    
    The LCS is the lowest synset (the one with the maximum depth) that has both synsets as descendants.
    """
    
    lcs = None
    hypernyms1 = []
    hypernyms2 = []

    hypernyms1.append(synset1) 
    hypernyms2.append(synset2) 

    # for each synset extraction of hypernyms to add to the list
    for synset in hypernyms1:
        if synset_hyp := synset.hypernyms():
            hypernyms1.extend(synset_hyp)

    for synset in hypernyms2:
        if synset_hyp := synset.hypernyms():
            hypernyms2.extend(synset_hyp)

    if hypernyms1 and hypernyms2:
        if common_hypernyms := set(hypernyms1).intersection(set(hypernyms2)):
            common_hypernyms = [(syn, syn.max_depth() + 1) for syn in common_hypernyms] 
            common_hypernyms.sort(key=lambda x: x[1], reverse=True)
            lcs = common_hypernyms[0][0] 

    return lcs

def depth(synset: Synset) -> int:
    """
    Return the maximum length of the hypernym paths for a synset.
    """
    depth = 0
    if synset:
        depth = max([len(path) for path in synset.hypernym_paths()])

    return depth

In [5]:
def wu_palmer_similarity(syn1: Synset, syn2: Synset):
    """
    Return the Wu-Palmer similarity between synset.
    Interval (0, 1]
    """
    
    if isinstance(syn1, Synset) and isinstance(syn2, Synset):

        lcs_depth = 0
        
        if lcs := get_LCS(syn1, syn2):
            lcs_depth = lcs.max_depth() + 1
            
        depth1 = depth(syn1) 
        depth2 = depth(syn2) 
        if depth1 > 0 and depth2 > 0:
            cs = (2 * (lcs_depth)) / (depth1 + depth2)

        return cs
    else:
        raise TypeError("The input parameters must be Synset objects")

In [6]:
# TEST
print("wu similarity between boat and computer: ", wu_palmer_similarity(wn.synsets("boat")[0], wn.synsets("computer")[0]))
print("wu similarity between ferry and boat: ", wu_palmer_similarity(wn.synsets("ferry")[0], wn.synsets("boat")[0]))
print("wu similarity between boat and ship: ", wu_palmer_similarity(wn.synsets("boat")[0], wn.synsets("ship")[0]))
print("wu similarity between computer and chip: ", wu_palmer_similarity(wn.synsets("computer")[0], wn.synsets("chip")[0]))
print("wu similarity between computer and microchip: ", wu_palmer_similarity(wn.synsets("computer")[0], wn.synsets("microchip")[0]))


wu similarity between boat and computer:  0.6
wu similarity between ferry and boat:  0.9565217391304348
wu similarity between boat and ship:  0.9090909090909091
wu similarity between computer and chip:  0.26666666666666666
wu similarity between computer and microchip:  0.7368421052631579


- Shortest Path:
$$sim_{path}(s_1, s_2) = 2 \cdot depthMax - len(s_1, s_2)$$

in cui $depthMax$ è un valore fissato per una specifica versione di WordNet. 

In [7]:
depth_max = max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())

def path_len(synset1: Synset, synset2: Synset):
    """
    Return the length of the shortest path between two synsets.
    
    The path length is defined as the number of edges in the shortest path between the two synsets.
    
    If no path exists between the two synsets, return 0.
    """

    distance = 0
    lcs = get_LCS(synset1, synset2)
    if lcs:
        distance = (depth(synset1) - depth(lcs)) + (depth(synset2) - depth(lcs))

    return distance
    

def shortest_path(synset1: Synset, synset2: Synset):
    """
    Return the shortest path measure between two synsets.
    """
    shortest_path = 0
    if (len := path_len(synset1, synset2)) >= 0:
        shortest_path = 2 * depth_max - len
    
    return shortest_path


In [8]:
# TEST
shortest_path_value = shortest_path(wn.synsets("dog")[0], wn.synsets("computer")[0])
print(shortest_path_value)

25


- Leakcock & Chodorow: 
$$sim_{LC}(s_1, s_2) = - log \frac{len(s_1, s_2)}{2 \cdot depthMax}$$

In [9]:
def leakcock_chodorow(synset1: Synset, synset2: Synset):
    lc = 0
    if (len := path_len(synset1, synset2)) > 0:
        fract = len / (2 * depth_max) 
        lc = - math.log(fract)
    return lc

In [10]:
# TEST
print("Similarity: ", leakcock_chodorow(wn.synsets("car")[0], wn.synsets("boat")[0]))

Similarity:  1.742969305058623


#### Calcolo della similarità

Le funzioni precedentemente implementate richiedono come input sensi e non i termini. Quindi per calcolare la similarità fra 2 termini è necessario prendere la massima similarità fra tutti i sensi del primo termine e tutti i sensi del secondo termine. 

L'ipotesi quindi è che i due termini funzionino come contesto di disambiguazione l'uno per l'altro. 

L'equazione che formalizza questa idea è la seguente: 

$$sim(w_1, w_2) = \max_{c_1 \in s(w_1), c_2 \in s(w_2)} [sim(c_1, c_2)]$$

***NB!*** Per calcolare gli indici di correlazione non si è interessati a entrare nel merito di come sono calcolati, possiamo prendere delle funzioni prefatte e usarle.

In [11]:
def get_similarity() -> tuple[list[float], list[float], list[float], list[float]]:
    wordsimilarities = []
    wu_palmer_similarities = []
    shortpath_similarities = []
    leakcock_chodorow_similarities = []

    for row in pairs:
        print("-------------------------------")
        print(row)
        term1 = row[0]
        term2 = row[1]

        synsets1 = wn.synsets(term1)
        synsets2 = wn.synsets(term2)

        print("\nsynsets1: ", synsets1)
        print("synsets2: ", synsets2)
        
        if (synsets1 != [] and synsets2 != []):

            wu_palmer_max = 0
            short_path_max = 0
            leakcock_chodorow_max = 0

            for s1 in synsets1:
                for s2 in synsets2:

                    # Metrica 1
                    wu_palmer = wu_palmer_similarity(s1, s2)
                    if wu_palmer > wu_palmer_max:
                        wu_palmer_max = wu_palmer

                    #Metrica 2
                    shortpath = shortest_path(s1, s2)
                    if shortpath > short_path_max:
                        short_path_max = shortpath

                    # Metrica 3
                    leacchod = leakcock_chodorow(s1, s2)
                    if leacchod > leakcock_chodorow_max:
                        leakcock_chodorow_max = leacchod
            
            print("Wu palmer: ", wu_palmer_max)
            print("Shortest path: ", short_path_max)
            print("Leakcock & Chodorow: ", leakcock_chodorow_max)
            wu_palmer_similarities.append(float(wu_palmer_max))
            shortpath_similarities.append(float(short_path_max))
            leakcock_chodorow_similarities.append(float(leakcock_chodorow_max))
            
            wordsimilarities.append(float(row[2]))

    return wu_palmer_similarities, shortpath_similarities, leakcock_chodorow_similarities, wordsimilarities

In [12]:
wupalmerres, pathres, lcres, wordsimres = get_similarity()

-------------------------------
['love', 'sex', '6.77']

synsets1:  [Synset('love.n.01'), Synset('love.n.02'), Synset('beloved.n.01'), Synset('love.n.04'), Synset('love.n.05'), Synset('sexual_love.n.02'), Synset('love.v.01'), Synset('love.v.02'), Synset('love.v.03'), Synset('sleep_together.v.01')]
synsets2:  [Synset('sexual_activity.n.01'), Synset('sex.n.02'), Synset('sex.n.03'), Synset('sex.n.04'), Synset('arouse.v.07'), Synset('sex.v.02')]
Wu palmer:  0.9230769230769231
Shortest path:  40
Leakcock & Chodorow:  3.6888794541139363
-------------------------------
['tiger', 'cat', '7.35']

synsets1:  [Synset('tiger.n.01'), Synset('tiger.n.02')]
synsets2:  [Synset('cat.n.01'), Synset('guy.n.01'), Synset('cat.n.03'), Synset('kat.n.01'), Synset('cat-o'-nine-tails.n.01'), Synset('caterpillar.n.02'), Synset('big_cat.n.01'), Synset('computerized_tomography.n.01'), Synset('cat.v.01'), Synset('vomit.v.01')]
Wu palmer:  0.9655172413793104
Shortest path:  40
Leakcock & Chodorow:  3.688879454113936

In [13]:
#Indici di correlazione

print("len(wordsimres): ", len(wordsimres))
print("len(wupalmerres): ", len(wupalmerres))
print("len(pathres): ", len(pathres))
print("len(lcres): ", len(lcres))

#Pearson
print("####################################################################")
print("Pearson ")
print(pearsonr(wordsimres, wupalmerres))
print(pearsonr(wordsimres, pathres))
print(pearsonr(wordsimres, lcres))

#Spearman
print("Spearman ")
print(spearmanr(wordsimres, wupalmerres))
print(spearmanr(wordsimres, pathres))
print(spearmanr(wordsimres, lcres))

len(wordsimres):  352
len(wupalmerres):  352
len(pathres):  352
len(lcres):  352
####################################################################
Pearson 
PearsonRResult(statistic=0.26572842333653923, pvalue=4.214296990969587e-07)
PearsonRResult(statistic=0.05102892624806722, pvalue=0.339777896217168)
PearsonRResult(statistic=0.21697565961231546, pvalue=4.0369391220383806e-05)
Spearman 
SpearmanrResult(correlation=0.324400489230381, pvalue=4.5457429629669424e-10)
SpearmanrResult(correlation=-0.011465814304130732, pvalue=0.8302666223953653)
SpearmanrResult(correlation=0.21800782093534113, pvalue=3.7014431035413466e-05)


### Seconda parte

Implementare l’algoritmo di Lesk (NON (!=) usare implementazione esistente, e.g., in nltk…).
1. Estrarre 50 frasi dal corpus SemCor (corpus annotato con i synset di
WN) e disambiguare (almeno) un sostantivo per frase. 
Calcolare l’accuratezza del sistema implementato sulla base dei sensi annotati in
SemCor.
    - SemCor è disponibile all’URL http://web.eecs.umich.edu/~mihalcea/downloads.html
2. Randomizzare la selezione delle 50 frasi e la selezione del termine da disambiguare, 
e restituire l’accuratezza media su (per esempio) 10 esecuzioni del programma.

In [14]:
stop_words = get_stopwords()

def get_set_of_words(phrase: str) -> list:
    """
    Returns the set of words of a phrase.
    """
    phrase = remove_punctuation(phrase)
    set_of_words = remove_stopwords(phrase, stop_words)

    return set_of_words

def get_context_of_phrase(sentence: str) -> list:
    """
    Returns the context of a phrase.
    """
    sentence = remove_punctuation(sentence)
    set_of_words = remove_stopwords(sentence, stop_words)

    return set_of_words

def compute_overlap(signature: list, context: list) -> int:
    """
    Returns the number of words in common between signature and context
    """

    number_of_words_in_common = len(list(set(signature) & set(context)))

    return number_of_words_in_common

def get_signature(synset: Synset) -> list:
    """
    Returns the signature of synset, i.e. the set of words that are contained in the gloss and examples of synset.
    """

    gloss_of_synset = synset.definition() #gloss of synset
    examples_of_synset = synset.examples() #examples of synset
    initial_signature = [gloss_of_synset] + examples_of_synset

    signature = []
    for phrase in initial_signature:
        set_of_words = get_set_of_words(phrase)
        signature.extend(set_of_words)

    return list(set(signature)) # remove duplicates

In [15]:
def lesk(word: str, sentence: str) -> Synset:
    """
    Returns the best sense of given word used in sentence.
    """ 

    max_overlap = 0
    context = get_set_of_words(sentence)
    synsets = wn.synsets(word)
    print(wn.synsets(word))
    best_sense = None

    if synsets != []:
        best_sense = synsets[0]

    for synset in synsets:
        signature = get_signature(synset) 
        overlap = compute_overlap(signature, context)

        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = synset

    return best_sense

In [16]:
# Test
best_sense = lesk("bank", "I went to the bank to deposit money.")

best_sense_nltk = nltk.wsd.lesk("I went to the bank to deposit money.".split(), "bank")

print("best sense:")
print(best_sense) 

print("best sense nltk:", best_sense_nltk)

[Synset('bank.n.01'), Synset('depository_financial_institution.n.01'), Synset('bank.n.03'), Synset('bank.n.04'), Synset('bank.n.05'), Synset('bank.n.06'), Synset('bank.n.07'), Synset('savings_bank.n.02'), Synset('bank.n.09'), Synset('bank.n.10'), Synset('bank.v.01'), Synset('bank.v.02'), Synset('bank.v.03'), Synset('bank.v.04'), Synset('bank.v.05'), Synset('deposit.v.02'), Synset('bank.v.07'), Synset('trust.v.01')]
best sense:
Synset('depository_financial_institution.n.01')
best sense nltk: Synset('bank.v.07')


#### 1. Extraction first 50 phrases and disambiguation of (at least) one name

In [17]:
corpus = semcor.sents()

def lemma_list(sent):
    return [l.label() for l in sent if isinstance(l, nltk.tree.Tree)]

def disambiguation(sentences: list[str], sentences_tagged: list) -> None:
    lesk_correct = 0 
    total_names = 0 

    for sentence, sentence_tagged in zip(sentences, sentences_tagged):
        
        lemmas = lemma_list(sentence_tagged)

        lemmas_length = len(lemmas)
        names_disambiguated = 0 # count to stop the search of disambiguation
        
        j = 0
        while (j < lemmas_length and names_disambiguated < 3):
            string = str(lemmas[j])
            string = string.replace("Lemma('","")
            string = string.replace("')", "") # jury.n.01.jury
            synset_string = string.rsplit('.', 1)[0] # jury.n.01
            
            parts = string.split('.')   # jury.n.01.jury --> ['jury', 'n', '01', 'jury']
            
            # Lemma('jury.n.01.jury') --> "synset".paroladeltesto
            # parts[0] = jury
            # parts[1] = n (può essere 'n', 'r', 'v', ...)
            # parts[2] = 01
            # parts[3] = jury # lemma della parola del testo che ha "taggato"

            if len(parts) == 4 and parts[1] == 'n': #and parts[3] in sentences[i]):
                total_names += 1
                names_disambiguated += 1
                lesk_synset = lesk(parts[3],sentence) 
                print("Phrase: ", sentence)
                print("Word: "+parts[3])
                print("Lesk algorithm: " + str(lesk_synset))
                print("True Synset: " + synset_string)

                
                if lesk_synset:
                    lesk_name_syn = str(lesk_synset)
                    lesk_name_syn = lesk_name_syn.replace("Synset('", "")
                    lesk_name_syn = lesk_name_syn.replace("')","")

                    if (synset_string == lesk_name_syn): # confront synset true with lesk one 
                        lesk_correct = lesk_correct + 1

            j += 1
        lemmas.clear()
        
    print("Numero di volte che lesk ha trovato la giusta definizione: " + str(lesk_correct))
    print("Numero di nomi totali da disambiguare: " + str(total_names))
        
    return lesk_correct / total_names


In [18]:
def first_50_phrases() -> list[list[str], list]:
    sentences = corpus[0:50]
    sentences_tagged = semcor.tagged_sents(tag="sem")[0:50]

    return sentences, sentences_tagged

sentences, sentences_tagged = first_50_phrases()
disambiguation(sentences, sentences_tagged)

[Synset('group.n.01'), Synset('group.n.02'), Synset('group.n.03'), Synset('group.v.01'), Synset('group.v.02')]
Phrase:  ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
Word: group
Lesk algorithm: Synset('group.n.01')
True Synset: group.n.01
[Synset('friday.n.01')]
Phrase:  ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
Word: Friday
Lesk algorithm: Synset('friday.n.01')
True Synset: friday.n.01
[Synset('probe.n.01'), Synset('investigation.n.02')]
Phrase:  ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``'

0.673469387755102

#### 2. Random extraction of 50 phrases from corpus

In [19]:
def generate_random_numbers_in_range(low: int, high: int, n: int) -> list[int]:
    return random.sample(range(low, high), k=n)

def random_50_phrases() -> tuple[list[str], list]:
    random_indexes = generate_random_numbers_in_range(0, 5000, 50)
    sentences = []
    sentences_tagged = []
    corpus_tagged = semcor.tagged_sents(tag="sem")[:5000]
    for i in random_indexes: 
        sentences.append(corpus[i])
        sentences_tagged.append(corpus_tagged[i])

    return sentences, sentences_tagged

In [20]:
sentences, sentences_tagged = random_50_phrases()

disambiguation(sentences, sentences_tagged)

[Synset('meaning.n.01'), Synset('meaning.n.02'), Synset('mean.v.01'), Synset('entail.v.01'), Synset('mean.v.03'), Synset('intend.v.01'), Synset('mean.v.05'), Synset('think_of.v.04'), Synset('mean.v.07'), Synset('meaning.s.01')]
Phrase:  ['The', 'meaning', 'of', 'the', 'word', 'is', 'quite', 'physical', ',', 'to', 'begin', 'with', '.']
Word: meaning
Lesk algorithm: Synset('meaning.n.01')
True Synset: meaning.n.02
[Synset('word.n.01'), Synset('word.n.02'), Synset('news.n.01'), Synset('word.n.04'), Synset('discussion.n.02'), Synset('parole.n.01'), Synset('word.n.07'), Synset('son.n.02'), Synset('password.n.01'), Synset('bible.n.01'), Synset('give_voice.v.01')]
Phrase:  ['The', 'meaning', 'of', 'the', 'word', 'is', 'quite', 'physical', ',', 'to', 'begin', 'with', '.']
Word: word
Lesk algorithm: Synset('word.n.02')
True Synset: word.n.01
[Synset('religion.n.01'), Synset('religion.n.02')]
Phrase:  ['Religion', 'fosters', 'group', 'life', 'in', 'various', 'ways', '.']
Word: religion
Lesk algo

0.5925925925925926

In [21]:
def execute_multiple_times() -> int:
    n_executions = 10

    result_score = 0

    for i in range(0, n_executions): 
        sentences, sentences_tagged = random_50_phrases()
        score = disambiguation(sentences, sentences_tagged)

        result_score += score

    print("Final Accuracy: " + str(result_score / n_executions))

execute_multiple_times()

[Synset('tug.n.01'), Synset('tugboat.n.01'), Synset('tug.v.01'), Synset('tug.v.02'), Synset('tug.v.03'), Synset('lug.v.01'), Synset('tug.v.05'), Synset('tug.v.06'), Synset('tug.v.07')]
Phrase:  ['As', 'soon', 'as', 'an', 'experimental', 'tug', 'assures', 'you', 'that', 'roots', 'have', 'taken', 'over', ',', 'cut', 'it', 'off', 'from', 'the', 'mother', 'plant', '.']
Word: tug
Lesk algorithm: Synset('tug.v.03')
True Synset: tug.n.01
[Synset('root.n.01'), Synset('beginning.n.04'), Synset('root.n.03'), Synset('root.n.04'), Synset('solution.n.04'), Synset('ancestor.n.01'), Synset('etymon.n.01'), Synset('root.n.08'), Synset('root.v.01'), Synset('root.v.02'), Synset('root.v.03'), Synset('rout.v.02'), Synset('settle.v.07'), Synset('root.v.06')]
Phrase:  ['As', 'soon', 'as', 'an', 'experimental', 'tug', 'assures', 'you', 'that', 'roots', 'have', 'taken', 'over', ',', 'cut', 'it', 'off', 'from', 'the', 'mother', 'plant', '.']
Word: root
Lesk algorithm: Synset('root.v.01')
True Synset: root.n.01
