# 1 - Concept Similarity with WordNet

In [80]:
import pandas as pd
from nltk.corpus import wordnet as wn

### Implementazione di 3 misure di similarità basate su WordNet:  
* Wu-Palmer
* Shortest Path
* Leacock-Chodorow

In [100]:
# Depth of a synset
def depth(s):
    if s is None:
        return -1
    hypernyms = s.hypernyms()
    if len(hypernyms) == 0:
        return 0
    else:
        return 1 + max([depth(h) for h in hypernyms])

# Lowest Common Subsumer
def get_lcs(s1, s2):
    s1_hypernyms = set(s1.hypernyms())
    # print(s1, s1_hypernyms)
    s2_hypernyms = set(s2.hypernyms())
    # print(s2, s2_hypernyms)
    if len(s1_hypernyms) == 0 or len(s2_hypernyms) == 0:
        return None
    lcs_candidates = s1_hypernyms.intersection(s2_hypernyms)
    # print(lcs_candidates)
    if len(lcs_candidates) > 0:
        return max(lcs_candidates, key=depth)
    else:
        lcs_candidates = [get_lcs(h1, h2) for h1 in s1_hypernyms for h2 in s2_hypernyms]
        lcs_candidates = [x for x in lcs_candidates if x is not None]
        # print(lcs_candidates)
        if len(lcs_candidates) > 0:
            return max(lcs_candidates, key=depth)
        else:
            return None

# Wu and Palmer similarity
def wup_sim(s1, s2):
    lcs = get_lcs(s1, s2)
    if lcs is None:
        return 0
    return 2 * depth(lcs) / (depth(s1) + depth(s2))

# Get max similarity between all synsets of two words
def get_max_similarity(s1, s2):
    s1_synsets = wn.synsets(s1)
    s2_synsets = wn.synsets(s2)
    max_sim = 0
    # best_s1 = None
    # best_s2 = None
    for s1_synset in s1_synsets:
        for s2_synset in s2_synsets:
            sim = wup_sim(s1_synset, s2_synset)
            if sim > max_sim:
                max_sim = sim
                # best_s1 = s1_synset
                # best_s2 = s2_synset
    return max_sim


# get lcs for jaguar and cat
# print(get_lcs(wn.synset('jaguar.n.01'), wn.synset('cat.n.01')))
print(get_max_similarity('computer', 'keyboard'))




0


Dato che l'input in WordSim353 è una coppia di termini, mentre le tre formule utilizzano sensi, per calcolare la similarity fra i 2 termini prendiamo la massima similarity fra tutti i sensi del primo termine e tutti i sensi del secondo termine.

In [63]:
# Load the data
df = pd.read_csv('/Users/jak/Documents/Uni/TLN/TLN /Radicioni/data/WordSim353.csv')

In [90]:
# Prova la get_max_similarity su tutte le coppie di parole
df['max_similarity'] = df.apply(lambda row: get_max_similarity(row['Word 1'], row['Word 2']), axis=1)

# get max similarity between all synsets of two words using NLTK wup_similarity
def get_max_sim2(s1, s2):
    s1_synsets = wn.synsets(s1)
    s2_synsets = wn.synsets(s2)
    max_sim = 0
    for s1_synset in s1_synsets:
        for s2_synset in s2_synsets:
            sim = s1_synset.wup_similarity(s2_synset)
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim

# Prova la get_max_sim2 su tutte le coppie di parole
df['max_similarity2'] = df.apply(lambda row: get_max_sim2(row['Word 1'], row['Word 2']), axis=1)

# Show the results
df


Unnamed: 0,Word 1,Word 2,Human (mean),max_similarity,max_similarity2
0,love,sex,6.77,0.333333,0.923077
1,tiger,cat,7.35,0.142857,0.965517
2,tiger,tiger,10.0,0.928571,1.0
3,book,paper,7.46,0.857143,0.875
4,computer,keyboard,7.62,0.0,0.823529
5,computer,internet,7.58,0.0,0.631579
6,plane,car,5.77,0.636364,0.727273
7,train,car,6.31,0.588235,0.736842
8,telephone,communication,7.5,0.0,0.2
9,television,radio,6.77,0.9,0.909091


In [9]:
# Get max similarity for each word pair from all synsets
def get_max_similarity(word1, word2):
    max_similarity = 0
    for synset1 in wn.synsets(word1):
        for synset2 in wn.synsets(word2):
            similarity = synset1.path_similarity(synset2)
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity
    return max_similarity