# **Concept Similarity with WordNet**

In [152]:
import os

import pandas as pd
from nltk.corpus import wordnet as wn
from scipy import stats

## Wu & Palmer Similarity

In [153]:
def depth(s):
    """Return the number of hypernym steps from synset ``s`` up to a root.

    When ``s`` has several hypernyms the deepest one is followed, so the
    result is the length of the longest chain of ``hypernyms()`` links.

    Args:
        s: an object exposing ``hypernyms()`` (a WordNet synset), or ``None``.

    Returns:
        ``-1`` for ``None``, ``0`` for a synset without hypernyms, otherwise
        one more than the largest depth among its hypernyms.
    """
    if s is None:
        return -1
    parents = s.hypernyms()
    if not parents:
        return 0
    # The extra 1 accounts for the edge between s and its deepest parent.
    return 1 + max(depth(parent) for parent in parents)

def all_hypernyms(s):
    """Return ``s`` followed by every synset reachable through ``hypernyms()``.

    The list is in depth-first, pre-order sequence. A synset reachable along
    several hypernym paths appears once per path (no de-duplication).

    Args:
        s: an object exposing ``hypernyms()`` (a WordNet synset).

    Returns:
        A list starting with ``s`` itself.
    """
    ordered = []
    pending = [s]
    while pending:
        current = pending.pop()
        ordered.append(current)
        # Push parents reversed so the first parent is expanded first,
        # which reproduces the order of a recursive pre-order walk.
        pending.extend(reversed(current.hypernyms()))
    return ordered

def lcs(s1, s2):
    """Return the Lowest Common Subsumer of two synsets.

    The LCS is the deepest synset (according to ``depth``) that appears among
    the hypernyms of both ``s1`` and ``s2``; each synset counts as its own
    hypernym, so ``lcs(s, s)`` is ``s``.

    Args:
        s1: first synset.
        s2: second synset.

    Returns:
        The deepest shared ancestor, or ``None`` when the two synsets have no
        ancestor in common.
    """
    ancestors_of_s2 = set(all_hypernyms(s2))
    # Keep s1's traversal order (de-duplicated) instead of iterating a set:
    # when several shared ancestors are equally deep, max() returns the first
    # one it sees, so a stable order gives a stable answer on every run.
    shared = list(dict.fromkeys(
        h for h in all_hypernyms(s1) if h in ancestors_of_s2
    ))
    if not shared:
        return None
    return max(shared, key=depth)

def wup_sim(s1, s2):
    """Wu & Palmer similarity between two synsets.

    Computed as ``2 * depth(lcs) / (depth(s1) + depth(s2))``, where ``lcs`` is
    the lowest common subsumer of the two synsets.

    Args:
        s1: first synset.
        s2: second synset.

    Returns:
        ``0`` when the synsets share no ancestor, ``1.0`` when both are the
        same root synset, otherwise the ratio above.
    """
    lcs_syn = lcs(s1, s2)
    if lcs_syn is None:
        return 0
    total_depth = depth(s1) + depth(s2)
    if total_depth == 0:
        # Both synsets have depth 0 yet share an ancestor, so they are the
        # same root; the ratio would be 0/0, and identical synsets score 1.
        return 1.0
    return 2 * depth(lcs_syn) / total_depth

## Shortest Path Similarity

In [154]:
# Find max depth of all synsets in wordnet
def _compute_max_depth():
    """Return the largest ``depth`` over every synset in WordNet.

    Returns 0 if WordNet yields no synsets.
    """
    return max((depth(synset) for synset in wn.all_synsets()), default=0)

# The helper has its own name so that re-running this cell does not try to
# call the integer stored in ``max_depth``.
max_depth = _compute_max_depth()

In [155]:
def shortest_path(s1, s2):
    """Length of the path between two synsets through their lowest common subsumer.

    Computed as ``depth(s1) + depth(s2) - 2 * depth(lcs)``.

    Args:
        s1: first synset.
        s2: second synset.

    Returns:
        The number of hypernym edges on the path via the LCS, or
        ``2 * max_depth`` when the synsets share no ancestor.
    """
    lcs_syn = lcs(s1, s2)
    if lcs_syn is None:
        # No connecting path: use the upper bound on path length so that
        # unconnected synsets are never closer than connected ones
        # (2 * max_depth - distance then evaluates to 0).
        return 2 * max_depth
    return depth(s1) + depth(s2) - 2 * depth(lcs_syn)

def short_path_sim(s1, s2):
    """Shortest-path similarity: ``2 * max_depth - shortest_path(s1, s2)``.

    Larger values mean the synsets are closer; a synset compared with itself
    (path length 0) scores ``2 * max_depth``.
    """
    upper_bound = 2 * max_depth
    distance = shortest_path(s1, s2)
    return upper_bound - distance

# Sanity checks: compare a synset with itself
cat = wn.synset('cat.n.01')
print(lcs(cat, cat))
print(all_hypernyms(cat))
print(all_hypernyms(cat))
print(shortest_path(cat, cat))
print(short_path_sim(cat, cat))

Synset('cat.n.01')
[Synset('cat.n.01'), Synset('feline.n.01'), Synset('carnivore.n.01'), Synset('placental.n.01'), Synset('mammal.n.01'), Synset('vertebrate.n.01'), Synset('chordate.n.01'), Synset('animal.n.01'), Synset('organism.n.01'), Synset('living_thing.n.01'), Synset('whole.n.02'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('entity.n.01')]
[Synset('cat.n.01'), Synset('feline.n.01'), Synset('carnivore.n.01'), Synset('placental.n.01'), Synset('mammal.n.01'), Synset('vertebrate.n.01'), Synset('chordate.n.01'), Synset('animal.n.01'), Synset('organism.n.01'), Synset('living_thing.n.01'), Synset('whole.n.02'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('entity.n.01')]
0
38


Dato che l'input in WordSim353 è una coppia di termini, mentre le tre formule utilizzano sensi, per calcolare la similarity fra i 2 termini prendiamo la massima similarity fra tutti i sensi del primo termine e tutti i sensi del secondo termine.

In [184]:
def get_max_similarity(s1, s2, sim_func):
    """Return the highest similarity over all sense pairs of two words.

    Every synset of the first word is compared with every synset of the
    second word using ``sim_func`` and the best score is kept.

    Args:
        s1: first word (looked up with ``wn.synsets``).
        s2: second word (looked up with ``wn.synsets``).
        sim_func: callable taking two synsets and returning a number, or
            ``None`` when the similarity is undefined for that pair.

    Returns:
        The maximum score found, or ``0`` when a word has no synsets or no
        pair scores above 0.
    """
    max_sim = 0
    s2_synsets = wn.synsets(s2)
    for s1_syn in wn.synsets(s1):
        for s2_syn in s2_synsets:
            sim = sim_func(s1_syn, s2_syn)
            # Skip undefined scores: some measures (e.g. Synset.path_similarity)
            # return None, and comparing None with a number raises TypeError.
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim


# Spot-check one word pair with each similarity measure
for sim_func in (wup_sim, short_path_sim):
    print(get_max_similarity('love', 'sex', sim_func))



0.9090909090909091
37


In [185]:
# Load the data
# The CSV location can be overridden through the WORDSIM353_CSV environment
# variable, so the notebook is not tied to one machine; the default keeps the
# original local path.
WORDSIM353_CSV = os.environ.get(
    'WORDSIM353_CSV',
    '/Users/jak/Documents/Uni/TLN/TLN /Radicioni/data/WordSim353.csv',
)
df = pd.read_csv(WORDSIM353_CSV)

In [186]:
# Apply get_max_similarity to every word pair, once per similarity measure.
# Each entry: (report header, result column, synset-level similarity function)
measures = [
    ('Wu and Palmer similarity results', 'wup_similarity', wup_sim),
    ('Shortest Path similarity results', 'short_path_similarity', short_path_sim),
]

for _header, column, sim_func in measures:
    df[column] = df.apply(
        # Bind sim_func as a default so each lambda keeps its own measure.
        lambda row, f=sim_func: get_max_similarity(row['Word 1'], row['Word 2'], f),
        axis=1,
    )

# Show the results
print(df)

# Correlation of each measure with the human judgements
for position, (header, column, _sim_func) in enumerate(measures):
    if position > 0:
        print('--------------------------------')
    print(header)
    spearman = stats.spearmanr(df['Human (mean)'], df[column])
    print(f'Spearman correlation: {spearman[0]}, p-value: {spearman[1]}')
    pearson = stats.pearsonr(df['Human (mean)'], df[column])
    print(f'Pearson correlation: {pearson[0]}, p-value: {pearson[1]}')

           Word 1    Word 2  Human (mean)  wup_similarity  \
0            love       sex          6.77        0.909091   
1           tiger       cat          7.35        0.962963   
2           tiger     tiger         10.00        1.000000   
3            book     paper          7.46        0.857143   
4        computer  keyboard          7.62        0.800000   
..            ...       ...           ...             ...   
348        shower     flood          6.03        0.600000   
349       weather  forecast          8.34        0.000000   
350      disaster      area          6.25        0.428571   
351      governor    office          6.34        0.470588   
352  architecture   century          3.78        0.181818   

     short_path_similarity  
0                       37  
1                       37  
2                       38  
3                       36  
4                       35  
..                     ...  
348                     34  
349                     25  
350   

In [9]:
def get_max_similarity(word1, word2, sim_func=None):
    """Return the highest similarity over all sense pairs of two words.

    This notebook also defines a three-argument ``get_max_similarity`` in an
    earlier cell. The optional ``sim_func`` keeps calls written for that
    version working when this definition is the one in scope.

    Args:
        word1: first word (looked up with ``wn.synsets``).
        word2: second word (looked up with ``wn.synsets``).
        sim_func: optional callable taking two synsets and returning a number
            or ``None``. When omitted, ``Synset.path_similarity`` is used.

    Returns:
        The maximum score found, or ``0`` when a word has no synsets or no
        pair yields a score above 0.
    """
    if sim_func is None:
        def sim_func(synset1, synset2):
            return synset1.path_similarity(synset2)

    max_similarity = 0
    synsets2 = wn.synsets(word2)
    for synset1 in wn.synsets(word1):
        for synset2 in synsets2:
            similarity = sim_func(synset1, synset2)
            # A None score means the measure is undefined for this pair.
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity
    return max_similarity