# **Concept Similarity with WordNet**

In [12]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import wordnet as wn
from scipy import stats

## Wu & Palmer Similarity

In [5]:
# Depth of a synset - number of hypernym steps from the synset up to a root
def depth(s):
    """Return how many hypernym steps separate synset ``s`` from a root.

    When a synset has several hypernyms, the longest chain is the one counted.

    Args:
        s: An object exposing ``hypernyms()`` (a WordNet synset), or ``None``.

    Returns:
        ``-1`` when ``s`` is ``None``, ``0`` when ``s`` has no hypernyms,
        otherwise one more than the depth of its deepest hypernym.
    """
    if s is None:
        return -1
    parents = s.hypernyms()
    if not parents:
        return 0
    # One extra step links the deepest parent to s itself.
    return max(depth(parent) for parent in parents) + 1

# Get all hypernyms of a synset (including itself)
def all_hypernyms(s):
    """Collect ``s`` and every synset reachable from it through ``hypernyms()``.

    The result is in depth-first pre-order and is not deduplicated: a synset
    reachable along several hypernym chains appears once per chain.

    Args:
        s: An object exposing ``hypernyms()`` (a WordNet synset).

    Returns:
        A list whose first element is ``s`` itself.
    """
    collected = []
    pending = [s]
    while pending:
        current = pending.pop()
        collected.append(current)
        # Push parents reversed so the first parent is expanded first,
        # which keeps the same ordering as a recursive walk.
        pending.extend(reversed(current.hypernyms()))
    return collected

# Lowest Common Subsumer
def lcs(s1, s2):
    """Return the deepest synset that subsumes both ``s1`` and ``s2``.

    The candidates are the synsets found in both ``all_hypernyms(s1)`` and
    ``all_hypernyms(s2)``; since each list includes the synset itself, one
    argument can be the subsumer of the other. Among the candidates the one
    with the largest ``depth`` is returned.

    Args:
        s1: First synset, or ``None``.
        s2: Second synset, or ``None``.

    Returns:
        The common subsumer with the greatest depth, or ``None`` when either
        argument is ``None`` or the two synsets share no ancestor.
    """
    # depth() accepts None, so treat a missing synset as "no common subsumer"
    # instead of failing inside all_hypernyms().
    if s1 is None or s2 is None:
        return None
    # all_hypernyms() always contains at least the synset itself, so the lists
    # are never empty and only the intersection needs checking.
    shared = set(all_hypernyms(s1)) & set(all_hypernyms(s2))
    if not shared:
        return None
    return max(shared, key=depth)

# Wu and Palmer similarity
def wup_sim(s1, s2):
    """Wu & Palmer similarity: ``2 * depth(lcs) / (depth(s1) + depth(s2))``.

    Depths come from ``depth``, where a root synset has depth 0, so any pair
    whose lowest common subsumer is a root scores 0.

    Args:
        s1: First synset.
        s2: Second synset.

    Returns:
        ``0`` when the synsets have no common subsumer, ``1.0`` when both are
        the same root synset, otherwise the ratio above.
    """
    lcs_syn = lcs(s1, s2)
    if lcs_syn is None:
        return 0
    total_depth = depth(s1) + depth(s2)
    if total_depth == 0:
        # Both synsets have no hypernyms yet share a subsumer, so they are the
        # same root synset; the plain formula would divide by zero here.
        return 1.0
    return 2 * depth(lcs_syn) / total_depth

## Shortest Path Similarity

In [6]:
# Find max depth of all synsets in wordnet
def compute_max_depth():
    """Scan every WordNet synset and return the largest ``depth`` value found.

    This is kept under its own name so that it stays callable: the name
    ``max_depth`` is bound to the integer constant below, which is what the
    similarity functions read.

    Returns:
        The maximum of ``depth(synset)`` over ``wn.all_synsets()``, never
        lower than 0.
    """
    deepest = 0
    for synset in wn.all_synsets():
        deepest = max(deepest, depth(synset))
    return deepest

# max_depth = compute_max_depth()  # full scan; the original author notes the
# result is always 19, so it is hardcoded to avoid recomputing it on every run.
max_depth = 19

In [7]:
# Find shortest path between two synsets
def shortest_path(s1, s2):
    """Return the path length between two synsets through their common subsumer.

    The length is ``depth(s1) + depth(s2) - 2 * depth(lcs)``, i.e. the number
    of steps up from each synset to the subsumer chosen by ``lcs``.

    Args:
        s1: First synset.
        s2: Second synset.

    Returns:
        The path length, or ``2 * max_depth`` when the synsets share no
        ancestor.
    """
    lcs_syn = lcs(s1, s2)
    if lcs_syn is None:
        # Unconnected synsets get the largest distance the measure can take
        # (two synsets at maximum depth meeting at a root). Returning only
        # max_depth would rank them as closer than connected synsets that are
        # more than max_depth steps apart.
        return 2 * max_depth
    return depth(s1) + depth(s2) - 2 * depth(lcs_syn)

# Shortest path similarity
def short_path_sim(s1, s2):
    """Turn the path length between two synsets into a similarity score.

    Args:
        s1: First synset.
        s2: Second synset.

    Returns:
        ``2 * max_depth`` minus ``shortest_path(s1, s2)``, so shorter paths
        give larger scores.
    """
    path_length = shortest_path(s1, s2)
    return 2 * max_depth - path_length

## Leacock & Chodorow Similarity

In [8]:
# Leacock and Chodorow similarity
def leac_sim(s1, s2):
    """Leacock & Chodorow similarity: negative log of the scaled path length.

    Args:
        s1: First synset.
        s2: Second synset.

    Returns:
        ``-log(distance / (2 * max_depth))`` for a non-zero distance, and
        ``-log(1 / (2 * max_depth + 1))`` when the distance is zero.
    """
    path_length = shortest_path(s1, s2)
    if path_length != 0:
        return -np.log(path_length / (2 * max_depth))
    # log(0) is undefined, so the zero-distance case is smoothed. Precedence
    # note: this evaluates distance + (1 / (2 * max_depth + 1)), which for a
    # distance of 0 equals (distance + 1) / (2 * max_depth + 1).
    return -np.log(path_length + 1 / (2 * max_depth + 1))

Dato che l'input in WordSim353 è una coppia di termini, mentre le tre formule utilizzano sensi, per calcolare la similarity fra i 2 termini prendiamo la massima similarity fra tutti i sensi del primo termine e tutti i sensi del secondo termine.

In [9]:
# Get max similarity between all synsets of two words, given a similarity function
def get_max_similarity(s1, s2, sim_func):
    """Score two words by their best-matching pair of senses.

    Every synset of the first word is compared with every synset of the
    second word and the largest score is kept.

    Args:
        s1: First word (a string looked up with ``wn.synsets``).
        s2: Second word (a string looked up with ``wn.synsets``).
        sim_func: Callable taking two synsets and returning a number.

    Returns:
        The largest ``sim_func`` value over all sense pairs, or ``0`` when no
        pair scores above zero (including when either word has no synsets).
    """
    first_senses = wn.synsets(s1)
    second_senses = wn.synsets(s2)
    scores = [
        sim_func(sense_a, sense_b)
        for sense_a in first_senses
        for sense_b in second_senses
    ]
    # The leading 0 is the floor: a score only wins if it is strictly greater.
    return max([0] + scores)

In [18]:
# Load the data
# The CSV location can be overridden with the WORDSIM353_CSV environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original author's local path is used unchanged.
WORDSIM353_CSV = os.environ.get(
    'WORDSIM353_CSV',
    '/Users/jak/Documents/Uni/TLN/TLN /Radicioni/data/WordSim353.csv',
)
df = pd.read_csv(WORDSIM353_CSV)

### Compute similarities for WordSim353 and print results

In [21]:
# Compute the three similarity measures
# Each entry: (result column, similarity function, heading printed above its report)
similarity_measures = [
    ('wup', wup_sim, '\nWu and Palmer similarity results'),
    ('shortest', short_path_sim, 'Shortest Path similarity results'),
    ('leacock', leac_sim, 'Leacock and Chodorow similarity results'),
]

for sim_column, sim_function, _ in similarity_measures:
    df[sim_column] = df.apply(
        lambda row: get_max_similarity(row['Word 1'], row['Word 2'], sim_function),
        axis=1,
    )

# Show the dataframe
print(df.head(10))

# Correlate each measure with the human judgements; a separator line is
# printed between consecutive reports (not before the first, not after the last).
human_scores = df['Human (mean)']
for report_index, (sim_column, _, report_heading) in enumerate(similarity_measures):
    if report_index > 0:
        print('--------------------------------')
    print(report_heading)
    spearman = stats.spearmanr(human_scores, df[sim_column])
    print(f'Spearman correlation: {spearman[0]}, p-value: {spearman[1]}')
    pearson = stats.pearsonr(human_scores, df[sim_column])
    print(f'Pearson correlation: {pearson[0]}, p-value: {pearson[1]}')

       Word 1         Word 2  Human (mean)       wup  shortest   leacock
0        love            sex          6.77  0.909091        37  3.637586
1       tiger            cat          7.35  0.962963        37  3.637586
2       tiger          tiger         10.00  1.000000        38  3.663562
3        book          paper          7.46  0.857143        36  2.944439
4    computer       keyboard          7.62  0.800000        35  2.538974
5    computer       internet          7.58  0.588235        31  1.691676
6       plane            car          5.77  0.700000        32  1.845827
7       train            car          6.31  0.705882        33  2.028148
8   telephone  communication          7.50  0.000000        28  1.335001
9  television          radio          6.77  0.900000        36  2.944439

Wu and Palmer similarity results
Spearman correlation: 0.32205979107230365, p-value: 5.816296751820106e-10
Pearson correlation: 0.2756318557372003, p-value: 1.4205427728723012e-07
----------------