# 1 - Concept Similarity with WordNet

In [124]:
import os

import pandas as pd
from nltk.corpus import wordnet as wn
from scipy import stats

### Implementazione di 3 misure di similarità basate su WordNet:  
* Wu-Palmer
* Shortest Path
* Leacock-Chodorow

In [126]:
# Depth of a synset
def depth(s):
    """Return the length of the longest hypernym chain above ``s``.

    Depth is counted in edges: a synset whose ``hypernyms()`` list is empty
    is treated as a root and has depth 0. Only ``hypernyms()`` links are
    followed.

    Args:
        s: An object exposing ``hypernyms()``, or ``None``.

    Returns:
        int: ``-1`` when ``s`` is ``None``, ``0`` for a root, otherwise one
        more than the depth of its deepest direct hypernym.
    """
    if s is None:
        return -1
    parents = s.hypernyms()
    if not parents:
        return 0
    # The longest route to a root goes through the deepest parent; the extra
    # 1 is the edge between that parent and ``s``.
    return max(depth(parent) for parent in parents) + 1

# Get all hypernyms of a synset (including itself)
def get_hypernyms(s):
    """Collect ``s`` and every synset reachable through ``hypernyms()``.

    The result is in depth-first pre-order: ``s`` comes first, followed by
    the full ancestry of its first hypernym, then of its second, and so on.
    A synset reachable along several paths appears once per path.

    Args:
        s: An object exposing ``hypernyms()``.

    Returns:
        list: ``s`` followed by all of its (transitive) hypernyms.
    """
    collected = []
    pending = [s]
    while pending:
        node = pending.pop()
        collected.append(node)
        # Push parents reversed so the first parent is popped (and fully
        # explored) before its siblings, matching a recursive pre-order walk.
        pending.extend(reversed(node.hypernyms()))
    return collected

# Lowest Common Subsumer
def lcs(s1, s2):
    """Return the lowest (deepest) common subsumer of two synsets.

    Both ancestor lists come from ``get_hypernyms`` and therefore include the
    synset itself, so a synset can be the subsumer of itself or of one of its
    descendants.

    Args:
        s1: First synset.
        s2: Second synset.

    Returns:
        The shared ancestor with the greatest ``depth``, or ``None`` when the
        two synsets have no ancestor in common. When several shared ancestors
        are equally deep, the one met first in ``s1``'s hypernym traversal is
        returned.
    """
    ancestors_1 = get_hypernyms(s1)
    ancestors_2 = set(get_hypernyms(s2))
    # Iterate over s1's ancestors in traversal order (de-duplicated) instead
    # of over a set intersection: ``max`` keeps the first maximal element, so
    # ties no longer depend on set iteration order, which is not guaranteed
    # to be stable across runs.
    shared = [h for h in dict.fromkeys(ancestors_1) if h in ancestors_2]
    return max(shared, key=depth, default=None)

# Wu and Palmer similarity
def wup_sim(s1, s2):
    """Compute the Wu-Palmer similarity between two synsets.

    The score is ``2 * depth(lcs) / (depth(s1) + depth(s2))`` with depths
    counted in edges from a root (a root has depth 0). As a consequence, a
    pair whose lowest common subsumer is a root scores 0.

    Args:
        s1: First synset.
        s2: Second synset.

    Returns:
        ``0`` when the synsets share no subsumer, ``1.0`` when both are roots
        sharing a subsumer, otherwise the Wu-Palmer ratio as a float.
    """
    subsumer = lcs(s1, s2)
    if subsumer is None:
        return 0
    total_depth = depth(s1) + depth(s2)
    if total_depth == 0:
        # Both synsets are roots yet share a subsumer. A root's only ancestor
        # (as collected by get_hypernyms) is itself, so the two arguments are
        # the same synset. The formula would be 0/0 here; return the identity
        # score instead of raising ZeroDivisionError.
        return 1.0
    return 2 * depth(subsumer) / total_depth

# Get max similarity between all synsets of two words
def get_max_similarity(s1, s2):
    """Return the best ``wup_sim`` score over all sense pairs of two words.

    Args:
        s1: First word, looked up with ``wn.synsets``.
        s2: Second word, looked up with ``wn.synsets``.

    Returns:
        The largest ``wup_sim`` value found across every synset of ``s1``
        paired with every synset of ``s2``, or ``0`` when no pair scores
        above zero (which includes the case where a word has no synsets).
    """
    senses_1 = wn.synsets(s1)
    senses_2 = wn.synsets(s2)
    # Seed with 0 so an empty sense list, or only non-positive scores,
    # still yields 0.
    scores = [0]
    scores.extend(wup_sim(a, b) for a in senses_1 for b in senses_2)
    return max(scores)


# Disabled spot checks kept for reference.
# NOTE(review): the first one calls `get_lcs`, which is not defined in the
# visible cells (the helper above is named `lcs`), and it compares
# 'cat.n.01' with itself rather than a dog sense with a cat sense.
# print(get_lcs(wn.synset('cat.n.01'), wn.synset('cat.n.01')))
# print(get_max_similarity('dog', 'cat'))
# Inspect the weather/forecast pair: depth of 'forecast.n.01', then the
# hypernym lists of both synsets.
print(depth(wn.synset('forecast.n.01')))
print(get_hypernyms(wn.synset('forecast.n.01')))
print(get_hypernyms(wn.synset('weather.n.01')))




6
[Synset('prognosis.n.01'), Synset('prediction.n.02'), Synset('statement.n.01'), Synset('message.n.02'), Synset('communication.n.02'), Synset('abstraction.n.06'), Synset('entity.n.01')]
[Synset('weather.n.01'), Synset('atmospheric_phenomenon.n.01'), Synset('physical_phenomenon.n.01'), Synset('natural_phenomenon.n.01'), Synset('phenomenon.n.01'), Synset('process.n.06'), Synset('physical_entity.n.01'), Synset('entity.n.01')]


Dato che l'input in WordSim353 è una coppia di termini, mentre le tre formule operano su sensi (synset), definiamo la similarità fra due termini come la massima similarità fra tutti i sensi del primo termine e tutti i sensi del secondo: $sim(w_1, w_2) = \max_{c_1 \in s(w_1),\, c_2 \in s(w_2)} sim(c_1, c_2)$, dove $s(w)$ è l'insieme dei sensi del termine $w$.

In [122]:
# Load the WordSim353 word pairs together with their human ratings.
# The CSV location can be overridden through the WORDSIM353_CSV environment
# variable so the notebook is not tied to a single machine; the default is
# the original author's local path.
WORDSIM353_CSV = os.environ.get(
    'WORDSIM353_CSV',
    '/Users/jak/Documents/Uni/TLN/TLN /Radicioni/data/WordSim353.csv',
)
df = pd.read_csv(WORDSIM353_CSV)

In [123]:
# Run get_max_similarity on every word pair and store the score per row.
df['max_similarity'] = df.apply(
    lambda pair: get_max_similarity(pair['Word 1'], pair['Word 2']),
    axis=1,
)

# get max similarity between all synsets of two words using NLTK wup_similarity
def get_max_sim2(s1, s2):
    """Return the best NLTK ``wup_similarity`` over all sense pairs of two words.

    Args:
        s1: First word, looked up with ``wn.synsets``.
        s2: Second word, looked up with ``wn.synsets``.

    Returns:
        The largest ``wup_similarity`` value across every synset of ``s1``
        paired with every synset of ``s2``. Pairs for which NLTK returns
        ``None`` are ignored; ``0`` is returned when no pair scores above
        zero.
    """
    senses_1 = wn.synsets(s1)
    senses_2 = wn.synsets(s2)
    pair_scores = (a.wup_similarity(b) for a in senses_1 for b in senses_2)
    # Seed with 0 so that missing senses or all-None scores still yield 0.
    return max([0] + [score for score in pair_scores if score is not None])

# Run get_max_sim2 (NLTK's own wup_similarity) on every word pair.
df['max_similarity2'] = df.apply(
    lambda pair: get_max_sim2(pair['Word 1'], pair['Word 2']),
    axis=1,
)

# Show the resulting table.
print(df)

# Correlate each similarity column with the human ratings: Spearman rank
# correlation for both columns first, then Pearson for both columns.
for correlation in (stats.spearmanr, stats.pearsonr):
    for column in ('max_similarity', 'max_similarity2'):
        print(correlation(df['Human (mean)'], df[column]))


           Word 1    Word 2  Human (mean)  max_similarity  max_similarity2
0            love       sex          6.77        0.909091         0.923077
1           tiger       cat          7.35        0.962963         0.965517
2           tiger     tiger         10.00        1.000000         1.000000
3            book     paper          7.46        0.857143         0.875000
4        computer  keyboard          7.62        0.800000         0.823529
..            ...       ...           ...             ...              ...
348        shower     flood          6.03        0.600000         0.636364
349       weather  forecast          8.34        0.000000         0.333333
350      disaster      area          6.25        0.428571         0.500000
351      governor    office          6.34        0.470588         0.526316
352  architecture   century          3.78        0.181818         0.307692

[353 rows x 5 columns]
SpearmanrResult(correlation=0.32205979107230365, pvalue=5.816296751820106e-1

In [9]:
# Get max similarity for each word pair from all synsets
def get_max_similarity(word1, word2):
    """Return the best NLTK ``path_similarity`` over all sense pairs of two words.

    NOTE: this function has the same name as the Wu-Palmer-based helper
    defined earlier in the notebook, so running this cell rebinds
    ``get_max_similarity`` to the path-based version.

    Args:
        word1: First word, looked up with ``wn.synsets``.
        word2: Second word, looked up with ``wn.synsets``.

    Returns:
        The largest ``path_similarity`` value across every synset pair.
        Pairs for which NLTK returns ``None`` are ignored; ``0`` is returned
        when no pair scores above zero.
    """
    candidates = (
        first.path_similarity(second)
        for first in wn.synsets(word1)
        for second in wn.synsets(word2)
    )
    # Seed with 0 so that missing senses or all-None scores still yield 0.
    return max([0] + [score for score in candidates if score is not None])