In [1]:
import nltk
nltk.download('genesis')
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis

[nltk_data] Downloading package genesis to
[nltk_data]     /Users/jason13nn/nltk_data...
[nltk_data]   Package genesis is already up-to-date!


## Path similarity

In [2]:
def word_path_similarity(w1, w2, pos1 = None, pos2 = None, option = None):
    
    ##Definition of pos1 and pos2
    ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
    
    if pos1 == "A" and pos2 == "A":
        x = wn.synsets(w1, "a")  + wn.synsets(w1, "s")
        y = wn.synsets(w2, "a")  + wn.synsets(w2, "s")
    elif pos1 == "A" and pos2 != "A":
        x = wn.synsets(w1, "a")  + wn.synsets(w1, "s")
        y = wn.synsets(w2, pos2) 
    elif pos1 != "A" and pos2 == "A":
        x = wn.synsets(w1, pos1) 
        y = wn.synsets(w2, "a")  + wn.synsets(w2, "s")
    else:
        x = wn.synsets(w1, pos1)  
        y = wn.synsets(w2, pos2) 
    
    ##Definition of option
    #Generate all pairs of two senses
    similarity_list = []
    
    for word1 in x:
        pairs = ([word1.path_similarity(word2) for word2 in y if word1.path_similarity(word2) is not None ])
        similarity_list.append(pairs)
        similarity_list = [i for i in similarity_list if i] 
        
        if option == "first":
            answer = x[0].path_similarity(y[0])
            
        elif option == "avg":
            answer = np.mean([float(sum(col))/len(col) for col in zip(*similarity_list)])
            
        elif option == "min":
            answer = min(min(similarity_list))
            
        elif option == None:
            answer = max(max(similarity_list))
   
    return answer

## Wu-Palmer Similarity

In [3]:
def word_wup_similarity(w1, w2, pos1 = None, pos2 = None, option = None):
    
    ##Definition of pos1 and pos2
    ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
    
    if pos1 == "A" and pos2 == "A":
        x = wn.synsets(w1, "a")  + wn.synsets(w1, "s")
        y = wn.synsets(w2, "a")  + wn.synsets(w2, "s")
    elif pos1 == "A" and pos2 != "A":
        x = wn.synsets(w1, "a")  + wn.synsets(w1, "s")
        y = wn.synsets(w2, pos2) 
    elif pos1 != "A" and pos2 == "A":
        x = wn.synsets(w1, pos1) 
        y = wn.synsets(w2, "a")  + wn.synsets(w2, "s")
    else:
        x = wn.synsets(w1, pos1)  
        y = wn.synsets(w2, pos2) 
    
    ##Definition of option
    #Generate all pairs of two senses
    similarity_list = []
    
    for word1 in x:
        pairs = ([word1.wup_similarity(word2) for word2 in y if word1.wup_similarity(word2) is not None ])
        similarity_list.append(pairs)
        similarity_list = [i for i in similarity_list if i] 
        
        if option == "first":
            answer = x[0].wup_similarity(y[0])
            
        elif option == "avg":
            answer = np.mean([float(sum(col))/len(col) for col in zip(*similarity_list)])
            
        elif option == "min":
            answer = min(min(similarity_list))
            
        elif option == None:
            answer = max(max(similarity_list))
   
    return answer

## Leacock-Chodorow Similarity

In [17]:
def word_lch_similarity(w1, w2, pos1 = None, pos2 = None, option = None):
    
    ##Definition of pos1 and pos2
    ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
    
    if pos1 == "A" and pos2 == "A":
        x = wn.synsets(w1, "a")  + wn.synsets(w1, "s")
        y = wn.synsets(w2, "a")  + wn.synsets(w2, "s")
    elif pos1 == "A" and pos2 != "A":
        x = wn.synsets(w1, "a")  + wn.synsets(w1, "s")
        y = wn.synsets(w2, pos2) 
    elif pos1 != "A" and pos2 == "A":
        x = wn.synsets(w1, pos1) 
        y = wn.synsets(w2, "a")  + wn.synsets(w2, "s")
    else:
        x = wn.synsets(w1, pos1)  
        y = wn.synsets(w2, pos2) 
    
    ##Definition of option
    #Generate all pairs of two senses
    similarity_list = []
    
    for word1 in x:
        pairs = ([word1.lch_similarity(word2) for word2 in y if word1.pos() == word2.pos()])
        similarity_list.append(pairs)
        similarity_list = [i for i in similarity_list if i] 
        
        if option == "first":
            answer = x[0].lch_similarity(y[0])
            
        elif option == "avg":
            answer = np.mean([float(sum(col))/len(col) for col in zip(*similarity_list)])
            
        elif option == "min":
            answer = min(min(similarity_list))
            
        elif option == None:
            answer = max(max(similarity_list))
   
    return answer

## Resnik Similarity 

In [13]:
def word_res_similarity(ic, w1, w2, pos1 = None, pos2 = None, option = None):
    
    #Create an information content dictionary from a corpus 
    ic = wn.ic(ic, False, 0.0)
    
    ##Definition of pos1 and pos2
    ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
    
    if pos1 == "A" and pos2 == "A":
        x = wn.synsets(w1, "a")  + wn.synsets(w1, "s")
        y = wn.synsets(w2, "a")  + wn.synsets(w2, "s")
    elif pos1 == "A" and pos2 != "A":
        x = wn.synsets(w1, "a")  + wn.synsets(w1, "s")
        y = wn.synsets(w2, pos2) 
    elif pos1 != "A" and pos2 == "A":
        x = wn.synsets(w1, pos1) 
        y = wn.synsets(w2, "a")  + wn.synsets(w2, "s")
    else:
        x = wn.synsets(w1, pos1)  
        y = wn.synsets(w2, pos2) 
    
    ##Definition of option
    #Generate all pairs of two senses
    similarity_list = []
    
    for word1 in x:
        pairs = ([word1.res_similarity(word2, ic) for word2 in y if word1.pos() == word2.pos()])
        similarity_list.append(pairs)
        similarity_list = [i for i in similarity_list if i] 
        
        if option == "first":
            answer = x[0].res_similarity(y[0], ic)
            
        elif option == "avg":
            answer = np.mean([float(sum(col))/len(col) for col in zip(*similarity_list)])
            
        elif option == "min":
            answer = min(min(similarity_list))
            
        elif option == None:
            answer = max(max(similarity_list))
   
    return answer

## Example

In [6]:
w1='like'
w2='love'
pos1=None
pos2=None
option='avg'
ic=genesis

In [7]:
word_path_similarity(w1,w2, option=option)

0.0922941476512905

In [8]:
word_wup_similarity(w1,w2, option=option)

0.20024181389727605

In [18]:
word_lch_similarity(w1, w2, option=option)

1.4946824541440682

In [14]:
word_res_similarity(ic=ic, w1=w1,w2=w2, option=option)

0.5606817596183055