https://stackoverflow.com/questions/41793842/wordnet-python-words-similarity

http://www.nltk.org/howto/wordnet.html

https://stackoverflow.com/questions/13513455/drawing-a-graph-or-a-network-from-a-distance-matrix

In [1]:
from nltk.corpus import wordnet as wn

In [2]:
wn.synsets('dog')

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [3]:
wn.synsets('dog', pos=wn.VERB)

[Synset('chase.v.01')]

In [4]:
print(wn.synset('dog.n.01').definition())

a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds


In [5]:
print(wn.synset('dog.n.01').examples()[0])

the dog barked all night


In [6]:
wn.synset('dog.n.01').lemmas()

[Lemma('dog.n.01.dog'),
 Lemma('dog.n.01.domestic_dog'),
 Lemma('dog.n.01.Canis_familiaris')]

In [7]:
[str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()]

['dog', 'domestic_dog', 'Canis_familiaris']

In [8]:
wn.lemma('dog.n.01.dog').synset()

Synset('dog.n.01')

In [9]:
# Two noun senses and two verb senses; these names are reused by later cells.
dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
hit, slap = wn.synset('hit.v.01'), wn.synset('slap.v.01')

# Path similarity for the noun pair, then for the verb pair through both the
# Synset method and the corpus-reader function.
print(dog.path_similarity(cat))
print(hit.path_similarity(slap))
print(wn.path_similarity(hit, slap))

# Same verb pair with simulate_root=False (the recorded run printed None for both).
print(hit.path_similarity(slap, simulate_root=False))
print(wn.path_similarity(hit, slap, simulate_root=False))

0.2
0.14285714285714285
0.14285714285714285
None
None


In [11]:
%%time
# Leacock-Chodorow (lch) similarity for the synsets defined in the earlier cell.
print(dog.lch_similarity(cat))  # noun pair
print(hit.lch_similarity(slap))  # verb pair, Synset method form
print(wn.lch_similarity(hit, slap))  # same verb pair via the corpus-reader function
# With simulate_root=False the recorded run printed None for both calls below.
print(hit.lch_similarity(slap, simulate_root=False))
print(wn.lch_similarity(hit, slap, simulate_root=False))

2.0281482472922856
1.3121863889661687
1.3121863889661687
None
None
CPU times: user 809 µs, sys: 221 µs, total: 1.03 ms
Wall time: 826 µs


In [12]:
%%time
# Wu-Palmer (wup) similarity for the synsets defined in the earlier cell.
print(dog.wup_similarity(cat))  # noun pair
print(hit.wup_similarity(slap))  # verb pair, Synset method form
print(wn.wup_similarity(hit, slap))  # same verb pair via the corpus-reader function
# With simulate_root=False the recorded run printed None for both calls below.
print(hit.wup_similarity(slap, simulate_root=False))
print(wn.wup_similarity(hit, slap, simulate_root=False))

0.8571428571428571
0.25
0.25
None
None
CPU times: user 1.55 ms, sys: 274 µs, total: 1.83 ms
Wall time: 1.61 ms


wordnet_ic Information Content: Load an information content file from the wordnet_ic corpus.

In [14]:
import nltk
nltk.download('wordnet_ic')

[nltk_data] Downloading package wordnet_ic to /Users/k/nltk_data...
[nltk_data]   Unzipping corpora/wordnet_ic.zip.


True

In [15]:
from nltk.corpus import wordnet_ic
# Pre-computed information-content tables shipped in the wordnet_ic corpus
# (downloaded by the previous cell).
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
# Or you can create an information content dictionary from a corpus (or anything that has a words() method).
# NOTE(review): the genesis corpus presumably has to be downloaded separately - confirm.
from nltk.corpus import genesis
# NOTE(review): the positional arguments False and 0.0 are presumably
# weight_senses_equally and smoothing - confirm against the NLTK docs.
genesis_ic = wn.ic(genesis, False, 0.0)

synset1.res_similarity(synset2, ic): Resnik Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node). Note that for any similarity measure that uses information content, the result is dependent on the corpus used to generate the information content and the specifics of how the information content was created.

In [16]:
print(dog.res_similarity(cat, brown_ic))  # doctest: +ELLIPSIS
print(dog.res_similarity(cat, genesis_ic))  # doctest: +ELLIPSIS

7.911666509036577
7.204023991374837


synset1.jcn_similarity(synset2, ic): Jiang-Conrath Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

In [17]:
print(dog.jcn_similarity(cat, brown_ic))  # doctest: +ELLIPSIS
print(dog.jcn_similarity(cat, genesis_ic))  # doctest: +ELLIPSIS

0.4497755285516739
0.28539390848096946


synset1.lin_similarity(synset2, ic): Lin Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

In [18]:
dog.lin_similarity(cat, semcor_ic)  # doctest: +ELLIPSIS

0.8863288628086228

In [22]:
# Load the evaluation set: one "word1,word2,score" record per line.
# Per the original note, the score lies between 1 and 5.
path = './test_data.txt'
test_data = []
with open(path, 'r', encoding='utf-8') as f:
    for line in f:  # iterate lazily instead of materialising readlines()
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise break the 3-way unpack.
            continue
        # Strip each field so stray spaces around the commas do not end up in
        # the words that are later looked up.
        word1, word2, sim = (field.strip() for field in line.split(','))
        test_data.append((word1, word2, float(sim)))

In [23]:
print(len(test_data))

771


In [24]:
print(test_data[0])

('access', 'gateway', 3.791666667)


In [37]:
print(wn.synsets('customers'))

[Synset('customer.n.01')]


In [38]:
for synset in wn.synsets('customers'):
    print(synset.pos())

n


In [36]:
# wn.synsets('customers') resolved to customer.n.01 in the cells above, but
# wn.synset() rejects the inflected form (see the recorded WordNetError).
# The error is the point of this cell, so catch and print it; an uncaught
# exception would stop a Restart & Run All at this cell.
from nltk.corpus.reader.wordnet import WordNetError

try:
    print(wn.synset('customers.n.01'))
except WordNetError as err:
    print('WordNetError:', err)

WordNetError: no lemma 'customers' with part of speech 'n'

In [45]:
import numpy as np
def cal_sim_based_wordnet(test_data):
    """Score each word pair with its best lch similarity over WordNet senses.

    For every sample, all synsets of both words are looked up with
    ``wn.synsets`` and ``lch_similarity`` is evaluated for every pair of
    synsets that share a part of speech; the maximum score is kept.
    The mean and standard deviation of all scores are printed.

    Args:
        test_data: iterable of 3-tuples ``(word1, word2, rating)``; the third
            element is ignored here.

    Returns:
        A list of ``(word1, word2, sim)`` tuples in input order. ``sim`` stays
        at the sentinel ``-1`` when no same-POS synset pair produced a score
        (for example when ``wn.synsets`` returns nothing for a word). The
        sentinel is included in the printed mean/std.
    """
    # Local import: the notebook only imports `wn`, not the error type.
    from nltk.corpus.reader.wordnet import WordNetError

    result = []
    for word1, word2, _ in test_data:
        sim = -1  # sentinel meaning "no comparable synset pair found"
        word2_synsets = wn.synsets(word2)
        for word1_synset in wn.synsets(word1):
            for word2_synset in word2_synsets:
                # Only synsets with the same part of speech are compared.
                if word1_synset.pos() != word2_synset.pos():
                    continue
                try:
                    score = word1_synset.lch_similarity(word2_synset)
                except WordNetError:
                    # Pair cannot be scored by WordNet: treat as "no score".
                    # Any other exception is a real bug and must propagate.
                    continue
                # A None score (no usable path) used to surface as a TypeError
                # from max() that the blanket `except` swallowed; skip it
                # explicitly instead.
                if score is not None:
                    sim = max(sim, score)
        result.append((word1, word2, sim))
    sims = [row[2] for row in result]
    print(np.mean(sims))
    print(np.std(sims))
    return result

In [46]:
results = cal_sim_based_wordnet(test_data)

2.7270948929057512
0.5349678262208599


In [47]:
for sample in results:
    print(sample)

('access', 'gateway', 2.538973871058276)
('account', 'explanation', 3.6375861597263857)
('account', 'invoice', 3.6375861597263857)
('account', 'statement', 2.9444389791664407)
('acoustic', 'remedy', 2.9444389791664407)
('acrylic', 'cloth', 2.9444389791664407)
('action', 'adjustment', 2.538973871058276)
('action', 'entrance', 2.538973871058276)
('activity', 'event', 2.538973871058276)
('activity', 'music', 2.9444389791664407)
('activity', 'skiing', 2.2512917986064953)
('addition', 'segment', 2.2512917986064953)
('adhesive', 'glue', 2.538973871058276)
('adult', 'dentist', 2.0281482472922856)
('adult', 'doctor', 2.0281482472922856)
('afternoon', 'substance', 2.2512917986064953)
('age', 'childhood', 2.538973871058276)
('agency', 'army', 2.9444389791664407)
('agency', 'office', 3.6375861597263857)
('agency', 'police', 2.538973871058276)
('agent', 'spy', 2.538973871058276)
('agreement', 'contract', 2.538973871058276)
('aim', 'purpose', 3.6375861597263857)
('aircraft', 'balloon', 2.5389738710

In [49]:
from scipy import stats

# Third tuple element is the score: WordNet-based in `results`, human rating in `test_data`.
sim_based_on_wordnet = [row[2] for row in results]
sim_human = [row[2] for row in test_data]
# Spearman rank correlation between the two score lists.
rho, pval = stats.spearmanr(sim_based_on_wordnet, sim_human)

In [50]:
print(rho)

0.49604190501569656


https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html
https://www.cnblogs.com/zhangchaoyang/articles/2631907.html   
https://blog.csdn.net/wsywl/article/details/5727327

In [53]:
def cal_path_sim_based_wordnet(test_data):
    """Score each word pair with its best path similarity over WordNet senses.

    For every sample, all synsets of both words are looked up with
    ``wn.synsets`` and ``path_similarity`` is evaluated for every pair of
    synsets that share a part of speech; the maximum score is kept.
    The mean and standard deviation of all scores are printed.

    Args:
        test_data: iterable of 3-tuples ``(word1, word2, rating)``; the third
            element is ignored here.

    Returns:
        A list of ``(word1, word2, sim)`` tuples in input order. ``sim`` stays
        at the sentinel ``-1`` when no same-POS synset pair produced a score
        (for example when ``wn.synsets`` returns nothing for a word). The
        sentinel is included in the printed mean/std.
    """
    # Local import: the notebook only imports `wn`, not the error type.
    from nltk.corpus.reader.wordnet import WordNetError

    result = []
    for word1, word2, _ in test_data:
        sim = -1  # sentinel meaning "no comparable synset pair found"
        word2_synsets = wn.synsets(word2)
        for word1_synset in wn.synsets(word1):
            for word2_synset in word2_synsets:
                # Only synsets with the same part of speech are compared.
                if word1_synset.pos() != word2_synset.pos():
                    continue
                try:
                    score = word1_synset.path_similarity(word2_synset)
                except WordNetError:
                    # Best-effort guard kept from the original: a pair WordNet
                    # cannot score counts as "no score". Any other exception
                    # is a real bug and must propagate.
                    continue
                # A None score (no connecting path) used to surface as a
                # TypeError from max() that the blanket `except` swallowed;
                # skip it explicitly instead.
                if score is not None:
                    sim = max(sim, score)
        result.append((word1, word2, sim))
    sims = [row[2] for row in result]
    print('mean:', np.mean(sims))
    print('std:', np.std(sims))
    return result

# Score every pair with path similarity, then rank-correlate with the human ratings.
results = cal_path_sim_based_wordnet(test_data)
path_sim_based_on_wordnet = [row[2] for row in results]
sim_human = [row[2] for row in test_data]
rho, pval = stats.spearmanr(path_sim_based_on_wordnet, sim_human)
print(rho)

mean: 0.4658680705373312
std: 0.25191968840001233
0.49849248829295956


https://zhuanlan.zhihu.com/p/24961011
写的还行，不过有大段重复。这里面有个错误。其实，size应该是词向量的维度。

In [57]:
from gensim.models import word2vec

In [58]:
%%time
sentences = word2vec.Text8Corpus('./text8.txt') #经过了分词处理

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 13.8 µs


In [59]:
%%time
model = word2vec.Word2Vec(sentences, size=200,min_count=1)

CPU times: user 3min 54s, sys: 5.12 s, total: 4min
Wall time: 1min 51s


In [61]:
model.similarity('woman', 'man')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.69430226

In [60]:
def cal_sim_based_on_word2vec(test_data):
    """Score each word pair with the similarity reported by the word2vec model.

    Uses the notebook-level ``model`` and prints the mean and standard
    deviation of all scores.

    Args:
        test_data: iterable of 3-tuples ``(word1, word2, rating)``; the third
            element is ignored here.

    Returns:
        A list of ``(word1, word2, sim)`` tuples in input order.

    NOTE(review): a word missing from the model vocabulary is expected to make
    gensim raise KeyError - confirm. It is deliberately not caught so the
    result stays aligned one-to-one with ``test_data`` for the rank correlation.
    """
    # Query the vectors through `model.wv` (already used elsewhere in this
    # notebook) rather than the model-level `similarity` shortcut, which
    # appears to be what triggers the warning seen in the recorded output.
    result = [
        (word1, word2, model.wv.similarity(word1, word2))
        for word1, word2, _ in test_data
    ]
    sims = [row[2] for row in result]
    print('mean:', np.mean(sims))
    print('std:', np.std(sims))
    return result

In [62]:
# Score every pair with the word2vec model, then rank-correlate with the human ratings.
results = cal_sim_based_on_word2vec(test_data)
sim_based_on_word2vec = [row[2] for row in results]
sim_human = [row[2] for row in test_data]
rho, pval = stats.spearmanr(sim_based_on_word2vec, sim_human)
print(rho)

mean: 0.34025797
std: 0.21585985
0.5033343211737935


  """
  if np.issubdtype(vec.dtype, np.int):


也可以使用Freebase naming预训练好的词向量。这里先用GoogleNews-vectors-negative300.bin.gz，见https://code.google.com/archive/p/word2vec/   
https://github.com/xgli/word2vec-api#where-to-get-a-pretrained-model   
http://www.voidcn.com/article/p-weoxyjng-brp.html   
https://lonepatient.top/2018/02/12/pretrained%20word%20vector%20model%20python.html


GoogleNews-vectors-negative300.bin.gz过大，使用gitignore不上传github.(载入的时候使用bin格式，要解压一下)


In [None]:
# Load the pre-trained GoogleNews vectors (binary word2vec format) and copy
# them into a plain dict keyed by word.
# NOTE(review): this cell has no execution count, so it appears never to have been run.
EMBEDDING_FILE = './GoogleNews-vectors-negative300.bin'
word2vecDict = word2vec.KeyedVectors.load_word2vec_format(EMBEDDING_FILE,binary=True)
embed_size = 300  # not used in this cell; presumably the vector dimension - confirm
embedding = dict()
# Map each word to its word vector.
# NOTE(review): this duplicates every vector into a Python dict; for a model
# this size that is presumably memory-heavy - confirm it is really needed.
for word in word2vecDict.wv.vocab:
    embedding[word] = word2vecDict.word_vec(word) # the corresponding (300,) word vector, per the original note
print('Load %s word vectors.' % len(embedding))

In [65]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity([[1,2]],[[1,0]])) #矩阵运算。

[[0.4472136]]


In [66]:
model.similarity('woman', 'man')
#https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.BaseKeyedVectors.similarity

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.69430226

https://radimrehurek.com/gensim/models/word2vec.html

In [68]:
print(model.wv['woman'])

[ 1.0936182e+00 -5.2764189e-01  3.8536820e-01  4.6905097e-01
 -1.2095776e+00  1.9831616e-01 -9.4056559e-01  5.1411468e-01
 -6.0975820e-01  3.9441466e-01  1.0875509e+00 -2.3040097e+00
  6.3641429e-01 -5.0952858e-01  1.2991109e+00 -8.4646088e-01
 -2.2540779e+00 -1.5012826e-01  2.9543605e-01  1.4910125e+00
 -4.8979050e-01  2.6586127e-01  4.9781594e-01 -5.5228263e-01
  1.9406296e+00  1.0369748e+00  2.1361454e+00 -9.2265499e-01
 -1.9215186e+00 -1.0984342e+00  7.6154262e-01  8.2822680e-01
 -6.5015036e-01 -5.7646668e-01  9.3752187e-01  1.2917703e+00
  1.4428542e-03 -1.8448628e+00  1.4664515e+00  2.6143986e-01
  1.6343244e+00  1.5308477e+00  1.1392382e+00  2.7018315e-01
  1.3911008e+00 -3.6158633e-01  1.3006872e+00 -2.9908037e-01
 -1.7434940e+00  1.1519655e+00  4.7431079e-01  7.6003575e-01
 -4.5718226e-01 -1.3961911e+00  9.4889516e-01 -2.9400763e-01
  8.0086142e-01 -8.0007684e-01 -4.0882677e-01 -8.5615814e-01
 -5.0224245e-01 -1.2404286e+00  1.9735326e+00 -9.5109832e-01
  5.6321853e-01 -4.29346

In [69]:
print(cosine_similarity([model.wv['woman']],[model.wv['man']]))

[[0.69430226]]


In [70]:
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')

In [71]:
dog.res_similarity(cat, brown_ic)

7.911666509036577