In [1]:
import nltk

#下载wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from nltk.corpus import wordnet

#一个单词往往有多个意思
#单词.词性.词义序号
wordnet.synsets('girl')

[Synset('girl.n.01'),
 Synset('female_child.n.01'),
 Synset('daughter.n.01'),
 Synset('girlfriend.n.02'),
 Synset('girl.n.05')]

In [3]:
#所有词性
print(wordnet.ADJ)
print(wordnet.ADJ_SAT)
print(wordnet.ADV)
print(wordnet.NOUN)
print(wordnet.VERB)

a
s
r
n
v


In [4]:
#指定词性
wordnet.synsets('dog', pos='v')

[Synset('chase.v.01')]

In [5]:
#查看每个词的意思
print(wordnet.synset('girl.n.01').definition())

for i in wordnet.synsets('girl'):
    print(i, i.definition())

a young woman
Synset('girl.n.01') a young woman
Synset('female_child.n.01') a youthful female person
Synset('daughter.n.01') a female human offspring
Synset('girlfriend.n.02') a girl or young woman with whom a man is romantically involved
Synset('girl.n.05') a friendly informal reference to a grown woman


In [6]:
#根据意思找词
wordnet.synset('girl.n.01').lemma_names(), wordnet.synset('girl.n.01').lemmas()

(['girl', 'miss', 'missy', 'young_lady', 'young_woman', 'fille'],
 [Lemma('girl.n.01.girl'),
  Lemma('girl.n.01.miss'),
  Lemma('girl.n.01.missy'),
  Lemma('girl.n.01.young_lady'),
  Lemma('girl.n.01.young_woman'),
  Lemma('girl.n.01.fille')])

In [7]:
#找一个词的上位词
wordnet.synset('girl.n.01').hypernyms()

[Synset('woman.n.01')]

In [8]:
#找一个词的下位词
wordnet.synset('girl.n.01').hyponyms()

[Synset('baby.n.05'),
 Synset('belle.n.01'),
 Synset('bimbo.n.01'),
 Synset('chachka.n.01'),
 Synset('chit.n.01'),
 Synset('colleen.n.01'),
 Synset('dame.n.01'),
 Synset('flapper.n.01'),
 Synset('gal.n.03'),
 Synset('gamine.n.02'),
 Synset('gibson_girl.n.01'),
 Synset('lass.n.01'),
 Synset('maid.n.02'),
 Synset('may_queen.n.01'),
 Synset('mill-girl.n.01'),
 Synset('party_girl.n.01'),
 Synset('peri.n.01'),
 Synset('ring_girl.n.01'),
 Synset('rosebud.n.02'),
 Synset('sex_kitten.n.01'),
 Synset('shop_girl.n.01'),
 Synset('soubrette.n.01'),
 Synset('sweater_girl.n.01'),
 Synset('tomboy.n.01'),
 Synset('valley_girl.n.01'),
 Synset('working_girl.n.01')]

In [9]:
#求两个词之间的相似度,0-1之间
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')

dog.path_similarity(cat)

0.2

In [10]:
#求两个词的最低共同祖先
dog.lowest_common_hypernyms(cat)

[Synset('carnivore.n.01')]

In [11]:
#动词求蕴含词
wordnet.synset('walk.v.01').entailments()

[Synset('step.v.01')]

In [12]:
#求反义词
wordnet.lemma('hot.a.01.hot').antonyms()

[Lemma('cold.a.01.cold')]

In [13]:
#全体词表,取出主词,去重
nodes = list({i.name().split('.')[0] for i in wordnet.all_synsets()})

len(nodes), nodes[:15]

(86555,
 ['',
  'backblast',
  'kubrick',
  'puppy',
  'housebreaker',
  'by-and-by',
  'run-down',
  'formed',
  'sternpost',
  'lentibulariaceae',
  'culinary',
  'discocephali',
  'ironwood',
  'freestanding',
  'clinical'])

In [14]:
#全体可能的同义词
#all_lemma_names是无重复的
lemmas = list(wordnet.all_lemma_names())

len(lemmas), lemmas[:15]

(147306,
 ['.22-caliber',
  '.22-calibre',
  '.22_caliber',
  '.22_calibre',
  '.38-caliber',
  '.38-calibre',
  '.38_caliber',
  '.38_calibre',
  '.45-caliber',
  '.45-calibre',
  '.45_caliber',
  '.45_calibre',
  '0',
  '1',
  '10'])

In [15]:
#合并nodes和lemmas,其实nodes基本上是lemmas的子集,只多了4个词
nodes = list(set(nodes + lemmas))

len(nodes), nodes[:15]

(147311,
 ['',
  'hemimetabolism',
  'backblast',
  'robert_lowell',
  'russian_wolfhound',
  'kubrick',
  'board_of_directors',
  'subserviently',
  'ardea_herodius',
  'puppy',
  'housebreaker',
  'by-and-by',
  'captain_john_smith',
  'chromium-plate',
  'fritillaria_agrestis'])

In [16]:
#所有的关系
edges = [
    'hypernyms', 'hyponyms', 'instance_hyponyms', 'member_meronyms',
    'part_meronyms', 'topic_domains', 'usage_domains', 'region_domains',
    'attributes', 'entailments', 'causes', 'also_sees', 'verb_groups',
    'similar_tos', 'lemma_names'
]

In [17]:
#输出图谱
for node in wordnet.synsets('girl'):
    print('node=', node)
    for edge in edges:
        print('edge=', edge, getattr(node, edge)())

    print('---------------')

node= Synset('girl.n.01')
edge= hypernyms [Synset('woman.n.01')]
edge= hyponyms [Synset('baby.n.05'), Synset('belle.n.01'), Synset('bimbo.n.01'), Synset('chachka.n.01'), Synset('chit.n.01'), Synset('colleen.n.01'), Synset('dame.n.01'), Synset('flapper.n.01'), Synset('gal.n.03'), Synset('gamine.n.02'), Synset('gibson_girl.n.01'), Synset('lass.n.01'), Synset('maid.n.02'), Synset('may_queen.n.01'), Synset('mill-girl.n.01'), Synset('party_girl.n.01'), Synset('peri.n.01'), Synset('ring_girl.n.01'), Synset('rosebud.n.02'), Synset('sex_kitten.n.01'), Synset('shop_girl.n.01'), Synset('soubrette.n.01'), Synset('sweater_girl.n.01'), Synset('tomboy.n.01'), Synset('valley_girl.n.01'), Synset('working_girl.n.01')]
edge= instance_hyponyms []
edge= member_meronyms []
edge= part_meronyms []
edge= topic_domains []
edge= usage_domains []
edge= region_domains []
edge= attributes []
edge= entailments []
edge= causes []
edge= also_sees []
edge= verb_groups []
edge= similar_tos []
edge= lemma_names ['girl',

In [24]:
for lemma in wordnet.lemmas('girl'):
    print('lemma=', lemma)

    #只用于lemma的三种关系
    for edge in ['antonyms', 'derivationally_related_forms', 'pertainyms']:
        for target in getattr(lemma, edge)():
            print('edge=', edge, 'target=', target.name())

    print('---------------')

lemma= Lemma('girl.n.01.girl')
edge= derivationally_related_forms target= girlhood
---------------
lemma= Lemma('female_child.n.01.girl')
edge= antonyms target= boy
edge= derivationally_related_forms target= girlhood
---------------
lemma= Lemma('daughter.n.01.girl')
edge= antonyms target= boy
---------------
lemma= Lemma('girlfriend.n.02.girl')
---------------
lemma= Lemma('girl.n.05.girl')
---------------
