# WordNet

In [1]:
from nltk.corpus import wordnet as wn

The `synsets()` function returns a list of synsets, each representing one sense of the given word.

In [2]:
# Look up every synset (word sense) that WordNet lists for 'car'.
wn.synsets('car')

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

Each synset has its own name, so you can get a synset object by specifying that name.

In [3]:
# Fetch a single synset by its name; the name has the form <word>.<pos>.<number>.
wn.synset('car.n.01')

Synset('car.n.01')

A synset is a set of lemmas, each of which is a disambiguated form of a word.

In [4]:
# List the lemma objects that make up this synset.
wn.synset('car.n.01').lemmas()

[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

Each lemma also has its own name, so you can get a lemma object by specifying that name.

In [5]:
# A lemma name is the synset name followed by the word: <synset name>.<word>.
wn.lemma('car.n.01.automobile')

Lemma('car.n.01.automobile')

A lemma is basically a pair of a synset and a word, where the word serves as the lemma's name.

In [6]:
# The synset half of the lemma: the sense this lemma belongs to.
wn.lemma('car.n.01.automobile').synset()

Synset('car.n.01')

In [7]:
# The word half of the lemma, returned as a plain string.
wn.lemma('car.n.01.automobile').name()

'automobile'

There is a convenient method, `lemma_names()`, that returns a list of the lemma names of a synset.

In [8]:
# Shortcut for collecting name() from every lemma of the synset.
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [9]:
# Lemma names for every sense of 'car', one inner list per sense.
[sense.lemma_names() for sense in wn.synsets('car')]

[['car', 'auto', 'automobile', 'machine', 'motorcar'],
 ['car', 'railcar', 'railway_car', 'railroad_car'],
 ['car', 'gondola'],
 ['car', 'elevator_car'],
 ['cable_car', 'car']]

WordNet is a dictionary, so each synset has a definition and, in some cases, usage examples.

In [10]:
# The dictionary definition (gloss) of this sense.
wn.synset('car.n.01').definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [11]:
# Definitions of all senses of 'car', in the order synsets() returns them.
[sense.definition() for sense in wn.synsets('car')]

['a motor vehicle with four wheels; usually propelled by an internal combustion engine',
 'a wheeled vehicle adapted to the rails of railroad',
 'the compartment that is suspended from an airship and that carries personnel and the cargo and the power plant',
 'where passengers ride up and down',
 'a conveyance for passengers or freight on a cable railway']

In [12]:
# Usage examples per sense; a sense without examples yields an empty list.
[sense.examples() for sense in wn.synsets('car')]

[['he needs a car to get to work'],
 ['three cars had jumped the rails'],
 [],
 ['the car was on the top floor'],
 ['they took a cable car to the top of the mountain']]

## Hyponyms

In [13]:
# The automobile sense of 'car'; reused by the hyponym and hypernym cells below.
motorcar = wn.synset('car.n.01')

In [14]:
# Hyponyms are the more specific concepts directly below this synset.
types_of_motorcar = motorcar.hyponyms()

In [15]:
# Peek at the first hyponym in the list.
types_of_motorcar[0]

Synset('ambulance.n.01')

In [16]:
# Lemma names of every kind of motorcar, one inner list per hyponym.
[kind.lemma_names() for kind in types_of_motorcar]

[['ambulance'],
 ['beach_wagon',
  'station_wagon',
  'wagon',
  'estate_car',
  'beach_waggon',
  'station_waggon',
  'waggon'],
 ['bus', 'jalopy', 'heap'],
 ['cab', 'hack', 'taxi', 'taxicab'],
 ['compact', 'compact_car'],
 ['convertible'],
 ['coupe'],
 ['cruiser',
  'police_cruiser',
  'patrol_car',
  'police_car',
  'prowl_car',
  'squad_car'],
 ['electric', 'electric_automobile', 'electric_car'],
 ['gas_guzzler'],
 ['hardtop'],
 ['hatchback'],
 ['horseless_carriage'],
 ['hot_rod', 'hot-rod'],
 ['jeep', 'landrover'],
 ['limousine', 'limo'],
 ['loaner'],
 ['minicar'],
 ['minivan'],
 ['Model_T'],
 ['pace_car'],
 ['racer', 'race_car', 'racing_car'],
 ['roadster', 'runabout', 'two-seater'],
 ['sedan', 'saloon'],
 ['sport_utility', 'sport_utility_vehicle', 'S.U.V.', 'SUV'],
 ['sports_car', 'sport_car'],
 ['Stanley_Steamer'],
 ['stock_car'],
 ['subcompact', 'subcompact_car'],
 ['touring_car', 'phaeton', 'tourer'],
 ['used-car', 'secondhand_car']]

## Hypernyms

In [17]:
# Hypernyms are the more general concepts directly above this synset.
motorcar.hypernyms()

[Synset('motor_vehicle.n.01')]

In [18]:
# All hypernym chains for this synset; each path is listed from the root
# synset down to the synset itself.
paths = motorcar.hypernym_paths()

In [19]:
# A synset can be reached from the root by more than one path.
len(paths)

2

In [20]:
# First path, ordered from the root synset down to car.n.01.
paths[0]

[Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('object.n.01'),
 Synset('whole.n.02'),
 Synset('artifact.n.01'),
 Synset('instrumentality.n.03'),
 Synset('container.n.01'),
 Synset('wheeled_vehicle.n.01'),
 Synset('self-propelled_vehicle.n.01'),
 Synset('motor_vehicle.n.01'),
 Synset('car.n.01')]

In [21]:
# Second path; per the recorded output it reaches wheeled_vehicle via
# conveyance and vehicle rather than via container.
paths[1]

[Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('object.n.01'),
 Synset('whole.n.02'),
 Synset('artifact.n.01'),
 Synset('instrumentality.n.03'),
 Synset('conveyance.n.03'),
 Synset('vehicle.n.01'),
 Synset('wheeled_vehicle.n.01'),
 Synset('self-propelled_vehicle.n.01'),
 Synset('motor_vehicle.n.01'),
 Synset('car.n.01')]

In [22]:
# The top-most synset(s) that the hypernym chains of this synset end in.
motorcar.root_hypernyms()

[Synset('entity.n.01')]

## Meronyms and Holonyms

In [23]:
# Part meronyms: the components a tree has.
wn.synset('tree.n.01').part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

In [24]:
# Substance meronyms: the materials a tree is made of.
wn.synset('tree.n.01').substance_meronyms()

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

In [25]:
# Member holonyms: the collections that a tree is a member of.
wn.synset('tree.n.01').member_holonyms()

[Synset('forest.n.01')]

## Entailments

In [26]:
# Entailments of a verb: actions that performing it necessarily involves.
wn.synset('walk.v.01').entailments()

[Synset('step.v.01')]

In [27]:
# A verb may entail more than one action.
wn.synset('eat.v.01').entailments()

[Synset('chew.v.01'), Synset('swallow.v.01')]

## Antonyms

In [28]:
# Antonyms are looked up on a lemma rather than on a synset, so the query
# starts from wn.lemma(...).
wn.lemma('supply.n.02.supply').antonyms()

[Lemma('demand.n.02.demand')]

In [29]:
# An adjective lemma ('a' in the name); a lemma can have several antonyms.
wn.lemma('horizontal.a.01.horizontal').antonyms()

[Lemma('inclined.a.02.inclined'), Lemma('vertical.a.01.vertical')]

# Semantic Similarities based on WordNet

In [30]:
# The three synsets compared in the similarity cells below.
nickel, dime, money = (
    wn.synset(synset_name)
    for synset_name in ('nickel.n.02', 'dime.n.01', 'money.n.01')
)

In [31]:
# Most specific synset(s) that are hypernyms of both nickel and dime.
nickel.lowest_common_hypernyms(dime)

[Synset('coin.n.01')]

In [32]:
# Most specific synset(s) that are hypernyms of both nickel and money.
nickel.lowest_common_hypernyms(money)

[Synset('medium_of_exchange.n.01')]

In [33]:
# Length of the shortest hypernym path from this synset up to the root;
# a larger depth means a more specific concept.
wn.synset('coin.n.01').min_depth()

8

In [34]:
# Same depth measure for the common hypernym of nickel and money, for
# comparison with coin.n.01.
wn.synset('medium_of_exchange.n.01').min_depth()

5

### Path length similarity

In [35]:
# Score based on the shortest path connecting the two synsets in the
# hypernym hierarchy; a higher score means the senses are closer.
nickel.path_similarity(dime)

0.3333333333333333

In [36]:
# Same measure against money, for comparison with the nickel/dime score.
nickel.path_similarity(money)

0.16666666666666666

### Resnik similarity

In [37]:
from nltk.corpus import wordnet_ic
# Load an information-content (IC) table from the wordnet_ic corpus; the
# IC-based similarity measures below take it as their second argument.
# NOTE(review): the file name suggests counts from the Brown corpus — confirm.
brown_ic = wordnet_ic.ic('ic-brown.dat')

In [38]:
# Resnik similarity: based on the information content of the lowest common
# hypernym of the two synsets, using the supplied IC table.
nickel.res_similarity(dime, brown_ic)

7.455288045755159

In [39]:
# Same measure against money, for comparison with the nickel/dime score.
nickel.res_similarity(money, brown_ic)

6.254931881899411

### Lin similarity

In [40]:
# Lin similarity: 2 * IC(lcs) / (IC(s1) + IC(s2)), where lcs is the lowest
# common hypernym of the two synsets.
nickel.lin_similarity(dime, brown_ic)

0.6073758971525297

In [41]:
# NOTE: the recorded score here is slightly higher than the nickel/dime score,
# unlike the path and Resnik measures. Lin divides by the summed IC of both
# synsets, so this is presumably because money has a low IC in this table.
nickel.lin_similarity(money, brown_ic)

0.6191356626837683

# Co-occurrence Words

In [42]:
from nltk.corpus import brown
# One token sequence per corpus file, in the order fileids() reports them.
docs = list(map(brown.words, brown.fileids()))

In [43]:
import nltk
# Flatten the per-file token sequences into one token stream, preserving
# file order, and wrap it in nltk.Text for the exploration methods below.
text = nltk.Text([
    token
    for file_tokens in docs
    for token in file_tokens
])

In [44]:
# Print occurrences of the word with their surrounding context. The recorded
# output shows capitalised matches for this lowercase query.
text.concordance('republican')

Displaying 25 of 54 matches:
 year hailed the message . Senate Republican Leader Dirksen ( Ill. ) and House
Leader Dirksen ( Ill. ) and House Republican Leader Charles Halleck ( Ind. ) s
umental blunders '' in Cuba . One Republican senator told this correspondent t
R ) of Kentucky , chairman of the Republican National Committee , that the Ken
in both parties -- Democratic and Republican -- should divorce themselves and 
did not mention it when the three Republican gubernatorial candidates spoke at
 is the apparent intention of the Republican Party to campaign on the carcass 
wer retired . Now he's gone , the Republican Party is not going to be able to 
 but there is nothing left of the Republican Party without his leadership '' .
 he was proud to be an Eisenhower Republican `` and proud to have absorbed his
ate that Jones will be chosen the Republican Party's nominee with the largest 
tfield , as state chairman of the Republican Women for Jones Committee . Mrs. 
resident of the Westfie

In [45]:
# Same concordance view for 'democratic', to compare how the two words are used.
text.concordance('democratic')

Displaying 25 of 109 matches:
and chairman of the Miller County Democratic Executive Committee . Davis recei
 conservatives in both parties -- Democratic and Republican -- should divorce 
pokesmen for the town's insurgent Democratic leadership speaking out against t
e a remark by Richard J. Hughes , Democratic gubernatorial candidate , that th
ay by Hughes at a Westfield Young Democratic Club cocktail party at the Scotch
eets , who is a candidate for the Democratic gubernatorial nomination , today 
s County debut in his bid for the Democratic gubernatorial nomination here las
state '' . He promised nearly 200 Democratic county committee members at the m
ughes asked , `` a representative Democratic vote in the primary for a springb
entment welled up yesterday among Democratic district leaders and some county 
 reaction among anti-organization Democratic leaders and in the Liberal party 
 bitter internal fight within the Democratic party that has been going on for 
hree years . The resen

In [46]:
# Contexts shared by both words, printed as <word before>_<word after>.
text.common_contexts(['republican', 'democratic'])

house_leader the_national the_party the_organization's a_or


In [47]:
# Distributional similarity: print other words that occur in the same
# contexts as the given word.
text.similar('republican')

democratic recent `` other first the . are two one new ( wife more
local good college minute thing positive


In [48]:
# Same context-based lookup for 'money'; compare with the embedding-based
# neighbours of 'money' later in the notebook.
text.similar('money')

time it people one children way work life man action water war men
place house , them af day others


# Word Embeddings

Download the "Pre-trained word and phrase vectors" file (GoogleNews-vectors-negative300.bin.gz) from the website below and decompress it to obtain the `.bin` file that the next cell loads.
https://code.google.com/archive/p/word2vec/

In [49]:
import os
from pathlib import Path

from gensim.models import KeyedVectors

# Location of the decompressed GoogleNews vectors.
# Set the WORD2VEC_PATH environment variable to point at your copy; otherwise
# the file is expected in the current user's Downloads folder. This replaces
# an absolute path that only existed on the original author's machine.
path_to_model = os.environ.get(
    'WORD2VEC_PATH',
    str(Path.home() / 'Downloads' / 'GoogleNews-vectors-negative300.bin'),
)

# Fail early with an actionable message instead of an error from deep inside
# the loader.
if not os.path.isfile(path_to_model):
    raise FileNotFoundError(
        'word2vec model not found at {!r}; download it or set '
        'WORD2VEC_PATH to its location'.format(path_to_model)
    )

# binary=True: the file is in the binary word2vec format, not the text format.
model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)

In [50]:
# Nearest neighbours of 'money' in the embedding space, each paired with its
# similarity score.
model.most_similar('money')

[('monies', 0.7165061235427856),
 ('funds', 0.7055202722549438),
 ('moneys', 0.6289056539535522),
 ('dollars', 0.628852367401123),
 ('cash', 0.6151220798492432),
 ('vast_sums', 0.6057385206222534),
 ('fund', 0.5789710283279419),
 ('Money', 0.5733489394187927),
 ('taxpayer_dollars', 0.5693670511245728),
 ('Monies', 0.5586516857147217)]

In [51]:
# The vocabulary is case-sensitive ('Money' is listed as a separate neighbour
# of 'money' in the recorded output), so the capitalised form is queried here.
model.most_similar('Republican')

[('GOP', 0.879696249961853),
 ('Democratic', 0.8598263263702393),
 ('Democrat', 0.8078877329826355),
 ('Republicans', 0.773737907409668),
 ('Democrats', 0.7363811731338501),
 ('Re_publican', 0.678870439529419),
 ('Repub_lican', 0.6606504917144775),
 ('Repubican', 0.6556392908096313),
 ('Republian', 0.6437903642654419),
 ('Repub_licans', 0.6405728459358215)]

In [52]:
# Neighbours of a proper noun; multi-word phrases appear as single tokens
# joined with underscores in the recorded output.
model.most_similar('Clinton')

[('Hillary_Clinton', 0.7631065845489502),
 ('Obama', 0.7526832222938538),
 ('Bill_Clinton', 0.7416832447052002),
 ('Hillary_Rodham_Clinton', 0.7254316806793213),
 ('Sen._Hillary_Clinton', 0.7086110711097717),
 ('Hillary', 0.6970474123954773),
 ('Senator_Hillary_Clinton', 0.6961780786514282),
 ('McCain', 0.6851686835289001),
 ('Clintons', 0.6733236908912659),
 ('Barack_Obama', 0.6713167428970337)]

In [53]:
# Analogy query: vectors in `positive` are added and those in `negative` are
# subtracted, i.e. king - man + woman.
model.most_similar(positive=['king', 'woman'], negative=['man'])

[('queen', 0.7118192315101624),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235946178436279),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087412595748901)]

In [54]:
# Analogy query: uncle - man + woman.
model.most_similar(positive=['uncle', 'woman'], negative=['man'])

[('aunt', 0.802266538143158),
 ('mother', 0.7770731449127197),
 ('niece', 0.7684249877929688),
 ('father', 0.7237852811813354),
 ('grandmother', 0.722037136554718),
 ('daughter', 0.7185647487640381),
 ('sister', 0.7006257772445679),
 ('husband', 0.6982547044754028),
 ('granddaughter', 0.6858305335044861),
 ('nephew', 0.6710714101791382)]

In [55]:
# Analogy query: Tokyo - Japan + China.
model.most_similar(positive=['Tokyo', 'China'], negative=['Japan'])

[('Beijing', 0.8216202259063721),
 ('Shanghai', 0.795141875743866),
 ('Guangzhou', 0.6529653668403625),
 ('Beijng', 0.6465170383453369),
 ('Chinese', 0.6439487934112549),
 ('Shenzhen', 0.6439114809036255),
 ('Hong_Kong', 0.633777916431427),
 ('Taipei', 0.6317877769470215),
 ('Chongqing', 0.6239099502563477),
 ('Hangzhou', 0.6204276084899902)]

In [56]:
# Analogy query: Osaka - Japan + China.
model.most_similar(positive=['Osaka', 'China'], negative=['Japan'])

[('Shanghai', 0.7343893051147461),
 ('Beijing', 0.7120296359062195),
 ('Guangzhou', 0.6772623062133789),
 ('Hangzhou', 0.6352313756942749),
 ('Tianjin', 0.6335029602050781),
 ('Chongqing', 0.6271936297416687),
 ('Shenzhen', 0.625232458114624),
 ('Beijng', 0.6206380724906921),
 ('Tongzhou', 0.619605302810669),
 ('Guo', 0.617931604385376)]