In [1]:
from nltk.corpus import wordnet

In [6]:
# One-time setup: uncomment and run once to fetch the WordNet corpus used by the cells below.
# import nltk
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anastasia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

# 1

In [23]:
# All WordNet senses of "cape", each shown next to its gloss.
word = wordnet.synsets('cape')
for sense in word:
    gloss = sense.definition()
    print(sense, gloss)

Synset('cape.n.01') a strip of land projecting into a body of water
Synset('cape.n.02') a sleeveless garment like a cloak but shorter


# 2

In [25]:
# Label the two senses by their position in `word`:
# index 0 is the headland sense, index 1 is the garment sense.
for label, position in (('land:', 0), ('clothes:', 1)):
    sense = word[position]
    print(label, (sense, sense.definition()))

land: (Synset('cape.n.01'), 'a strip of land projecting into a body of water')
clothes: (Synset('cape.n.02'), 'a sleeveless garment like a cloak but shorter')


# 3

In [27]:
from nltk.wsd import lesk

# One example sentence per sense of "cape" (headland vs. garment).
land = 'And this,’ he said, with a sweep of his arm that took in the stark capes and headlands, the fleets of islands and the sea around us, ‘this was my school room.'
clothes = 'He wears a grey t-shirt underneath a plaid flannel shirt and short grey cape , jeans, runners, and a plain black mask.'

# Characters trimmed from both ends of every token: ASCII punctuation plus
# BOTH curly quotes, so an opening quote does not stay glued to a word.
EDGE_PUNCTUATION = '.,‘’'


def tokenize(sentence):
    """Split a sentence into context tokens for Lesk.

    Splits on whitespace, trims edge punctuation from each piece and drops
    pieces that become empty (e.g. the free-standing "," in "cape ,").
    """
    trimmed = (piece.strip(EDGE_PUNCTUATION) for piece in sentence.split())
    return [token for token in trimmed if token]


land_tokens = tokenize(land)
clothes_tokens = tokenize(clothes)

# NOTE(review): Lesk is given only a bag of context tokens, so with sentences
# this short the chosen sense may hinge on very few overlapping words —
# confirm the scores before treating the result as robust.
print(lesk(land_tokens, 'cape').definition())
print(lesk(clothes_tokens, 'cape').definition())

a strip of land projecting into a body of water
a sleeveless garment like a cloak but shorter


Алгоритм правильно распознал оба значения слова.

# 4

In [28]:
# Direct hypernyms of the headland sense of "cape".
for parent in word[0].hypernyms():
    print(parent, parent.definition())

Synset('land.n.04') the solid part of the earth's surface


In [29]:
# Direct hypernyms of the garment sense of "cape".
for parent in word[1].hypernyms():
    print(parent, parent.definition())

Synset('cloak.n.02') a loose outer garment


# 5

In [50]:
# Synsets of the two comparison lemmas. Note: this rebinds `land` and `clothes`,
# which held the example sentences in the Lesk cell above.
land, clothes = wordnet.synsets('land'), wordnet.synsets('clothes')

def get_dist_sim(ss1, lexeme):
    """Collect path distances and path similarities from ``ss1`` to a lexeme.

    Args:
        ss1: the reference synset.
        lexeme: iterable of synsets to compare against ``ss1``.

    Returns:
        A ``(distances, similarities)`` tuple of two parallel lists, in the
        order the synsets appear in ``lexeme``. Synsets for which
        ``shortest_path_distance`` returns ``None`` are skipped, so positions
        in these lists do NOT necessarily match positions in ``lexeme``.
    """
    measured = []
    for candidate in lexeme:
        hops = ss1.shortest_path_distance(candidate)
        if hops is None:
            # No path between the two synsets: leave it out of both lists.
            continue
        measured.append((hops, ss1.path_similarity(candidate)))
    distances = [hops for hops, _ in measured]
    similarities = [score for _, score in measured]
    return distances, similarities

def closest_synset(sense, lexeme):
    """Return ``(distance, synset)`` for the synset in ``lexeme`` nearest to ``sense``.

    The distance and the synset it was measured to are kept as a pair, so the
    reported synset is always the right one. (Indexing ``lexeme`` with a
    position taken from ``get_dist_sim``'s distance list is unsafe: that list
    omits unreachable synsets, which shifts positions.)

    Synsets with no connecting path (``shortest_path_distance`` returns
    ``None``) are ignored. On ties the first synset in ``lexeme`` wins.

    Raises:
        ValueError: if no synset in ``lexeme`` is reachable from ``sense``.
    """
    reachable = []
    for candidate in lexeme:
        distance = sense.shortest_path_distance(candidate)
        if distance is not None:
            reachable.append((distance, candidate))
    if not reachable:
        raise ValueError('no synset in the lexeme is connected to {}'.format(sense))
    return min(reachable, key=lambda pair: pair[0])


# (label, synset) for the two senses of "cape" and (label, synsets) per target lemma.
cape_senses = [('cape: "мыс"', word[0]), ('cape: "кейп"', word[1])]
targets = [('land', land), ('clothes', clothes)]

# Minimal distance per (sense label, target label), reused for the summary lines.
minima = {}
for sense_label, sense in cape_senses:
    for target_label, lexeme in targets:
        distance, nearest = closest_synset(sense, lexeme)
        minima[sense_label, target_label] = distance
        print('min d({}, {}): {}'.format(sense_label, target_label, distance))
        print('closest lemma definition: {}\n'.format(nearest.definition()))

for sense_label, _ in cape_senses:
    overall = min(minima[sense_label, 'land'], minima[sense_label, 'clothes'])
    print('min (d({0}, land), d({0}, clothes)): {1}'.format(sense_label, overall))

min d(cape: "мыс", land): 1
closest lemma definition: the solid part of the earth's surface

min d(cape: "мыс", clothes): 7
closest lemma definition: clothing in general

min d(cape: "кейп", land): 9
closest lemma definition: material in the top layer of the surface of the earth in which plants can grow (especially with reference to its quality or use)

min d(cape: "кейп", clothes): 5
closest lemma definition: clothing in general

min (d(cape: "мыс", land), d(cape: "мыс", clothes)): 1
min (d(cape: "кейп", land), d(cape: "кейп", clothes)): 5


- Кратчайшее расстояние от мыса до land - 1; ближайший синсет land ("the solid part of the earth's surface") - это гипероним cape в значении "мыс"
- Кратчайшее расстояние от мыса до clothes - 7
- Кратчайшее расстояние от кейпа до land - 9
- Кратчайшее расстояние от кейпа до clothes - 5 
- Ближайший к мысу и ближайший к кейпу синсеты clothes имеют одно и то же определение ("clothing in general")
- Мыс ближе к land, кейп ближе к clothes (что логично)

# 6

Вычислить двумя разными способами расстояние: d(cape: "мыс", island) и d(organism, whole).

Leacock-Chodorow Similarity & Jiang-Conrath Similarity

In [52]:
# Senses of "island", the comparison word for the headland sense of "cape".
island = wordnet.synsets('island')
for sense in island:
    gloss = sense.definition()
    print(sense, gloss)

Synset('island.n.01') a land mass (smaller than a continent) that is surrounded by water
Synset('island.n.02') a zone or area resembling an island


In [57]:
# One-time setup: uncomment and run once to fetch the information-content files
# (the source of 'ic-brown.dat'). Requires `import nltk`, which is itself
# commented out in the first setup cell.
# nltk.download('wordnet_ic')

[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\Anastasia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet_ic.zip.


True

In [58]:
from nltk.corpus import wordnet_ic
# Information-content table read from the 'ic-brown.dat' resource; it is passed
# as `ic=` to the jcn_similarity calls in the cells below.
ic = wordnet_ic.ic('ic-brown.dat')

In [60]:
# Compare the headland sense of "cape" with the first sense of "island"
# using a path-based measure and an information-content-based one.
cape_sense, island_sense = word[0], island[0]
print('Leacock-Chodorow Similarity', cape_sense.lch_similarity(island_sense))
print('Jiang-Conrath Similarity', cape_sense.jcn_similarity(island_sense, ic=ic))

Leacock-Chodorow Similarity 2.538973871058276
Jiang-Conrath Similarity 0.20605589196622942


In [63]:
def show_senses(synsets):
    """Print each synset twice: once with its definition, once with its examples."""
    for sense in synsets:
        print(sense, sense.definition())
        print(sense, sense.examples())


organism = wordnet.synsets('organism')
show_senses(organism)
print()
# Only noun senses of "whole" are requested.
whole = wordnet.synsets('whole', 'n')
show_senses(whole)

Synset('organism.n.01') a living thing that has (or can develop) the ability to act or function independently
Synset('organism.n.01') []
Synset('organism.n.02') a system considered analogous in structure or function to a living body
Synset('organism.n.02') ['the social organism']

Synset('whole.n.01') all of something including all its component elements or parts
Synset('whole.n.01') ['Europe considered as a whole', 'the whole of American literature']
Synset('whole.n.02') an assemblage of parts that is regarded as a single entity
Synset('whole.n.02') ['how big is that part compared to the whole?', 'the team is a unit']


In [67]:
# Compare every sense of "organism" with every noun sense of "whole"
# using both similarity measures. The label previously read "orginism" (typo).
for organism_sense in organism:
    print('"organism" definition: ', organism_sense.definition())
    print()
    for whole_sense in whole:
        print('"whole" definition: ', whole_sense.definition())
        print()
        print('Leacock-Chodorow Similarity: ', organism_sense.lch_similarity(whole_sense))
        print()
        print('Jiang-Conrath Similarity: ', organism_sense.jcn_similarity(whole_sense, ic=ic))

"orginism" definition:  a living thing that has (or can develop) the ability to act or function independently

"whole" definition:  all of something including all its component elements or parts

Leacock-Chodorow Similarity:  1.072636802264849

Jiang-Conrath Similarity:  0.11927453545173074
"whole" definition:  an assemblage of parts that is regarded as a single entity

Leacock-Chodorow Similarity:  2.538973871058276

Jiang-Conrath Similarity:  1.4444255924181877
"orginism" definition:  a system considered analogous in structure or function to a living body

"whole" definition:  all of something including all its component elements or parts

Leacock-Chodorow Similarity:  1.3350010667323402

Jiang-Conrath Similarity:  0.05928363701403374
"whole" definition:  an assemblage of parts that is regarded as a single entity

Leacock-Chodorow Similarity:  1.55814461804655

Jiang-Conrath Similarity:  0.07444639262208605


- Обе метрики измеряют сходство: чем больше значение, тем ближе синсеты. По Leacock-Chodorow Similarity "мыс" похож на "остров" (2.54) не меньше, чем "организм" на "целое" в любой паре значений (от 1.07 до 2.54)
- С метрикой Jiang-Conrath Similarity примерно такая же картина ("мыс" и "остров": 0.21 против 0.06–0.12), за исключением пары "организм" в значении "a living thing..." и "целое" в значении "an assemblage of parts...", где сходство заметно выше (1.44)
- Слово "остров" не включает в себя "мыс", а только близко к нему по значению; вероятно, высокое сходство этой пары связано с тем, что оба синсета расположены близко друг к другу в иерархии WordNet