In [97]:
import json
import math
import time

import pandas as pd
import plwn
import networkx as nx
from networkx.algorithms.cycles import simple_cycles
from networkx.algorithms.lowest_common_ancestors import lowest_common_ancestor
from networkx.algorithms.minors import contracted_nodes
from tqdm import tqdm


tqdm.pandas()

## Load wordnet to graph

In [3]:
# plwn.download()

In [6]:
wn = plwn.load('./data/default_model')

In [7]:
#wn.to_graphml(out_file='./data/graph_synset.xml', graph_type='synset')

In [8]:
G = nx.read_graphml('./data/graph_synset.xml')

## Describe graph

In [8]:
print(f'Nodes: {len(G.nodes)}')
print(f'Edges: {len(G.edges)}')

Nodes: 340647
Edges: 1438540


## Load SimLex999 dataset

In [11]:
simlex = pd.read_csv('./data/MSimLex999_Polish.txt', sep='\t', header=None)
simlex.columns = ['id', 'word1', 'word2', 'similarity', 'relatedness']
simlex

Unnamed: 0,id,word1,word2,similarity,relatedness
0,1,stary,nowy,0.43,7.29
1,2,bystry,inteligentny,8.86,9.71
2,3,ciężki,trudny,4.86,7.29
3,4,szczęśliwy,radosny,8.14,8.86
4,5,łatwy,męczący,0.43,6.43
...,...,...,...,...,...
994,995,dołączyć,zdobyć,0.43,2.29
995,996,wysyłać,uczestniczyć,0.00,0.86
996,997,zbierać,uczestniczyć,0.00,0.71
997,998,pochłonąć,wycofać,0.00,0.57


## Create word: id json

In [12]:
unique_words = set(list(simlex['word1'].unique()) + list(simlex['word2'].unique()))

len(list(unique_words))

1139

In [32]:
# ids_dict = {}
# for word in tqdm(unique_words):
#     synset_id = None
#     try:
#         synset_id = wn.synset(word, plwn.PoS.noun, 1).to_dict()['id']
#     except plwn.exceptions.SynsetNotFound:
#         pass
#     if not synset_id:
#         try:
#             synset_id = wn.synset(word, plwn.PoS.verb, 1).to_dict()['id']
#         except plwn.exceptions.SynsetNotFound:
#             pass
#     if not synset_id:
#         try:
#             synset_id = wn.synset(word, plwn.PoS.adjective, 1).to_dict()['id']
#         except plwn.exceptions.SynsetNotFound:
#             pass
#     if not synset_id:
#         try:
#             synset_id = wn.synset(word, plwn.PoS.adverb, 1).to_dict()['id']
#         except plwn.exceptions.SynsetNotFound:
#             pass
#     
#     if synset_id:
#         ids_dict[word] = synset_id

100%|██████████| 1139/1139 [09:01<00:00,  2.10it/s]


In [47]:
#with open('data/ids_dict.json', 'w', encoding='utf8') as f:
#    json.dump(ids_dict, f, ensure_ascii=False)

In [14]:
with open('data/ids_dict.json', 'r', encoding='utf8') as f:
    ids_dict = json.load(f)

In [15]:
len(ids_dict)

1103

## Create subgraph - graph filtering by 'hiponimia'

In [64]:
filtered_edges = [edge for edge in tqdm(list(G.edges)) if edge[2].endswith('hiperonimia')] # if (edge[2].endswith('hiperonimia')) or (edge[2].endswith('hiponimia'))]

100%|██████████| 1438540/1438540 [00:00<00:00, 1720887.95it/s]


In [65]:
subG = nx.DiGraph()
for edge in tqdm(filtered_edges):
    subG.add_edge(edge[0], edge[1], label=edge[2])

100%|██████████| 204667/204667 [00:00<00:00, 218170.04it/s]


In [66]:
print(f'Nodes: {len(subG.nodes)} - part of G: {round(len(subG.nodes)/len(G.nodes), 5)}')
print(f'Edges: {len(subG.edges)} - part of G: {round(len(subG.edges)/len(G.edges), 5)}')

Nodes: 173726 - part of G: 0.50999
Edges: 204667 - part of G: 0.14227


## Removing cycles

In [67]:
cycles = list(simple_cycles(subG))

for cycle in cycles:
    print('\nCYCLE')
    print('ID \t WORD \t\tIN_SIMLEX')
    for node in cycle:
        word = wn.synset_by_id(node).to_dict()['units'][0]['lemma']
        in_simlex = int(node) in list(ids_dict.values())
        print(f'{node} \t {word}  \t{in_simlex}')


CYCLE
ID 	 WORD 		IN_SIMLEX
2373 	 podjąć  	False
2497 	 zająć się  	False
2355 	 zrobić  	True
44782 	 zacząć  	False

CYCLE
ID 	 WORD 		IN_SIMLEX
2496 	 zajmować się  	False
55305 	 robić  	True
2367 	 podejmować  	False


In [70]:
true_node = '2355'
false_nodes = ['2373', '44782', '2497']
for false_node in false_nodes:
    subG = contracted_nodes(subG, true_node, false_node, self_loops=False)

true_node = '55305'
false_nodes = ['2496', '2367']
for false_node in false_nodes:
    subG = contracted_nodes(subG, true_node, false_node, self_loops=False)

In [71]:
len(list(simple_cycles(subG)))

0

## Adding main root node

In [72]:
root_nodes = [k for (k, v) in subG.in_degree() if v == 0]
print(len(root_nodes))

for node in root_nodes:
    subG.add_edge('root', node)

root_nodes = [k for (k, v) in subG.in_degree() if v == 0]
print(len(root_nodes))

4114
1


## Wu and Palmer’s Conceptual Similarity

In [78]:
def wu_palmer(node_1, node_2):
    lso = lowest_common_ancestor(subG, node_1, node_2)

    a = 2 * len(nx.shortest_path(subG, 'root', lso))
    b = len(nx.shortest_path(subG.to_undirected(), node_1, lso)) + len(nx.shortest_path(subG.to_undirected(), node_2, lso)) + a

    return a/b

In [79]:
# TEST
wu_palmer('262143', '1726')

0.8

## Leacock and Chodorow’s Normalized Path Length

In [80]:
nodes_depth = [(node, len(nx.shortest_path(subG, 'root', node))) for node in tqdm(subG.nodes())]
nodes_depth.sort(key=lambda el: el[1])

100%|██████████| 173722/173722 [03:22<00:00, 858.54it/s]


In [82]:
max_depth = nodes_depth[-1][1]

max_depth

28

In [88]:
def lc_normalized_path(node_1, node_2):
    a = len(nx.shortest_path(subG.to_undirected(), node_1, node_2))
    b = 2 * max_depth

    return math.log(a/b) * -1

In [89]:
# TEST
lc_normalized_path('262143', '1726')

3.332204510175204

## Counting similarities and addig to dataframe

In [91]:
words_in_graph = set(ids_dict.keys())
filtered_simlex = simlex[(simlex['word1'].isin(words_in_graph)) & (simlex['word2'].isin(words_in_graph))]

In [92]:
def add_wp(simlex_row):
    node_1 = str(ids_dict[simlex_row['word1']])
    node_2 = str(ids_dict[simlex_row['word2']])

    try:
        simlex_row['wu_palmer'] = wu_palmer(node_1, node_2)
    except Exception as e:
        print(e)
        simlex_row['wu_palmer'] = None

    return simlex_row


def add_lcn(simlex_row):
    node_1 = str(ids_dict[simlex_row['word1']])
    node_2 = str(ids_dict[simlex_row['word2']])

    try:
        simlex_row['leacon'] = lc_normalized_path(node_1, node_2)
    except Exception as e:
        print(e)
        simlex_row['leacon'] = None

    return simlex_row

In [None]:
filtered_simlex = filtered_simlex.progress_apply(add_wp, axis=1)

In [None]:
filtered_simlex = filtered_simlex.progress_apply(add_lcn, axis=1)

In [95]:
filtered_simlex = filtered_simlex.dropna().reset_index(inplace=False)

In [96]:
filtered_simlex.to_csv('out/wordnet_reslts.csv')

In [61]:
lex = wn.lexical_unit('pies', plwn.PoS.noun_pl, 1)
print(lex)
print('\n')
print(lex.definition)
print('\n')
print(lex.related())
print('\n')
print(lex.to_dict())

pies.1(21:zw)


ssak z rodziny psowatych.


()


{'id': 5563, 'lemma': 'pies', 'pos': 'noun', 'variant': 1, 'definition': 'ssak z rodziny psowatych.', 'sense_examples': ('Zmysł powonienia lisów jest dobry, lecz słabszy od węchu innych psów.',), 'sense_examples_sources': ('P',), 'external_links': (), 'usage_notes': ('specj.',), 'domain': 'zwierzęta', 'synset': 5194, 'verb_aspect': None, 'emotion_markedness': None, 'emotion_names': (), 'emotion_valuations': (), 'emotion_example': None, 'emotion_example_secondary': None, 'str': 'pies.1(21:zw)', 'related': {}}


In [62]:
synset = wn.synset('pies', plwn.PoS.noun_pl, 1)
print(synset.to_dict())
print('\n')
print(synset.related_pairs)

{'id': 5194, 'definition': '', 'is_artificial': False, 'units': ({'id': 5563, 'lemma': 'pies', 'pos': 'noun', 'variant': 1, 'definition': 'ssak z rodziny psowatych.', 'sense_examples': ('Zmysł powonienia lisów jest dobry, lecz słabszy od węchu innych psów.',), 'sense_examples_sources': ('P',), 'external_links': (), 'usage_notes': ('specj.',), 'domain': 'zwierzęta', 'synset': 5194, 'verb_aspect': None, 'emotion_markedness': None, 'emotion_names': (), 'emotion_valuations': (), 'emotion_example': None, 'emotion_example_secondary': None, 'str': 'pies.1(21:zw)', 'related': {}},), 'str': '{pies.1(21:zw)}', 'related': {'egzemplarz': ((72555, '{Cerber.1(21:zw)}'),), 'fuzzynimia_synsetów': ((80494, '{dogoterapia.1(2:czy), [+ 1 unit(s)]}'), (7061418, '{kabanos.2(10:jedz), [+ 1 unit(s)]}')), 'hiperonimia': ((256921, '{wilk_indyjski.1(21:zw)}'), (394589, '{pies.4(21:zw)}'), (34120, '{likaon.1(21:zw), [+ 1 unit(s)]}'), (256922, '{wilk_rudy.1(21:zw), [+ 1 unit(s)]}'), (27423, '{kojot.1(21:zw), [+ 2 