In [1]:
import json
import math
import time

import pandas as pd
import plwn
import networkx as nx
from networkx.algorithms.cycles import simple_cycles
from networkx.algorithms.lowest_common_ancestors import lowest_common_ancestor
from networkx.algorithms.minors import contracted_nodes
from tqdm import tqdm


tqdm.pandas()

## Load wordnet to graph

In [2]:
# plwn.download()

In [3]:
wn = plwn.load('./data/default_model')

In [4]:
#wn.to_graphml(out_file='./data/graph_synset.xml', graph_type='synset')

In [5]:
G = nx.read_graphml('./data/graph_synset.xml')

## Describe graph

In [6]:
print(f'Nodes: {len(G.nodes)}')
print(f'Edges: {len(G.edges)}')

Nodes: 340647
Edges: 1438540


## Load SimLex999 dataset

In [7]:
simlex = pd.read_csv('./data/MSimLex999_Polish.txt', sep='\t', header=None)
simlex.columns = ['id', 'word1', 'word2', 'similarity', 'relatedness']
simlex

Unnamed: 0,id,word1,word2,similarity,relatedness
0,1,stary,nowy,0.43,7.29
1,2,bystry,inteligentny,8.86,9.71
2,3,ciężki,trudny,4.86,7.29
3,4,szczęśliwy,radosny,8.14,8.86
4,5,łatwy,męczący,0.43,6.43
...,...,...,...,...,...
994,995,dołączyć,zdobyć,0.43,2.29
995,996,wysyłać,uczestniczyć,0.00,0.86
996,997,zbierać,uczestniczyć,0.00,0.71
997,998,pochłonąć,wycofać,0.00,0.57


## Create word: id json

In [8]:
unique_words = set(list(simlex['word1'].unique()) + list(simlex['word2'].unique()))

len(list(unique_words))

1139

In [9]:
# ids_dict = {}
# for word in tqdm(unique_words):
#     synset_id = None
#     try:
#         synset_id = wn.synset(word, plwn.PoS.noun, 1).to_dict()['id']
#     except plwn.exceptions.SynsetNotFound:
#         pass
#     if not synset_id:
#         try:
#             synset_id = wn.synset(word, plwn.PoS.verb, 1).to_dict()['id']
#         except plwn.exceptions.SynsetNotFound:
#             pass
#     if not synset_id:
#         try:
#             synset_id = wn.synset(word, plwn.PoS.adjective, 1).to_dict()['id']
#         except plwn.exceptions.SynsetNotFound:
#             pass
#     if not synset_id:
#         try:
#             synset_id = wn.synset(word, plwn.PoS.adverb, 1).to_dict()['id']
#         except plwn.exceptions.SynsetNotFound:
#             pass
#     
#     if synset_id:
#         ids_dict[word] = synset_id

In [10]:
#with open('data/ids_dict.json', 'w', encoding='utf8') as f:
#    json.dump(ids_dict, f, ensure_ascii=False)

In [11]:
with open('data/ids_dict.json', 'r', encoding='utf8') as f:
    ids_dict = json.load(f)

In [12]:
len(ids_dict)

1103

## Create subgraph - graph filtering by 'hiponimia'

In [13]:
filtered_edges = [edge for edge in tqdm(list(G.edges)) if edge[2].endswith('hiperonimia')] # if (edge[2].endswith('hiperonimia')) or (edge[2].endswith('hiponimia'))]

100%|██████████| 1438540/1438540 [00:01<00:00, 1299846.18it/s]


In [14]:
subG = nx.DiGraph()
for edge in tqdm(filtered_edges):
    subG.add_edge(edge[0], edge[1], label=edge[2])

100%|██████████| 204667/204667 [00:01<00:00, 147482.55it/s]


In [15]:
print(f'Nodes: {len(subG.nodes)} - part of G: {round(len(subG.nodes)/len(G.nodes), 5)}')
print(f'Edges: {len(subG.edges)} - part of G: {round(len(subG.edges)/len(G.edges), 5)}')

Nodes: 173726 - part of G: 0.50999
Edges: 204667 - part of G: 0.14227


## Removing cycles

In [16]:
cycles = list(simple_cycles(subG))

for cycle in cycles:
    print('\nCYCLE')
    print('ID \t WORD \t\tIN_SIMLEX')
    for node in cycle:
        word = wn.synset_by_id(node).to_dict()['units'][0]['lemma']
        in_simlex = int(node) in list(ids_dict.values())
        print(f'{node} \t {word}  \t{in_simlex}')


CYCLE
ID 	 WORD 		IN_SIMLEX
2497 	 zająć się  	False
2355 	 zrobić  	True
44782 	 zacząć  	False
2373 	 podjąć  	False

CYCLE
ID 	 WORD 		IN_SIMLEX
2367 	 podejmować  	False
2496 	 zajmować się  	False
55305 	 robić  	True


In [17]:
true_node = '2355'
false_nodes = ['2373', '44782', '2497']
for false_node in false_nodes:
    subG = contracted_nodes(subG, true_node, false_node, self_loops=False)

true_node = '55305'
false_nodes = ['2496', '2367']
for false_node in false_nodes:
    subG = contracted_nodes(subG, true_node, false_node, self_loops=False)

In [18]:
len(list(simple_cycles(subG)))

0

## Adding main root node

In [19]:
root_nodes = [k for (k, v) in subG.in_degree() if v == 0]
print(len(root_nodes))

for node in root_nodes:
    subG.add_edge('root', node)

root_nodes = [k for (k, v) in subG.in_degree() if v == 0]
print(len(root_nodes))

4114
1


## Wu and Palmer’s Conceptual Similarity

In [20]:
def wu_palmer(node_1, node_2):
    lso = lowest_common_ancestor(subG, node_1, node_2)

    a = 2 * len(nx.shortest_path(subG, 'root', lso))
    b = len(nx.shortest_path(subG.to_undirected(), node_1, lso)) + len(nx.shortest_path(subG.to_undirected(), node_2, lso)) + a

    return a/b

In [21]:
# TEST
wu_palmer('262143', '1726')

0.8

## Leacock and Chodorow’s Normalized Path Length

In [22]:
nodes_depth = [(node, len(nx.shortest_path(subG, 'root', node))) for node in tqdm(subG.nodes())]
nodes_depth.sort(key=lambda el: el[1])

100%|██████████| 173722/173722 [03:24<00:00, 849.71it/s]


In [23]:
max_depth = nodes_depth[-1][1]

max_depth

28

In [24]:
def lc_normalized_path(node_1, node_2):
    a = len(nx.shortest_path(subG.to_undirected(), node_1, node_2))
    b = 2 * max_depth

    return math.log(a/b) * -1

In [25]:
# TEST
lc_normalized_path('262143', '1726')

3.332204510175204

## Counting similarities and addig to dataframe

In [26]:
words_in_graph = set(ids_dict.keys())
filtered_simlex = simlex[(simlex['word1'].isin(words_in_graph)) & (simlex['word2'].isin(words_in_graph))]

In [27]:
def add_wp(simlex_row):
    node_1 = str(ids_dict[simlex_row['word1']])
    node_2 = str(ids_dict[simlex_row['word2']])

    try:
        simlex_row['wu_palmer'] = wu_palmer(node_1, node_2)
    except Exception as e:
        print(e)
        simlex_row['wu_palmer'] = None

    return simlex_row


def add_lcn(simlex_row):
    node_1 = str(ids_dict[simlex_row['word1']])
    node_2 = str(ids_dict[simlex_row['word2']])

    try:
        simlex_row['leacon'] = lc_normalized_path(node_1, node_2)
    except Exception as e:
        print(e)
        simlex_row['leacon'] = None

    return simlex_row

In [28]:
filtered_simlex = filtered_simlex.progress_apply(add_wp, axis=1)

  0%|          | 2/941 [00:06<47:39,  3.05s/it]The node 49781 is not in the digraph.
  3%|▎         | 27/941 [05:03<2:35:15, 10.19s/it]The node 49781 is not in the digraph.
 16%|█▌        | 148/941 [30:56<2:16:48, 10.35s/it]The node 47608 is not in the digraph.
 18%|█▊        | 165/941 [34:14<2:09:48, 10.04s/it]The node 6007 is not in the digraph.
 19%|█▉        | 182/941 [37:42<2:18:36, 10.96s/it]The node 6007 is not in the digraph.
 20%|█▉        | 184/941 [38:02<2:06:21, 10.02s/it]The node 245559 is not in the digraph.
 20%|██        | 189/941 [39:03<2:14:22, 10.72s/it]The node 7074317 is not in the digraph.
 28%|██▊       | 263/941 [54:37<1:54:28, 10.13s/it]The node 15636 is not in the digraph.
 29%|██▉       | 275/941 [56:57<1:53:06, 10.19s/it]The node 11412 is not in the digraph.
 36%|███▌      | 335/941 [1:08:43<1:40:13,  9.92s/it]The node 1418 is not in the digraph.
 40%|████      | 379/941 [1:17:13<1:31:38,  9.78s/it]The node 417999 is not in the digraph.
 43%|████▎     | 408/

In [29]:
filtered_simlex = filtered_simlex.progress_apply(add_lcn, axis=1)

  0%|          | 2/941 [00:02<16:17,  1.04s/it]Either source 49781 or target 235228 is not in G
  3%|▎         | 27/941 [01:04<37:21,  2.45s/it]Either source 49781 or target 9853 is not in G
 16%|█▌        | 148/941 [05:38<28:54,  2.19s/it]Either source 47608 or target 11641 is not in G
 18%|█▊        | 165/941 [06:16<28:19,  2.19s/it]Either source 65383 or target 6007 is not in G
 19%|█▉        | 182/941 [06:54<28:24,  2.25s/it]Either source 2854 or target 6007 is not in G
 20%|█▉        | 184/941 [06:59<29:36,  2.35s/it]Either source 245559 or target 5544 is not in G
 20%|██        | 189/941 [07:10<26:44,  2.13s/it]Either source 7074317 or target 17524 is not in G
 28%|██▊       | 263/941 [09:56<26:07,  2.31s/it]Either source 351 or target 15636 is not in G
 29%|██▉       | 275/941 [10:23<24:17,  2.19s/it]Either source 11412 or target 43485 is not in G
 36%|███▌      | 335/941 [12:29<20:28,  2.03s/it]Either source 12911 or target 1418 is not in G
 40%|████      | 379/941 [14:02<19:08

In [30]:
filtered_simlex = filtered_simlex.dropna().reset_index(inplace=False)

In [31]:
filtered_simlex.to_csv('out/wordnet_reslts.csv')