In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the table of direct synset relations (one row per relation) and show it.
data = pd.read_csv(filepath_or_buffer='data/direct_relations.csv')
data

Unnamed: 0,Definição_Synset,ID_Synset,Definição_Relacionada,ID_Relacionada,Relacao
0,an entity that has physical existence,physical_entity.n.01,that which is perceived or known or inferred t...,entity.n.01,Hypernyms
1,a general concept formed by extracting common ...,abstraction.n.06,that which is perceived or known or inferred t...,entity.n.01,Hypernyms
2,a separate and self-contained entity,thing.n.12,an entity that has physical existence,physical_entity.n.01,Hypernyms
3,a tangible and visible entity; an entity that ...,object.n.01,an entity that has physical existence,physical_entity.n.01,Hypernyms
4,an assemblage of parts that is regarded as a s...,whole.n.02,a tangible and visible entity; an entity that ...,object.n.01,Hypernyms
...,...,...,...,...,...
111271,become empty of water,run_dry.v.01,become dry or drier,dry.v.02,Hypernyms
111272,get foggy,fog_up.v.01,make overcast or cloudy,overcast.v.01,Hypernyms
111273,burn to charcoal,char.v.01,cause to burn or combust,burn.v.05,Hypernyms
111274,"become hazy, dull, or cloudy",haze.v.01,make overcast or cloudy,overcast.v.01,Hypernyms


In [3]:
# Count how many rows carry each relation type.
data['Relacao'].value_counts()

Relacao
Hypernyms    89089
Holonyms     22187
Name: count, dtype: int64

In [3]:
# Every synset id that occurs on either side of a relation, de-duplicated in
# order of first appearance.
entities = pd.unique(pd.concat([data['ID_Synset'], data['ID_Relacionada']]))
print(len(entities), entities)

# Synset id -> integer node index (position of the id inside `entities`).
mapping = dict(zip(entities, range(len(entities))))

91294 ['physical_entity.n.01' 'abstraction.n.06' 'thing.n.12' ... 'burn.v.05'
 'absorb.v.06' 'blow.v.02']


In [4]:
# Collect one definition per synset id, ordered by the id's first appearance
# (source column first, then related column). This is the same ordering that
# `pd.concat([ID_Synset, ID_Relacionada]).unique()` produces, so
# definitions[i] is the text of the i-th unique synset id.
#
# De-duplicating on the definition text alone (the previous approach) collapses
# distinct synsets that happen to share a gloss, which shifts every later
# position and breaks the correspondence between definitions and node indices.
definitions = (
    pd.concat(
        [
            data[['ID_Synset', 'Definição_Synset']].rename(
                columns={'ID_Synset': 'id', 'Definição_Synset': 'definition'}
            ),
            data[['ID_Relacionada', 'Definição_Relacionada']].rename(
                columns={'ID_Relacionada': 'id', 'Definição_Relacionada': 'definition'}
            ),
        ],
        ignore_index=True,
    )
    .drop_duplicates(subset='id', keep='first')['definition']
    .to_numpy()
)
definitions

array(['an entity that has physical existence',
       'a general concept formed by extracting common features from specific examples',
       'a separate and self-contained entity', ...,
       'cause to burn or combust', 'suck or take up or in',
       'be blowing or storming'], dtype=object)

In [5]:
# Translate both endpoint columns from synset ids to integer node indices.
edge_index_i, edge_index_j = (
    data[column].map(mapping).to_numpy()
    for column in ('ID_Synset', 'ID_Relacionada')
)

# Row 0 holds the source node of each relation, row 1 the related node.
edge_index = np.stack((edge_index_i, edge_index_j), axis=0)
edge_index.shape

(2, 111276)

In [6]:
# Integer code for each relation label found in the `Relacao` column.
edge2id = dict(Hypernyms=0, Holonyms=1)

# One relation code per row of `data`.
relation_codes = data['Relacao'].map(edge2id)
edge_attr = relation_codes.to_numpy()
del relation_codes  # keep the notebook namespace unchanged
edge_attr.shape

(111276,)

### Encoding definitions with BERT

In [7]:
import torch
from transformers import BertTokenizer, BertConfig, BertModel

In [8]:
# Show the definition strings that are tokenized in the following cells.
definitions

array(['an entity that has physical existence',
       'a general concept formed by extracting common features from specific examples',
       'a separate and self-contained entity', ...,
       'cause to burn or combust', 'suck or take up or in',
       'be blowing or storming'], dtype=object)

In [8]:
# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [29]:
# Tokenize every definition in one batch, padding to a common length and
# returning PyTorch tensors.
tokens = tokenizer(
    definitions.tolist(),
    padding=True,
    return_tensors='pt',
)

In [30]:
# Inspect the tokenizer output (input_ids, token_type_ids, attention_mask).
tokens

{'input_ids': tensor([[  101,  2019,  9178,  ...,     0,     0,     0],
        [  101,  1037,  2236,  ...,     0,     0,     0],
        [  101,  1037,  3584,  ...,     0,     0,     0],
        ...,
        [  101,  3426,  2000,  ...,     0,     0,     0],
        [  101, 11891,  2030,  ...,     0,     0,     0],
        [  101,  2022, 11221,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [31]:
# Pull the three tensors produced by the tokenizer out into individual names.
token_ids, token_mask, token_type_ids = (
    tokens['input_ids'],
    tokens['attention_mask'],
    tokens['token_type_ids'],
)

In [36]:
# Encoder weights from the same checkpoint as the tokenizer.
model = BertModel.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [45]:

# Encode only the first 500 definitions, with gradient tracking disabled.
with torch.no_grad():
    encoded_defs = model(
        input_ids=token_ids[:500],
        attention_mask=token_mask[:500],
        token_type_ids=token_type_ids[:500],
    )

In [48]:
# Per-token hidden states from the encoder output.
ed = encoded_defs['last_hidden_state']

In [49]:
# Dimensions of the hidden-state tensor.
ed.size()

torch.Size([500, 138, 768])

## PyG's WordNet18RR dataset

In [4]:
import torch 
from torch_geometric.datasets import WordNet18RR

# Load PyG's WordNet18RR dataset under data/wordnet18rr.
# NOTE(review): force_reload=True is hard-coded, so the cached copy is never
# reused on a re-run — confirm this is intended.
wordnet = WordNet18RR(
    root='data/wordnet18rr',
    force_reload=True,
)

Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/WN18RR/original/train.txt
Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/WN18RR/original/valid.txt
Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/WN18RR/original/test.txt
Processing...
Done!


In [5]:
# Graph connectivity and the relation type of every edge.
edge_index, edge_type = wordnet.edge_index, wordnet.edge_type

# Per-edge masks for the train / validation / test splits.
train_mask = wordnet.train_mask
val_mask = wordnet.val_mask
test_mask = wordnet.test_mask

In [47]:
# Show the shape and the values of the per-edge relation-type tensor.
edge_type.shape, edge_type

(torch.Size([93003]), tensor([ 3,  9, 10,  ...,  3,  3,  3]))

In [6]:
# Inspect the Data object stored at index 0 of the dataset.
wordnet.get(0)

Data(edge_index=[2, 93003], edge_type=[93003], train_mask=[93003], val_mask=[93003], test_mask=[93003], num_nodes=40943)

In [40]:
# Relation-name -> integer id table exposed by the dataset.
# NOTE(review): presumably these ids are the values stored in edge_type — confirm.
wordnet.edge2id

{'_also_see': 0,
 '_derivationally_related_form': 1,
 '_has_part': 2,
 '_hypernym': 3,
 '_instance_hypernym': 4,
 '_member_meronym': 5,
 '_member_of_domain_region': 6,
 '_member_of_domain_usage': 7,
 '_similar_to': 8,
 '_synset_domain_topic_of': 9,
 '_verb_group': 10}