# Embedding Introspection

Gather summary statistics (shape, max, min, mean, standard deviation) for both the text embeddings and the graph embeddings.

In [1]:
import ryn
from ryn.text import data
from ryn.text import mapper
from ryn.embers import keen

import torch
import transformers as tf

# Names that select which artefacts are loaded below.
dataset = 'oke.fb15k237_30061990_50'
kgc_model_name = 'DistMult-256-2020.08.12.120540.777006'
text_model_name = 'bert-base-cased'
text_dataset_name = f'{text_model_name}.200.768-small'

# Graph embedding model, read from EMBER_DIR/<dataset>/<kgc_model_name>.
kgc_model = keen.Model.from_path(
    ryn.ENV.EMBER_DIR / dataset / kgc_model_name
)

# Text dataset, read from TEXT_DIR/data/<dataset>/<text_dataset_name>.
text_dataset = data.Dataset.load(
    path=ryn.ENV.TEXT_DIR / 'data' / dataset / text_dataset_name,
)

# The tokenizer and the pretrained BERT encoder are both selected by
# `text_model_name`; encoder weights are cached below CACHE_DIR.
text_tokenizer = data.Tokenizer(model=text_model_name)
text_encoder = tf.BertModel.from_pretrained(
    text_model_name,
    cache_dir=ryn.ENV.CACHE_DIR / 'lib.transformers',
)



In [3]:
# Entity embedding weight matrix of the loaded graph embedding model.
E = kgc_model.keen.entity_embeddings.weight

# Report the shape, then the extrema, mean and standard deviation taken
# over all entries of the matrix.
print(f'shape: {E.shape}')
print(
    f'max: {E.max():2.3f} min: {E.min():2.3f}, '
    f'avg: {E.mean():2.3f}, std: {E.std():2.3f}'
)

shape: torch.Size([9679, 256])
max: 0.478 min: -0.442, avg: -0.001, std: 0.062


In [4]:
import torch.utils.data as torch_data

# Wrap the `cw_train` part of the text dataset so that a torch DataLoader
# can batch it using the mapper's own collate function.
text_mapped_dataset = mapper.Dataset(part=text_dataset.cw_train)
text_dataloader = torch_data.DataLoader(
    text_mapped_dataset,
    batch_size=100,
    collate_fn=mapper.collate_fn,
)

In [6]:
# Pull a single batch from the dataloader.
it = iter(text_dataloader)
sentences, entities = next(it)

# Attend to every position whose token id is positive or equals the
# [MASK] id. Presumably id 0 is the padding token -- confirm against the
# tokenizer vocabulary.
mask = text_tokenizer.base.vocab['[MASK]']
attention_mask = (sentences > 0) | (sentences == mask)
attention_mask = attention_mask.to(dtype=torch.long)

# The encoder output is only inspected, never back-propagated through, so
# run the forward pass without autograd; this avoids keeping the whole
# computation graph in memory. Y is the first element of the encoder's
# output.
with torch.no_grad():
    Y = text_encoder(input_ids=sentences, attention_mask=attention_mask)[0]

In [7]:
# Report the shape, then the extrema, mean and standard deviation of the
# encoded batch.
# NOTE(review): the statistics are taken over every position of Y,
# including any positions that the attention mask excluded -- confirm
# that this is intended.
print(f'shape: {Y.shape}')
print(
    f'max: {Y.max():2.3f} min: {Y.min():2.3f}, '
    f'avg: {Y.mean():2.3f}, std: {Y.std():2.3f}'
)

shape: torch.Size([100, 207, 768])
max: 7.630 min: -9.840, avg: -0.002, std: 0.462
