In [1]:
import nltk
from nltk.corpus import wordnet as wn
# Add the project-relative data directory to NLTK's resource search path
# (presumably where the WordNet corpus is stored — confirm).
nltk.data.path.append('../data')  # noqa

# Materialize every noun synset, then order them in place by the character
# length of their definitions, shortest first.
# NOTE(review): the ordering presumably keeps similar-length texts together so
# later batches need less padding — confirm.
all_synsets = list(wn.all_synsets(pos=wn.NOUN))
all_synsets.sort(key=lambda synset: len(synset.definition()))
print(f'Read {len(all_synsets)} synsets.')

Read 82115 synsets.


In [2]:
from tokenization import get_tokenizer, encode
print('Encoding...')
tokenizer = get_tokenizer('sentence-transformers/all-MiniLM-L12-v2')

# Pair every synset with the encoding of the text
# "<name of its first lemma>, <definition>".
# NOTE(review): lemma names may contain underscores for multi-word
# expressions — confirm the tokenizer handles them as intended.
tokenized_synsets = [
    (synset, encode(tokenizer, f'{synset.lemmas()[0].name()}, {synset.definition()}'))
    for synset in all_synsets
]
print('Done.')

Encoding...
Done.


In [7]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

DEVICE = 'cpu'


def cfn(xs):
    """Collate ``(label, token_ids)`` pairs into a single padded batch.

    Args:
        xs: Sequence of ``(label, token_ids)`` pairs. Each ``token_ids`` must
            be something ``torch.tensor`` accepts (e.g. a list of ints).

    Returns:
        A tuple ``(labels, batch)``. ``labels`` is the list of first elements
        in input order. ``batch`` holds one row per pair, with shorter
        sequences padded with 0 up to the longest one, moved to ``DEVICE``.
    """
    labels = [label for label, _ in xs]
    sequences = [torch.tensor(token_ids) for _, token_ids in xs]
    # NOTE(review): padding with 0 assumes the tokenizer's pad id is 0 — confirm.
    batch = pad_sequence(sequences, batch_first=True, padding_value=0)
    return labels, batch.to(DEVICE)

# Iterate the tokenized synsets in their existing order (no shuffling), in
# batches of up to 1024 items collated by `cfn`.
synset_dl = DataLoader(dataset=tokenized_synsets, batch_size=1024, shuffle=False, collate_fn=cfn)

from model_wrappers import SBERT
from tqdm.notebook import tqdm

# Instantiate the encoder, move it to the configured device and call eval().
vectorizer = SBERT().to(DEVICE)
vectorizer.eval()

print('Vectorizing synsets...')
synset_vectors = []
with torch.no_grad():  # gradients are not needed while only running the model
    for batch_synsets, batch_token_ids in tqdm(synset_dl):
        batch_vectors = vectorizer(batch_token_ids).cpu()
        # Pair each synset's name with its entry of the batch output.
        synset_vectors.extend(
            (synset.name(), vector)
            for synset, vector in zip(batch_synsets, batch_vectors)
        )

Vectorizing synsets...




  0%|          | 0/81 [00:00<?, ?it/s][A[A

  1%|          | 1/81 [00:03<04:06,  3.08s/it][A[A

  2%|▏         | 2/81 [00:05<03:44,  2.84s/it][A[A

  4%|▎         | 3/81 [00:08<03:55,  3.02s/it][A[A

  5%|▍         | 4/81 [00:12<04:13,  3.29s/it][A[A

  6%|▌         | 5/81 [00:16<04:32,  3.59s/it][A[A

  7%|▋         | 6/81 [00:20<04:28,  3.58s/it][A[A

  9%|▊         | 7/81 [00:23<04:17,  3.48s/it][A[A

 10%|▉         | 8/81 [00:27<04:18,  3.54s/it][A[A

 11%|█         | 9/81 [00:30<04:09,  3.47s/it][A[A

 12%|█▏        | 10/81 [00:34<04:04,  3.45s/it][A[A

 14%|█▎        | 11/81 [00:37<03:58,  3.40s/it][A[A

 15%|█▍        | 12/81 [00:40<03:59,  3.47s/it][A[A

 16%|█▌        | 13/81 [00:44<04:00,  3.53s/it][A[A

 17%|█▋        | 14/81 [00:47<03:49,  3.42s/it][A[A

 19%|█▊        | 15/81 [00:51<03:44,  3.40s/it][A[A

 20%|█▉        | 16/81 [00:54<03:43,  3.44s/it][A[A

 21%|██        | 17/81 [00:58<03:56,  3.70s/it][A[A

 22%|██▏       | 18/81 [01

In [8]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('../data/tokenized.p', 'rb') as f:
    # The pickled object unpacks into two elements; only the first is used.
    # The second goes to a throwaway name instead of `_`, because a cell-level
    # `_` would shadow IPython's last-output variable.
    nomino_records, _unused = pickle.load(f)

# Each record has three fields; keep only the first two.
tokenized_nominos = [(x, y) for x, y, _ in nomino_records]
print(f'Read {len(tokenized_nominos)} nominos.')

# Iterate the tokenized nominos in their existing order (no shuffling), in
# batches of up to 1024 items collated by `cfn`.
nomino_dl = DataLoader(dataset=tokenized_nominos, batch_size=1024, shuffle=False, collate_fn=cfn)

print('Vectorizing nominos...')
nomino_vectors = []
with torch.no_grad():  # gradients are not needed while only running the model
    for batch_nominos, batch_token_ids in tqdm(nomino_dl):
        batch_vectors = vectorizer(batch_token_ids).cpu()
        # Pair each nomino with its entry of the batch output.
        nomino_vectors.extend(zip(batch_nominos, batch_vectors))

Read 6192 nominos.
Vectorizing nominos...




  0%|          | 0/7 [00:00<?, ?it/s][A[A

 14%|█▍        | 1/7 [00:04<00:26,  4.48s/it][A[A

 29%|██▊       | 2/7 [00:11<00:30,  6.12s/it][A[A

 43%|████▎     | 3/7 [00:17<00:23,  5.88s/it][A[A

 57%|█████▋    | 4/7 [00:21<00:16,  5.38s/it][A[A

 71%|███████▏  | 5/7 [00:27<00:10,  5.42s/it][A[A

 86%|████████▌ | 6/7 [00:31<00:05,  5.05s/it][A[A

100%|██████████| 7/7 [00:31<00:00,  4.57s/it][A[A


In [9]:
# Stack the per-item vectors into two matrices (one row per nomino / synset)
# and take all pairwise dot products: rows index nominos, columns index synsets.
# NOTE(review): this is a plain dot product; it equals cosine similarity only
# if the vectorizer returns unit-length vectors — confirm.
nomino_matrix = torch.stack([vector for _, vector in nomino_vectors])
synset_matrix = torch.stack([vector for _, vector in synset_vectors])
similarities = nomino_matrix @ synset_matrix.t()

# Persist (row labels, column labels, matrix) as a single pickled tuple.
with open('../data/sim_matrix.p', 'wb') as f:
    row_labels = [nomino for nomino, _ in nomino_vectors]
    column_labels = [synset_name for synset_name, _ in synset_vectors]
    pickle.dump((row_labels, column_labels, similarities), f)
