In [1]:
import torch
from torch.utils.data import DataLoader

import time
import pickle
import os
import logging

from process_data import process_data
from model_fn import CBOWHierSoftmax
from input_fn import CBOWBibleDataset
from utils import set_logger

print(torch.__version__)

1.1.0


In [18]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np

In [31]:
# WARNING! trashy code inside: skip it to the next cells

try:
    huffman_corpus, node_inxs, turns_inxs, leaves_hash_inversed, \
        vocab_size, extended_vocab_size = \
        pickle.load(open('/tmp/bible_dataset.pkl', 'rb'))
except FileNotFoundError:
    out = process_data('../', 5)
    # print(out)
    pickle.dump(out, open('/tmp/bible_dataset.pkl', 'wb'))
    huffman_corpus, node_inxs, turns_inxs, leaves_hash_inversed, \
        vocab_size, extended_vocab_size = out

nodes_count = extended_vocab_size

# with torch.cuda.device(device):
# with torch.
device = torch.device("cpu")
device = torch.device("cuda:0")
# device = None

batch_size = 128  # 1024*8
log_freq   = 100*8*2
lr = 0.1


st = time.time()
# with torch.cuda.device(device):
cbow_dataset = CBOWBibleDataset(huffman_corpus, node_inxs, turns_inxs,
                                vocab_size=nodes_count,
                                window_size=10,
                                device=None)
data_len = cbow_dataset.__len__()
n_steps  = (data_len - 1) // batch_size
cbow_loader = DataLoader(cbow_dataset, batch_size=batch_size,
                         shuffle=False, num_workers=12)

# loss = torch.mean(-1 * torch.log(cbow_out))
losses = []
model = CBOWHierSoftmax(nodes_count, 200)
model.cuda(0)


path = '/home/d3/study-projects/really_new/doc2vec/pytorch-word2vec/ckpt-lambda-scheduler/e199-lr0.001-loss5.236-w2vec-bible.ckpt.tar'
path1 = 'ckpt/e199-lr0.004-loss5.187-w2vec-bible.ckpt.tar'

torch_loaded = torch.load(path1)
    
print(torch_loaded['model_state_dict'])
model.load_state_dict(torch_loaded['model_state_dict'])
# model.load_state_dict(torch_loaded['model_state_dict']['embeddings.weight'])
print(model.embeddings.weight[0])

OrderedDict([('embeddings.weight', tensor([[-0.0799, -0.0462,  0.0638,  ...,  0.0286, -0.0208,  0.0190],
        [-0.0036, -0.0701, -0.0067,  ...,  0.0657,  0.0238, -0.0051],
        [ 0.0782,  0.0749, -0.0496,  ..., -0.0327, -0.1115, -0.0365],
        ...,
        [-0.0634, -0.0418, -0.0063,  ...,  0.0294, -0.0104,  0.0099],
        [-0.0558, -0.1040, -0.0373,  ..., -0.0392, -0.0563,  0.0774],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0'))])
tensor([-7.9949e-02, -4.6223e-02,  6.3770e-02,  2.4370e-02, -3.1488e-02,
        -1.9627e-02, -1.3042e-01, -2.3389e-02,  8.9069e-02, -3.5825e-02,
        -7.1607e-02, -1.2580e-02, -6.0275e-02,  8.1855e-03, -4.0679e-02,
        -1.4761e-01, -3.0337e-02, -3.0497e-02, -3.9316e-02, -6.6072e-02,
        -1.4983e-01, -7.6077e-02, -6.2963e-02, -1.8337e-02, -3.2213e-03,
        -1.6931e-01, -7.7741e-03,  5.0829e-02,  1.2319e-02, -7.4092e-02,
        -9.7028e-02,  1.6195e-02,  7.9325e-02, -1.1648e-01,  1.325

In [32]:
leaves_hash = {v:k for k, v in leaves_hash_inversed.items()}

In [33]:
leaves_hash['jesus']

5023

In [34]:
embedding_weights = model.embeddings.weight.detach().cpu().numpy()

In [35]:
embedding_weights.shape

(10670, 200)

In [36]:
word2vec = embedding_weights[:vocab_size]
word2vec.shape

(5335, 200)

# Helpers

In [37]:
def get_embeddings_index_of_word(word, leaves_hash=leaves_hash):
    huffman_inx = leaves_hash[word]
    return huffman_inx

def get_embedding_by_word(word, word2vec=word2vec):
    return word2vec[get_embeddings_index_of_word(word)]

# Nearest Neighbours

In [38]:
nbrs = NearestNeighbors(n_neighbors=25, algorithm='auto',
                        metric='minkowski', p=2).fit(word2vec)
%time distances, indices = nbrs.kneighbors(word2vec)
# distances, indices = nbrs.kneighbors()

CPU times: user 7.55 s, sys: 72 µs, total: 7.55 s
Wall time: 7.55 s


In [39]:
def print_nn_neighbours(w, indices=indices, distances=distances):
    print(f'word: {w}\nneighbours: ')
    inx = get_embeddings_index_of_word(w)
    for w_inx, dist in zip(indices[inx], distances[inx]):
        print('{} - {}'.format(get_word_by_embedding_index(w_inx), round(dist, 2)))

In [40]:
print_nn_neighbours('jesus')

word: jesus
neighbours: 
jesus - 0.0
pilate - 0.82
peter - 0.84
elias - 0.86
job - 0.86
elisha - 0.87
caesar - 0.88
samson - 0.89
elijah - 0.89
nazareth - 0.9
baptized - 0.9
others - 0.91
herod - 0.91
crucified - 0.91
manoah - 0.91
thank - 0.91
already - 0.91
answering - 0.91
past - 0.91
disciple - 0.91
eli - 0.92
parable - 0.92
philip - 0.92
baptism - 0.92
barabbas - 0.92


# Cos sim

In [41]:
cos_sim_matrix = cosine_similarity(word2vec, word2vec)

In [42]:
def get_word_by_embedding_index(inx, leaves_hash_inversed=leaves_hash_inversed):
    word = leaves_hash_inversed[inx]
    return word

In [43]:
def print_neighbours_cosine(w, cos_sim_matrix=cos_sim_matrix, topn=10):
    print(f'word: {w}\nneighbours: ')
    inx = get_embeddings_index_of_word(w)
        
    word_row_dists = cos_sim_matrix[inx]
    neighbours = np.argsort(-1 * word_row_dists)[:topn]
    for n in neighbours:
        print('{} - {:<3}'.format(get_word_by_embedding_index(n), round(word_row_dists[n], 2)))

In [44]:
print_neighbours_cosine('jesus', topn=25)

word: jesus
neighbours: 
jesus - 1.0
peter - 0.6499999761581421
pilate - 0.5799999833106995
christ - 0.5600000023841858
paul - 0.5400000214576721
certain - 0.5400000214576721
john - 0.5400000214576721
elias - 0.5199999809265137
job - 0.5
nathanael - 0.5
lazarus - 0.5
elisha - 0.49000000953674316
barabbas - 0.49000000953674316
believed - 0.49000000953674316
privately - 0.47999998927116394
caesar - 0.47999998927116394
elijah - 0.4699999988079071
grace - 0.4699999988079071
samson - 0.46000000834465027
abraham - 0.46000000834465027
question - 0.46000000834465027
baptized - 0.44999998807907104
nazareth - 0.44999998807907104
festus - 0.44999998807907104
manoah - 0.44999998807907104


In [30]:
print_neighbours_cosine('god', topn=25)

word: god
neighbours: 
god - 1.0
salvation - 0.6399999856948853
hosts - 0.6200000047683716
hope - 0.6100000143051147
grace - 0.6000000238418579
word - 0.5699999928474426
goodness - 0.5699999928474426
glory - 0.5600000023841858
righteousness - 0.5299999713897705
fear - 0.5199999809265137
christ - 0.5199999809265137
truth - 0.5099999904632568
prayer - 0.5099999904632568
care - 0.49000000953674316
lovingkindness - 0.47999998927116394
lord - 0.47999998927116394
chosen - 0.47999998927116394
chastisement - 0.47999998927116394
covenant - 0.47999998927116394
faith - 0.4699999988079071
enquire - 0.4699999988079071
request - 0.4699999988079071
power - 0.4699999988079071
am - 0.46000000834465027
delight - 0.46000000834465027


In [46]:
from gensim.models import word2vec

In [47]:
word2vec.Word2Vec?