# Document Encoder Model
- based on word2vec embeddings from gensim
- use a simple average of the word embeddings as the document embedding
- use a simple feedforward neural network as the encoder


In [1]:

import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import gensim

In [3]:
# Load the pretrained gensim word2vec model and expose its vectors as a torch tensor.
w2v = gensim.models.Word2Vec.load('./word2vec/word2vec-gensim-text8.model')
# index_to_key lists the vocabulary words; position i is the index used for word i.
vocab = w2v.wv.index_to_key
word_to_ix = {word: i for i, word in enumerate(vocab)}
# wv.vectors already stores one row per entry of index_to_key, so copy the whole
# matrix in one step instead of looking every word up in a Python-level loop.
embeddings_array = np.array(w2v.wv.vectors)
embeddings = torch.tensor(embeddings_array, dtype=torch.float32)
# Row i of the tensor must correspond to vocab[i] for word_to_ix lookups to be valid.
assert embeddings.shape[0] == len(vocab), "embedding rows do not match vocabulary size"
print(embeddings.shape)

torch.Size([100038, 100])


In [4]:
# Wrap the pretrained vectors in a lookup layer; freeze=True asks PyTorch not to
# update these weights during training.
embedding_layer = nn.Embedding.from_pretrained(
    embeddings,
    freeze=True,
)

In [5]:
# Sanity check: fetch the vector for one known word through the embedding layer.
# The index is wrapped in a list so the lookup receives a 1-element batch.
word_index = torch.tensor(
    [word_to_ix['example']],
    dtype=torch.long,
)
embedding = embedding_layer(word_index)
print(embedding)



tensor([[-0.1870, -2.4299, -0.1401, -2.1388,  1.7904,  0.0775,  1.6181, -1.1711,
         -2.6720, -0.4321,  0.0206,  0.2530,  0.8117, -2.0854,  0.1374,  0.1636,
         -0.6244, -1.0051, -2.1977,  0.6167, -0.5720, -1.1711, -0.9346,  1.0351,
         -4.5317, -2.3577,  0.9223,  1.1498,  1.8907,  0.6729, -1.2662,  0.2727,
         -0.7997, -2.3808, -1.1613,  0.3075,  1.9001,  2.0534,  0.1493, -1.9624,
         -0.1432, -0.8249,  0.5607,  0.7156,  2.2120,  0.4811,  0.3781, -0.2218,
         -1.3695,  0.9088,  0.2807, -0.7241,  1.0748,  1.0134, -0.5849, -1.3128,
         -0.2698,  0.4111,  1.8783,  0.3299,  1.4218, -0.3356, -2.0904,  0.5190,
          0.0786, -2.3097,  0.9336,  0.1404,  3.1056,  1.9720, -1.4331, -0.1387,
          0.4657, -2.6394,  0.6567,  0.1776,  0.7422, -1.3936, -1.5554,  1.5661,
          1.1278, -0.8638, -4.5095,  0.2993,  0.1132, -0.9151, -0.7786, -1.7957,
          3.2999, -0.7323, -0.7129, -1.7024, -1.0314, -1.1695,  0.1837,  1.2670,
          0.2361, -0.9295,  