In [None]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.nn.functional as F
from collections import Counter
import numpy as np
import scipy
from sklearn.metrics.pairwise import cosine_similarity
import os
from google.colab import drive
# Mount Google Drive and make the notebook folder the working directory, so
# the relative file names used later (corpus, saved checkpoint) resolve there.
drive.mount('/content/drive')
path = "/content/drive/My Drive/Colab Notebooks"
os.chdir(path)
os.listdir(path)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
dtype = torch.FloatTensor
print('will use', device)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
will use cuda


In [None]:
# Read the whole corpus, lower-case it and split it into whitespace tokens.
with open('text8.train.txt') as f:
    text = f.read().lower().split()

# Keep the 9999 most frequent tokens; every remaining occurrence is counted
# under the single 'UNK' entry, which is appended last.
vocab = dict(Counter(text).most_common(9999))
vocab['UNK'] = len(text) - np.sum(list(vocab.values()))
vocab_size = len(vocab)

# build vocabulary: ids follow the insertion order of `vocab`
idx2token = dict(enumerate(vocab))
token2idx = {token: index for index, token in idx2token.items()}

# Relative frequencies raised to the 3/4 power, used as negative-sampling
# weights. They are not renormalised after the exponent, so they no longer
# sum to 1.
word_count = np.array(list(vocab.values()))
word_freqs = (word_count / word_count.sum()) ** 0.75
print(word_freqs[: 3])

[0.12530107 0.08108672 0.06208324]


In [None]:
C = 3  # default context radius: C tokens are taken on each side of the centre word
K = 5  # default number of negative samples drawn per context word

class Word2VecDataset(Data.Dataset):
    """Skip-gram training examples with negative sampling.

    Item ``idx`` is the triple ``(center_word, context_words, neg_words)``:
    the id of token ``idx``, the ids of the ``2 * window_size`` surrounding
    tokens, and ``num_negatives`` sampled ids per context token.

    Args:
        text: iterable of string tokens (the corpus, in order).
        token2idx: mapping token -> integer id; must contain the key 'UNK',
            which is used for tokens missing from the mapping.
        word_freqs: one non-negative sampling weight per vocabulary id.
            The weights do not need to sum to 1.
        window_size: tokens taken on each side of the centre word.
            Defaults to the module-level ``C``.
        num_negatives: negatives drawn per context word.
            Defaults to the module-level ``K``.
    """

    def __init__(self, text, token2idx, word_freqs, window_size=None, num_negatives=None):
        super().__init__()
        unk_idx = token2idx['UNK']
        self.text_encoded = torch.LongTensor([token2idx.get(token, unk_idx) for token in text])
        self.word_freqs = torch.as_tensor(word_freqs, dtype=torch.float)
        self.window_size = C if window_size is None else window_size
        self.num_negatives = K if num_negatives is None else num_negatives

    def __len__(self):
        """Number of centre-word positions, i.e. the corpus length in tokens."""
        return len(self.text_encoded)

    def __getitem__(self, idx):
        """Return ``(center_word, context_words, neg_words)`` for position ``idx``."""
        center_words = self.text_encoded[idx]

        # Context positions wrap around the ends of the corpus (modulo its
        # length), so every item has exactly 2 * window_size context words.
        n_tokens = len(self.text_encoded)
        radius = self.window_size
        context_idx = [(idx + offset) % n_tokens
                       for offset in range(-radius, radius + 1) if offset != 0]
        context_words = self.text_encoded[context_idx]

        # Negatives must not coincide with the true context words. Comparing
        # Python sets of 0-d tensors cannot detect that, because tensors hash
        # by object identity. Instead, zero the context ids' sampling weights
        # and draw once, which excludes them by construction.
        weights = self.word_freqs.clone()
        weights[context_words] = 0.0
        if not bool((weights > 0).any()):
            # Degenerate vocabulary: nothing is left to sample from, so fall
            # back to the unfiltered distribution rather than raising.
            weights = self.word_freqs
        neg_words = torch.multinomial(weights, self.num_negatives * len(context_words), True)

        return center_words, context_words, neg_words

In [None]:
# Wrap the corpus in the skip-gram dataset and serve it in shuffled mini-batches.
batch_size = 256
dataset = Word2VecDataset(text=text, token2idx=token2idx, word_freqs=word_freqs)
data_loader = Data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Peek at the first example: (center word id, context word ids, negative sample ids).
dataset[0]


(tensor(4813),
 tensor([  50, 9999,  393, 3139,   11,    5]),
 tensor([ 787,   19,  140, 7325, 5804, 3759,   38, 1884, 2395, 1284, 3834, 1648,
           16,  947, 9999,    1,  829, 9910, 3639,    0,    2, 6334,   50,  586,
         1091, 1156,  153, 1414, 2747, 3793]))

In [None]:
class Model(nn.Module):
    """Skip-gram model trained with the negative-sampling objective.

    Two embedding tables of shape (vocab_size, embedding_dim) are kept:
    `in_embedding` is looked up for centre words, `out_embedding` for context
    words and negative samples.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.out_embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words, context_words, neg_words):
        """Return the negative-sampling loss, one value per batch row.

        The batched matmul below requires `center_words` to be 1-D (batch,)
        and `context_words` / `neg_words` to be 2-D (batch, n) index tensors.
        """
        # (batch, dim) -> (batch, dim, 1): one column vector per centre word.
        center_col = self.in_embedding(center_words).unsqueeze(2)

        # Dot product of every context / negated negative vector with its
        # centre vector: (batch, n, dim) @ (batch, dim, 1) -> (batch, n).
        pos_scores = torch.bmm(self.out_embedding(context_words), center_col).squeeze(2)
        neg_scores = torch.bmm(-self.out_embedding(neg_words), center_col).squeeze(2)

        # Sum the log-sigmoid scores over each row, then negate to get a loss.
        log_likelihood = F.logsigmoid(pos_scores).sum(1) + F.logsigmoid(neg_scores).sum(1)
        return -log_likelihood

    def center_embedding(self):
        """Return the centre-word embedding table as a NumPy array on the CPU."""
        table = self.in_embedding.weight
        return table.detach().cpu().numpy()

In [None]:
# Build the model on the selected device and attach an Adam optimiser to it.
embedding_dim = 100
model = Model(vocab_size, embedding_dim)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.005)

In [None]:
# One pass over the shuffled data loader; the batch-mean loss is reported
# every 10000 iterations.
for i, batch in enumerate(data_loader):
    center_words, context_words, neg_words = (part.long().to(device) for part in batch)

    optimizer.zero_grad()
    per_example_loss = model(center_words, context_words, neg_words)
    loss = per_example_loss.mean().to(device)
    loss.backward()
    optimizer.step()

    if i % 10000 == 0:
        print('iteration', i, loss.item())

# Save the trained parameters; the file name records the embedding size.
torch.save(model.state_dict(), "embedding-{}.th".format(embedding_dim))

iteration 0 149.3065948486328
iteration 10000 14.635737419128418
iteration 20000 13.880234718322754
iteration 30000 14.170469284057617
iteration 40000 13.659147262573242
iteration 50000 13.598966598510742


In [None]:
# Snapshot the trained centre-word embedding table as a NumPy array; find_nearest reads it.
embedding_weights = model.center_embedding()
def find_nearest(word, k=3):
    """Return the `k` vocabulary tokens whose embeddings are closest to `word`.

    Closeness is cosine distance (1 - cosine similarity) between rows of the
    module-level `embedding_weights`. The query word has distance 0 to itself,
    so it is normally the first entry of the result.

    Args:
        word: token to look up; must be a key of `token2idx`.
        k: number of neighbours to return (default 3).

    Returns:
        List of up to `k` tokens, ordered from nearest to farthest.

    Raises:
        KeyError: if `word` is not in `token2idx`.
    """
    query = embedding_weights[token2idx[word]]

    # Cosine distance to every row in one vectorised pass, instead of one
    # scipy call per vocabulary entry.
    norms = np.linalg.norm(embedding_weights, axis=1) * np.linalg.norm(query)
    # Guard against all-zero rows: they get similarity 0 rather than NaN.
    norms = np.maximum(norms, np.finfo(norms.dtype).tiny)
    cos_dis = 1.0 - (embedding_weights @ query) / norms

    return [idx2token[i] for i in cos_dis.argsort()[:k]]

In [None]:
# Print the nearest neighbours for a few probe words.
query_words = ["two", "america", "computer"]
for word in query_words:
    neighbours = find_nearest(word)
    print(word, neighbours)

two ['two', 'four', 'three']
america ['america', 'africa', 'europe']
computer ['computer', 'computing', 'interface']
