In [1]:
import numpy as np
import re
from sklearn.datasets import fetch_20newsgroups
import nltk

In [2]:
sentences = fetch_20newsgroups(subset='train').data

regex = re.compile('[^a-zA-Z ]')
sentences = [regex.sub('', ' '.join(nltk.word_tokenize(sent.strip().lower()))).split() 
             for sent in sentences] # list of list of words

In [3]:
class Vocab:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}
    
    def index(self, sentences):
        for sent in sentences:
            for word in sent:
                if word not in self.word2index:
                    self.word2index[word] = len(self.word2index)
                    self.index2word[self.word2index[word]] = word
                    
    def get_size(self):
        return len(self.word2index)

In [4]:
vocab = Vocab()
vocab.index(sentences)
vocab.get_size()

121328

In [5]:
# drop not freq words?

In [59]:
class Word2Vec:
    def __init__(self):
        self.sent_i = 0
        self.word_i = 0
        self.window_i = 0
    
    def __initialize(self, window_size, vocab, embed_size):
        if self.window_i == 0:
            print('Initializing weights')
            
            self.vocab = vocab
            self.center_vectors = np.random.randn(vocab.get_size(), embed_size) * 0.03
            self.context_vectors = np.random.randn(vocab.get_size(), embed_size) * 0.03
            
            self.center_vectors = self.center_vectors / np.linalg.norm(self.center_vectors, axis=1).reshape(-1, 1)
            self.context_vectors = self.context_vectors / np.linalg.norm(self.context_vectors, axis=1).reshape(-1, 1)
            
            self.window_i = -window_size
            self.word_i = window_size
        
    def __indexes_update(self, window_size, sent_size, corpus_size):
            self.window_i += 1
            if self.window_i == 0:
                self.window_i = 1
            
            if self.window_i == window_size + 1:
                self.window_i = - window_size
                
                self.word_i += 1
                if self.word_i + window_size == sent_size:
                    self.word_i = window_size
                    self.sent_i += 1
                    if self.sent_i == corpus_size:
                        self.sent_i = 0
    
    def train(self, vocab, sentences,
              embed_size=64, window_size=3,
              learning_rate=0.03, iters=10000, print_every=500):
        self.__initialize(window_size, vocab, embed_size)
        
        i = 0
        while i < iters:
            center_id = vocab.word2index[sentences[self.sent_i][self.word_i]]
            context_id = vocab.word2index[sentences[self.sent_i][self.word_i + self.window_i]]
            
            all_probs = self.context_vectors.dot(self.center_vectors[center_id].reshape(-1, 1))
            
            cur_prob = all_probs[context_id]
            all_probs = np.exp(all_probs)
            all_probs_sum = all_probs.sum()
        
            # gradient calculation
            dLdv = -self.context_vectors[context_id] + np.sum(self.context_vectors, axis=0) / all_probs_sum
            dLdu = -self.center_vectors[center_id] / all_probs_sum
    
            # gradient step
            prev_center = self.center_vectors[center_id].copy()
            self.center_vectors[center_id] -= learning_rate * dLdv / len(sentences)
            self.context_vectors -= learning_rate * dLdu  / len(sentences)
            self.context_vectors[context_id] += prev_center
            
            self.__indexes_update(window_size, len(sentences[self.sent_i]), len(sentences))
              
            if i % print_every == 0:
                loss = -cur_prob + np.log(all_probs_sum)
                print(f'Iteration {i} loss = {round(loss[0], 2)}')
                self.center_vectors = self.center_vectors / np.linalg.norm(self.center_vectors, axis=1).reshape(-1, 1)
                self.context_vectors = self.context_vectors / np.linalg.norm(self.context_vectors, axis=1).reshape(-1, 1)
            i += 1
            
        self.center_vectors = self.center_vectors / np.linalg.norm(self.center_vectors, axis=1).reshape(-1, 1)
        self.context_vectors = self.context_vectors / np.linalg.norm(self.context_vectors, axis=1).reshape(-1, 1)
            
        
    def get_vector(self, word):
        return self.context_vectors[self.vocab.word2index[word]]
    
    def most_closest(self, word, n=10):
        base = self.get_vector(word).reshape(-1, 1)
        
        distances = self.context_vectors.dot(base).flatten()
        
        most_closest_indexes = distances.argsort()[-n:][::-1]
        
        return [(self.vocab.index2word[ind], distances[ind]) for ind in most_closest_indexes]
        

In [60]:
wv = Word2Vec()
wv.train(vocab, sentences, iters=350000, print_every=4000)

Initializing weights
Iteration 0 loss = 11.82
Iteration 4000 loss = 10.3
Iteration 8000 loss = 11.74
Iteration 12000 loss = 11.79
Iteration 16000 loss = 11.54
Iteration 20000 loss = 7.2
Iteration 24000 loss = 9.78
Iteration 28000 loss = 11.99
Iteration 32000 loss = 10.35
Iteration 36000 loss = 11.98
Iteration 40000 loss = 12.01
Iteration 44000 loss = 11.52
Iteration 48000 loss = 11.26
Iteration 52000 loss = 11.62
Iteration 56000 loss = 12.26
Iteration 60000 loss = 10.07
Iteration 64000 loss = 16.42
Iteration 68000 loss = 11.47
Iteration 72000 loss = 11.34
Iteration 76000 loss = 11.94
Iteration 80000 loss = 10.8
Iteration 84000 loss = 11.81
Iteration 88000 loss = 11.69
Iteration 92000 loss = 13.42
Iteration 96000 loss = 10.95
Iteration 100000 loss = 0.0
Iteration 104000 loss = 11.59
Iteration 108000 loss = 11.74
Iteration 112000 loss = 11.79
Iteration 116000 loss = 18.38
Iteration 120000 loss = 11.68
Iteration 124000 loss = 11.17
Iteration 128000 loss = 11.13
Iteration 132000 loss = 12.

KeyboardInterrupt: 

In [66]:
wv.train(vocab, sentences, iters=350000, print_every=4000)

Iteration 0 loss = 11.24
Iteration 4000 loss = 11.52
Iteration 8000 loss = 12.12
Iteration 12000 loss = 15.04
Iteration 16000 loss = 11.68
Iteration 20000 loss = 1.79
Iteration 24000 loss = 12.06
Iteration 28000 loss = 11.79
Iteration 32000 loss = 11.51
Iteration 36000 loss = 11.76
Iteration 40000 loss = 11.5
Iteration 44000 loss = 11.86
Iteration 48000 loss = 9.95




Iteration 52000 loss = inf
Iteration 56000 loss = inf
Iteration 60000 loss = inf
Iteration 64000 loss = inf
Iteration 68000 loss = 31.5
Iteration 72000 loss = 9.17
Iteration 76000 loss = inf
Iteration 80000 loss = inf


KeyboardInterrupt: 

In [67]:
wv.most_closest('man')

[('ax', 902.8268596428297),
 ('max', 69.31361042840933),
 ('a', 9.466874296708912),
 ('q', 6.486250142809),
 ('qqf', 4.568611394063934),
 ('l', 4.3031982365743575),
 ('u', 2.842029344617142),
 ('djax', 2.7611727687806846),
 ('um', 2.6184227591147025),
 ('c', 2.327157297184751)]