In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
import nltk

In [2]:
sentences = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')).data

regex = re.compile('[^a-zA-Zа-яА-Я ]')
sentences = [regex.sub('', ' '.join(nltk.word_tokenize(sent.strip().lower()))).split() 
             for sent in sentences] # list of list of words

sentences = [sent for sent in sentences if len(sent) > 10]

np.random.shuffle(sentences)

In [3]:
class Vocab:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}
    
    def index(self, sentences):
        word2cnt = {}
        for sent in sentences:
            for word in sent:
                if word not in self.word2index:
                    self.word2index[word] = len(self.word2index)
                    self.index2word[self.word2index[word]] = word
                    word2cnt[word] = 0
                
                word2cnt[word] += 1
                
        self.distribution = np.array([word2cnt[self.index2word[i]] for i in range(len(word2cnt))])
        self.distribution = self.distribution / self.distribution.sum()
        self.distribution = self.distribution ** 0.75
        self.distribution = self.distribution / self.distribution.sum()
                    
    def get_distribution(self):
        return self.distribution
                    
    def get_size(self):
        return len(self.word2index)

In [4]:
vocab = Vocab()
vocab.index(sentences)
vocab.get_size()

127314

In [6]:
class Word2Vec:
    def __init__(self):
        self.sent_i = None
        self.word_i = None
        self.window_i = None

    def __initialize(self, window_size, vocab, embed_size):
        if self.window_i is None:
            print('Initializing weights')

            self.vocab = vocab
            self.center_vectors = np.random.uniform(-0.8, 0.8, (vocab.get_size(), embed_size))
            self.context_vectors = np.random.uniform(-0.8, 0.8, (vocab.get_size(), embed_size))

            self.sent_i = 0
            self.window_i = -window_size
            self.word_i = window_size

    def __indexes_update(self, window_size, sent_size, corpus_size):
        self.window_i += 1
        if self.window_i == 0:
            self.window_i = 1

        if self.window_i == window_size + 1:
            self.window_i = - window_size

            self.word_i += 1
            if self.word_i + window_size >= sent_size:
                self.word_i = window_size
                self.sent_i += 1
                if self.sent_i == corpus_size:
                    self.sent_i = 0

    def train(self, vocab, sentences,
              embed_size=100, window_size=3,
              neg_sampling_size=15, learning_rate=0.01,
              iters=10000, print_every=500):
        self.__initialize(window_size, vocab, embed_size)

        cur_loss = 0

        i = 1
        while i <= iters:
            center_id = vocab.word2index[sentences[self.sent_i][self.word_i]]
            context_id = vocab.word2index[sentences[self.sent_i][self.word_i + self.window_i]]

            neg_samples_ids = np.random.choice(range(vocab.get_size()), neg_sampling_size, p=vocab.get_distribution())
            neg_samples = self.context_vectors[neg_samples_ids]

            cur_dot = np.inner(self.center_vectors[center_id], self.context_vectors[context_id])
            dots = neg_samples.dot(self.center_vectors[center_id].reshape(-1, 1)).reshape(-1, 1)
            sigmoids = 1 / (1 + np.exp(dots))

            # gradient calculation
            dLdv = -self.context_vectors[context_id] * (1 - 1 / (1 + np.exp(cur_dot))) + np.sum(neg_samples * sigmoids, axis=0)
            dLdu = -self.center_vectors[center_id] * (1 - 1 / (1 + np.exp(cur_dot)))
            dLdw = self.center_vectors[center_id] * sigmoids 

            # gradient step
            self.center_vectors[center_id] -= learning_rate * dLdv / neg_sampling_size
            self.context_vectors[neg_samples_ids] -= learning_rate * dLdw
            self.context_vectors[context_id] -= learning_rate * dLdu

            self.__indexes_update(window_size, len(sentences[self.sent_i]), len(sentences))

            loss = np.log(1 + np.exp(-cur_dot)) - np.log(1 - sigmoids).sum()
            cur_loss = cur_loss * 0.95 + loss * 0.05

            if i % print_every == 0:
                print(f'Iteration {i} loss = {round(cur_loss, 2)}')
                print(round(self.center_vectors.sum(), 1), round(self.context_vectors.sum(), 1))
            i += 1

    def get_vector(self, word):
        return self.center_vectors[self.vocab.word2index[word]]

    def most_closest(self, word, n=10):
        base = self.get_vector(word).reshape(-1, 1)

        distances = self.center_vectors.dot(base) / np.linalg.norm(self.center_vectors, axis=1) / np.linalg.norm(base)

        most_closest_indexes = distances.flatten().argsort()[-n:][::-1]

        return [(self.vocab.index2word[ind], distances.flatten()[ind]) for ind in most_closest_indexes]


In [7]:
wv = Word2Vec()
wv.train(vocab, sentences, learning_rate=0.01, iters=25, print_every=3)

Initializing weights
Iteration 3 loss = 2.31
-3384.2 -756.6
Iteration 6 loss = 3.99
-3384.2 -757.4
Iteration 9 loss = 5.27
-3384.2 -756.4
Iteration 12 loss = 6.9
-3384.2 -755.4
Iteration 15 loss = 7.97
-3384.2 -755.8
Iteration 18 loss = 9.79
-3384.2 -756.4
Iteration 21 loss = 10.94
-3384.2 -757.2
Iteration 24 loss = 11.46
-3384.1 -757.8
