In [237]:
import numpy as np
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
import nltk
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [238]:
sentences = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')).data

regex = re.compile('[^a-zA-Zа-яА-Я ]')
sentences = [regex.sub('', ' '.join(nltk.word_tokenize(sent.strip().lower()))).split() 
             for sent in sentences] # list of list of words

sentences = [sent for sent in sentences if len(sent) > 10]

np.random.shuffle(sentences)

In [239]:
class Vocab:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}
    
    def index(self, sentences):
        word2cnt = {}
        for sent in sentences:
            for word in sent:
                if word not in self.word2index:
                    self.word2index[word] = len(self.word2index)
                    self.index2word[self.word2index[word]] = word
                    word2cnt[word] = 0
                
                word2cnt[word] += 1
                
        self.distribution = np.array([word2cnt[self.index2word[i]] for i in range(len(word2cnt))])
        self.distribution = self.distribution / self.distribution.sum()
        self.distribution = self.distribution ** 0.75
        self.distribution = self.distribution / self.distribution.sum()
                    
    def get_distribution(self):
        return self.distribution
                    
    def get_size(self):
        return len(self.word2index)

In [240]:
vocab = Vocab()
vocab.index(sentences)
vocab.get_size()

127314

In [241]:
class Word2Vec:
    def __init__(self):
        self.sent_i = None
        self.word_i = None
        self.window_i = None

    def __initialize(self, window_size, vocab, embed_size):
        if self.window_i is None:
            print('Initializing weights')

            self.vocab = vocab
            self.center_vectors = np.random.uniform(-0.8, 0.8, (vocab.get_size(), embed_size))
            self.context_vectors = np.random.uniform(-0.8, 0.8, (vocab.get_size(), embed_size))

            self.sent_i = 0
            self.window_i = -window_size
            self.word_i = window_size

    def __indexes_update(self, window_size, sent_size, corpus_size):
        self.window_i += 1
        if self.window_i == 0:
            self.window_i = 1

        if self.window_i == window_size + 1:
            self.window_i = - window_size

            self.word_i += 1
            if self.word_i + window_size >= sent_size:
                self.word_i = window_size
                self.sent_i += 1
                if self.sent_i == corpus_size:
                    self.sent_i = 0

    def train(self, vocab, sentences,
              embed_size=100, window_size=3,
              neg_sampling_size=15, learning_rate=0.01,
              iters=10000, print_every=500):
        self.__initialize(window_size, vocab, embed_size)

        cur_loss = 0

        i = 1
        while i <= iters:
            center_id = vocab.word2index[sentences[self.sent_i][self.word_i]]
            context_id = vocab.word2index[sentences[self.sent_i][self.word_i + self.window_i]]

            neg_samples_ids = np.random.choice(range(vocab.get_size()), neg_sampling_size, p=vocab.get_distribution())
            neg_samples = self.context_vectors[neg_samples_ids]

            cur_dot = np.inner(self.center_vectors[center_id], self.context_vectors[context_id])
            dots = neg_samples.dot(self.center_vectors[center_id].reshape(-1, 1)).reshape(-1, 1)
            sigmoids = 1 / (1 + np.exp(-dots))

            # gradient calculation
            dLdv = -self.context_vectors[context_id] * (1 - 1 / (1 + np.exp(-cur_dot))) + np.sum(neg_samples * sigmoids, axis=0)
            dLdu = -self.center_vectors[center_id] * (1 - 1 / (1 + np.exp(-cur_dot)))
            dLdw = neg_samples * sigmoids 

            # gradient step
            assert self.context_vectors[neg_samples_ids].shape == dLdw.shape
            assert self.center_vectors[center_id].shape == dLdv.shape
            assert self.context_vectors[context_id].shape == dLdu.shape
            self.center_vectors[center_id] -= learning_rate * dLdv
            self.context_vectors[neg_samples_ids] -= learning_rate * dLdw
            self.context_vectors[context_id] -= learning_rate * dLdu

            self.__indexes_update(window_size, len(sentences[self.sent_i]), len(sentences))

            loss = -np.log(1 + np.exp(-cur_dot)) - np.log(1 - sigmoids).sum()
            cur_loss = cur_loss * 0.9 + loss * 0.1

            if i % print_every == 0:
                print(f'Iteration {i} loss = {round(cur_loss, 2)}')
            i += 1

    def get_vector(self, word):
        return self.center_vectors[self.vocab.word2index[word]]

    def most_closest(self, word, n=10):
        base = self.get_vector(word).reshape(-1, 1)

        distances = self.center_vectors.dot(base) / np.linalg.norm(self.center_vectors, axis=1).reshape(-1, 1) / np.linalg.norm(base)

        most_closest_indexes = distances.flatten().argsort()[-n:][::-1]

        return [(self.vocab.index2word[ind], distances.flatten()[ind]) for ind in most_closest_indexes]


In [258]:
wv = Word2Vec()
wv.train(vocab, sentences,
         learning_rate=0.001,
         iters=1000, 
         print_every=50, 
         window_size=3,
         neg_sampling_size=13)

Initializing weights
Iteration 50 loss = 12.85
Iteration 100 loss = 13.33
Iteration 150 loss = 12.92
Iteration 200 loss = 11.46
Iteration 250 loss = 12.91
Iteration 300 loss = 13.85
Iteration 350 loss = 14.89
Iteration 400 loss = 11.21
Iteration 450 loss = 11.63
Iteration 500 loss = 14.5
Iteration 550 loss = 14.29
Iteration 600 loss = 13.92
Iteration 650 loss = 14.06
Iteration 700 loss = 13.34
Iteration 750 loss = 13.62
Iteration 800 loss = 14.08
Iteration 850 loss = 13.44
Iteration 900 loss = 14.29
Iteration 950 loss = 13.08
Iteration 1000 loss = 13.56


In [254]:
wv.most_closest('man')

[('man', 1.0),
 ('nejmeditorial', 0.4264344807338889),
 ('airlajzng', 0.42028183883569553),
 ('towwang', 0.4048745382196957),
 ('bebek', 0.3875368508252413),
 ('relabel', 0.38677090034981176),
 ('hypocricy', 0.3801125952617559),
 ('uninhabitable', 0.374343688735698),
 ('sherwood', 0.36954786089546726),
 ('arterial', 0.36742246638254755)]

In [255]:
wv.most_closest('good')

[('good', 1.0000000000000002),
 ('stripmine', 0.42004370516793943),
 ('tzrgmrlkv', 0.4049654637602054),
 ('siebenlist', 0.3855192005132425),
 ('broadcasts', 0.3852777744039119),
 ('sillyness', 0.3844474234029582),
 ('timothypillsbury', 0.38427271185077466),
 ('acessing', 0.37883139339747945),
 ('prohphet', 0.37838104507596776),
 ('starsgsfcnasagov', 0.3748146025485971)]