In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
import nltk
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
sentences = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')).data

regex = re.compile('[^a-zA-Zа-яА-Я ]')
sentences = [regex.sub('', ' '.join(nltk.word_tokenize(sent.strip().lower()))).split() 
             for sent in sentences] # list of list of words

sentences = [sent for sent in sentences if len(sent) > 10]

np.random.shuffle(sentences)

In [3]:
class Vocab:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}
    
    def index(self, sentences):
        word2cnt = {}
        for sent in sentences:
            for word in sent:
                if word not in self.word2index:
                    self.word2index[word] = len(self.word2index)
                    self.index2word[self.word2index[word]] = word
                    word2cnt[word] = 0
                
                word2cnt[word] += 1
                
        self.distribution = np.array([word2cnt[self.index2word[i]] for i in range(len(word2cnt))])
        self.distribution = self.distribution / self.distribution.sum()
        self.distribution = self.distribution ** 0.75
        self.distribution = self.distribution / self.distribution.sum()
                    
    def get_distribution(self):
        return self.distribution
                    
    def get_size(self):
        return len(self.word2index)

In [4]:
vocab = Vocab()
vocab.index(sentences)
vocab.get_size()

127314

In [17]:
class Word2Vec:
    def __init__(self):
        self.sent_i = None
        self.word_i = None
        self.window_i = None
        self.losses = []

    def __initialize(self, window_size, vocab, embed_size):
        if self.window_i is None:
            print('Initializing weights')

            self.vocab = vocab
            self.center_vectors = np.random.uniform(-0.8, 0.8, (vocab.get_size(), embed_size))
            self.context_vectors = np.random.uniform(-0.8, 0.8, (vocab.get_size(), embed_size))

            self.sent_i = 0
            self.window_i = -window_size
            self.word_i = window_size

    def __indexes_update(self, window_size, sent_size, corpus_size):
        self.window_i += 1
        if self.window_i == 0:
            self.window_i = 1

        if self.window_i == window_size + 1:
            self.window_i = - window_size

            self.word_i += 1
            if self.word_i + window_size >= sent_size:
                self.word_i = window_size
                self.sent_i += 1
                if self.sent_i == corpus_size:
                    self.sent_i = 0

    def train(self, vocab, sentences,
              embed_size=100, window_size=3,
              neg_sampling_size=15, learning_rate=0.01,
              iters=10000, print_every=500):
        self.__initialize(window_size, vocab, embed_size)

        cur_loss = 0

        i = 1
        while i <= iters:
            center_id = vocab.word2index[sentences[self.sent_i][self.word_i]]
            context_id = vocab.word2index[sentences[self.sent_i][self.word_i + self.window_i]]

            neg_samples_ids = np.random.choice(range(vocab.get_size()), neg_sampling_size, p=vocab.get_distribution())
            neg_samples = self.context_vectors[neg_samples_ids]

            cur_dot = np.inner(self.center_vectors[center_id], self.context_vectors[context_id])
            dots = neg_samples.dot(self.center_vectors[center_id].reshape(-1, 1)).reshape(-1, 1)
            sigmoids = 1 / (1 + np.exp(-dots))

            # gradient calculation
            dLdv = -self.context_vectors[context_id] * (1 - 1 / (1 + np.exp(-cur_dot))) + np.sum(neg_samples * sigmoids, axis=0)
            dLdu = -self.center_vectors[center_id] * (1 - 1 / (1 + np.exp(-cur_dot)))
            dLdw = neg_samples * sigmoids 

            # gradient step
            self.center_vectors[center_id] -= learning_rate * dLdv
            self.context_vectors[neg_samples_ids] -= learning_rate * dLdw
            self.context_vectors[context_id] -= learning_rate * dLdu

            self.__indexes_update(window_size, len(sentences[self.sent_i]), len(sentences))

            loss = -np.log(1 + np.exp(-cur_dot)) - np.log(1 - sigmoids).sum()
            cur_loss = cur_loss * 0.95 + loss * 0.05

            if i % print_every == 0:
                print(f'Iteration {i} loss = {round(cur_loss, 2)}')
                self.losses.append(cur_loss)
            i += 1

    def get_vector(self, word):
        return self.center_vectors[self.vocab.word2index[word]]

    def most_closest(self, word, n=10):
        base = self.get_vector(word).reshape(-1, 1)

        distances = self.center_vectors.dot(base) / np.linalg.norm(self.center_vectors, axis=1).reshape(-1, 1) / np.linalg.norm(base)

        most_closest_indexes = distances.flatten().argsort()[-n:][::-1]

        return [(self.vocab.index2word[ind], distances.flatten()[ind]) for ind in most_closest_indexes]


In [18]:
wv = Word2Vec()
wv.train(vocab, sentences,
         learning_rate=0.07,
         iters=50000, 
         print_every=500, 
         window_size=3,
         neg_sampling_size=15)

Initializing weights
Iteration 500 loss = 12.99
Iteration 1000 loss = 13.6
Iteration 1500 loss = 12.25
Iteration 2000 loss = 12.7
Iteration 2500 loss = 11.29
Iteration 3000 loss = 12.86
Iteration 3500 loss = 11.95
Iteration 4000 loss = 12.83
Iteration 4500 loss = 12.71
Iteration 5000 loss = 12.51
Iteration 5500 loss = 12.82
Iteration 6000 loss = 11.79
Iteration 6500 loss = 12.33
Iteration 7000 loss = 12.13
Iteration 7500 loss = 11.37
Iteration 8000 loss = 11.69
Iteration 8500 loss = 11.82
Iteration 9000 loss = 12.09
Iteration 9500 loss = 12.15
Iteration 10000 loss = 11.92
Iteration 10500 loss = 11.12
Iteration 11000 loss = 11.93
Iteration 11500 loss = 10.74
Iteration 12000 loss = 11.8
Iteration 12500 loss = 11.15
Iteration 13000 loss = 11.1
Iteration 13500 loss = 11.89
Iteration 14000 loss = 12.14
Iteration 14500 loss = 11.02
Iteration 15000 loss = 12.41
Iteration 15500 loss = 11.53
Iteration 16000 loss = 11.54
Iteration 16500 loss = 12.42
Iteration 17000 loss = 11.05
Iteration 17500 l

KeyboardInterrupt: 

In [None]:
wv.most_closest('man')

In [None]:
wv.most_closest('good')