# Implementation of word2vec using numpy

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import re
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

In [2]:
class word2vec():
    """Minimal skip-gram word2vec trained with full-softmax SGD in NumPy.

    Each training sample pairs a one-hot target word with the one-hot vectors
    of the words inside its context window. ``w1`` (vocab x n) holds the word
    embeddings; ``w2`` (n x vocab) maps the hidden layer back to vocabulary
    scores.

    Typical use::

        model = word2vec(settings)
        data = model.generate_training_data(settings, corpus)
        model.train(data)
    """

    def __init__(self, settings=None):
        """Store the hyperparameters.

        Args:
            settings: mapping with the keys ``'n'`` (embedding size),
                ``'learning_rate'``, ``'epochs'`` and ``'window_size'``.
                When omitted, a module-level variable named ``settings`` is
                used so that existing ``word2vec()`` calls keep working.

        Raises:
            NameError: if no mapping is passed and no module-level
                ``settings`` exists.
            KeyError: if one of the required keys is missing.
        """
        if settings is None:
            # Legacy path: the class used to read the notebook global directly.
            settings = globals().get('settings')
            if settings is None:
                raise NameError(
                    "name 'settings' is not defined; pass a settings dict to word2vec()")

        self.n = settings['n']
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']

    def generate_training_data(self, settings, corpus):
        """Build the vocabulary and the (target, contexts) training pairs.

        Args:
            settings: accepted for backward compatibility; it is not read
                here. The window size fixed in ``__init__`` is used.
            corpus: iterable of sentences, each a sequence of word tokens.

        Returns:
            Object ndarray of shape ``(num_tokens, 2)``. Column 0 holds the
            target's one-hot list, column 1 holds a list of one-hot lists for
            the context words (possibly empty for a one-word sentence).

        Side effects:
            Sets ``v_count``, ``words_list``, ``word_index`` and
            ``index_word``.
        """
        # Find unique word counts using a dictionary
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1

        # Vocabulary order is first-seen order (dicts preserve insertion order).
        self.words_list = list(word_counts)
        self.v_count = len(self.words_list)
        self.word_index = {word: i for i, word in enumerate(self.words_list)}
        self.index_word = dict(enumerate(self.words_list))

        pairs = []
        for sentence in corpus:
            sent_len = len(sentence)

            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)

                # Clamp the window to the sentence and skip the target itself.
                first = max(0, i - self.window)
                last = min(sent_len, i + self.window + 1)
                w_context = [self.word2onehot(sentence[j])
                             for j in range(first, last) if j != i]

                pairs.append([w_target, w_context])

        # The pairs are ragged (a length-V target next to a variable number of
        # contexts). np.array() on such input raises ValueError on
        # NumPy >= 1.24, so fill an object array explicitly instead.
        training_data = np.empty((len(pairs), 2), dtype=object)
        for row, (w_target, w_context) in enumerate(pairs):
            training_data[row, 0] = w_target
            training_data[row, 1] = w_context

        return training_data

    def word2onehot(self, word):
        """Return the one-hot encoding of ``word`` as a list of ``v_count`` ints.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        word_vec = [0] * self.v_count
        word_vec[self.word_index[word]] = 1
        return word_vec

    @staticmethod
    def _initial_weights(supplied, legacy_name, shape):
        """Pick the starting value of one weight matrix and check its shape."""
        if supplied is None:
            # Legacy path: fixed matrices defined as notebook globals.
            supplied = globals().get(legacy_name)
        if supplied is None:
            # Nothing provided anywhere: draw uniformly from [-1, 1).
            return np.random.uniform(-1, 1, shape)

        weights = np.array(supplied, dtype=float)
        if weights.shape != shape:
            raise ValueError(
                "initial weights %r have shape %s, expected %s"
                % (legacy_name, weights.shape, shape))
        return weights

    def train(self, training_data, w1=None, w2=None):
        """Run SGD over ``training_data`` for ``self.epochs`` epochs.

        Prints the summed loss after every epoch and leaves the learned
        matrices in ``self.w1`` and ``self.w2``.

        Args:
            training_data: iterable of ``(target_onehot, context_onehots)``
                pairs as produced by ``generate_training_data``.
            w1: optional initial embedding matrix, shape ``(v_count, n)``.
            w2: optional initial output matrix, shape ``(n, v_count)``.
                For either matrix, when omitted the module-level ``getW1`` /
                ``getW2`` is used if defined, otherwise random values are
                drawn (seed ``np.random`` beforehand for repeatable runs).

        Raises:
            ValueError: if an initial matrix does not match the vocabulary
                size and embedding dimension.
        """
        self.w1 = self._initial_weights(w1, 'getW1', (self.v_count, self.n))
        self.w2 = self._initial_weights(w2, 'getW2', (self.n, self.v_count))

        for i in range(self.epochs):
            # Initialise loss to 0
            self.loss = 0
            for w_t, w_c in training_data:
                if len(w_c) == 0:
                    # A one-word sentence has no context: nothing to learn
                    # from, and the summed error would not be a vector.
                    continue

                y_pred, h, u = self.forward_pass(w_t)

                # Prediction error summed over every context word.
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                self.backprop(EI, h, w_t)

                # Negative log-likelihood of the contexts, using the scores
                # computed before this sample's weight update.
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))

            print('Epoch:', i, "Loss:", self.loss)

    def forward_pass(self, x):
        """Return ``(softmax scores, hidden layer, raw scores)`` for one-hot ``x``."""
        h = np.dot(x, self.w1)
        u = np.dot(h, self.w2)
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        """Softmax along axis 0; the max is subtracted first to avoid overflow."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: summed prediction error over the context words, length ``v_count``.
            h: hidden-layer activation, length ``n``.
            x: one-hot target vector, length ``v_count``.
        """
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def word_vec(self, word):
        """Return the embedding (row of ``w1``) for ``word``."""
        return self.w1[self.word_index[word]]

    def vec_sim(self, word, top_n):
        """Print the ``top_n`` vocabulary words most cosine-similar to ``word``.

        The query word itself is part of the ranking. Nothing is returned.
        """
        query_vec = self.word_vec(word)
        query_norm = np.linalg.norm(query_vec)
        word_sim = {}

        for i in range(self.v_count):
            other_vec = self.w1[i]
            theta_sum = np.dot(query_vec, other_vec)
            theta_den = query_norm * np.linalg.norm(other_vec)

            # Distinct loop name so the `word` argument is not overwritten.
            word_sim[self.index_word[i]] = theta_sum / theta_den

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
        for other_word, sim in words_sorted[:top_n]:
            print(other_word, sim)

In [3]:
## Initial weights
# These matrices are fixed literals, not drawn at random when the cell runs,
# so every run starts from the same values.
# getW1 has 9 rows x 10 columns and getW2 has 10 rows x 9 columns;
# word2vec.train() wraps them with np.array() to obtain its starting w1 and w2,
# so the shapes must match the vocabulary size (9 distinct words in the demo
# sentence) and settings['n'] (10).
getW1 = [[0.236, -0.962, 0.686, 0.785, -0.454, -0.833, -0.744, 0.677, -0.427, -0.066],
        [-0.907, 0.894, 0.225, 0.673, -0.579, -0.428, 0.685, 0.973, -0.070, -0.811],
        [-0.576, 0.658, -0.582, -0.112, 0.662, 0.051, -0.401, -0.921, -0.158, 0.529],
        [0.517, 0.436, 0.092, -0.835, -0.444, -0.905, 0.879, 0.303, 0.332, -0.275],
        [0.859, -0.890, 0.651, 0.185, -0.511, -0.456, 0.377, -0.274, 0.182, -0.237],
        [0.368, -0.867, -0.301, -0.222, 0.630, 0.808, 0.088, -0.902, -0.450, -0.408],
        [0.728, 0.277, 0.439, 0.138, -0.943, -0.409, 0.687, -0.215, -0.807, 0.612],
        [0.593, -0.699, 0.020, 0.142, -0.638, -0.633, 0.344, 0.868, 0.913, 0.429],
        [0.447, -0.810, -0.061, -0.495, 0.794, -0.064, -0.817, -0.408, -0.286, 0.149]]

getW2 = [[-0.868, -0.406, -0.288, -0.016, -0.560, 0.179, 0.099, 0.438, -0.551],
        [-0.395, 0.890, 0.685, -0.329, 0.218, -0.852, -0.919, 0.665, 0.968],
        [-0.128, 0.685, -0.828, 0.709, -0.420, 0.057, -0.212, 0.728, -0.690],
        [0.881, 0.238, 0.018, 0.622, 0.936, -0.442, 0.936, 0.586, -0.020],
        [-0.478, 0.240, 0.820, -0.731, 0.260, -0.989, -0.626, 0.796, -0.599],
        [0.679, 0.721, -0.111, 0.083, -0.738, 0.227, 0.560, 0.929, 0.017],
        [-0.690, 0.907, 0.464, -0.022, -0.005, -0.004, -0.425, 0.299, 0.757],
        [-0.054, 0.397, -0.017, -0.563, -0.551, 0.465, -0.596, -0.413, -0.395],
        [-0.838, 0.053, -0.160, -0.164, -0.671, 0.140, -0.149, 0.708, 0.425],
        [0.096, -0.995, -0.313, 0.881, -0.402, -0.631, -0.660, 0.184, 0.487]]

In [8]:
# Hyperparameters consumed by the word2vec class.
settings = dict(
    window_size=3,       # words taken on each side of the centre word
    n=10,                # embedding dimension, i.e. the hidden-layer width
    epochs=50,           # full passes over the training pairs
    learning_rate=0.01,  # step size of each gradient update
)

In [9]:
# Single-sentence toy corpus used for the demo run.
text = (
    "natural language processing and machine learning "
    "is fun and exciting"
)

In [10]:
# One sentence in the corpus: whitespace-split tokens, each lower-cased.
corpus = [list(map(str.lower, text.split()))]
w2v = word2vec()

# Build the (target, contexts) pairs, then fit the model on them.
training_data = w2v.generate_training_data(settings=settings, corpus=corpus)
w2v.train(training_data=training_data)

Epoch: 0 Loss: 119.65818593710692
Epoch: 1 Loss: 116.62024226191745
Epoch: 2 Loss: 114.04917631196679
Epoch: 3 Loss: 111.84861986765249
Epoch: 4 Loss: 109.94539024241644
Epoch: 5 Loss: 108.28344255689026
Epoch: 6 Loss: 106.81938622674572
Epoch: 7 Loss: 105.5192275932522
Epoch: 8 Loss: 104.3560203077778
Epoch: 9 Loss: 103.30816666968074
Epoch: 10 Loss: 102.35817952351488
Epoch: 11 Loss: 101.4917695628716
Epoch: 12 Loss: 100.69716395032296
Epoch: 13 Loss: 99.9645910749294
Epoch: 14 Loss: 99.2858861257711
Epoch: 15 Loss: 98.65418568371791
Epoch: 16 Loss: 98.06368875665049
Epoch: 17 Loss: 97.50946801920685
Epoch: 18 Loss: 96.98731941783336
Epoch: 19 Loss: 96.49364139620639
Epoch: 20 Loss: 96.02533720194222
Epoch: 21 Loss: 95.57973532953872
Epoch: 22 Loss: 95.15452432161283
Epoch: 23 Loss: 94.74769901563431
Epoch: 24 Loss: 94.357515971943
Epoch: 25 Loss: 93.98245631007318
Epoch: 26 Loss: 93.62119455588942
Epoch: 27 Loss: 93.27257239137603
Epoch: 28 Loss: 92.93557642349091
Epoch: 29 Loss: 92

In [11]:
# Show the learned embedding of one vocabulary word, followed by a blank line.
word = "machine"
vec = w2v.word_vec(word)
print(word, vec, end="\n\n")

# Print the 5 words ranked highest by cosine similarity to it.
w2v.vec_sim(word, top_n=5)

machine [ 0.85508876 -0.70708184  0.79919777  0.00759425 -0.12248265 -0.39324106
  0.5802936   0.01742696  0.16704509 -0.46931715]

machine 1.0
and 0.6133790185534155
natural 0.5027479803811664
fun 0.41469457323438896
is 0.37156064344490086


In [7]:
# NOTE(review): this cell repeats the lookup from the cell above. Its execution
# count (7) is lower than that of the training cell, so the output saved below
# appears to come from an earlier training run - re-run or remove this cell.
# Show the learned embedding of one vocabulary word, followed by a blank line.
word = "machine"
vec = w2v.word_vec(word)
print(word, vec, end="\n\n")

# Print the 5 words ranked highest by cosine similarity to it.
w2v.vec_sim(word, top_n=5)

machine [ 1.17935753 -1.31396665  1.51816169 -0.32881615 -0.04215376 -0.28818402
  0.62616027  0.34787486 -0.01300727 -0.83572361]

machine 0.9999999999999999
natural 0.533821416293253
exciting 0.30790584432397805
fun 0.14056977204437102
learning 0.13235136587722043
