# Implementation of word2vec using tensorflow

In [1]:
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
corpus_raw = 'He is the king . The king is royal . She is the royal  queen '
corpus_raw = corpus_raw.lower()

In [3]:
words = []
for word in corpus_raw.split():
    if word != '.': # because we don't want to treat . as a word
        words.append(word)
words = set(words) # so that all duplicate words are removed
word2int = {}
int2word = {}
vocab_size = len(words) # gives the total number of unique words
for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word

In [4]:
print(word2int['queen'])

4


In [5]:
print(int2word[4])

queen


In [6]:
# raw sentences is a list of sentences.
raw_sentences = corpus_raw.split('.')
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())

In [7]:
print(sentences)

[['he', 'is', 'the', 'king'], ['the', 'king', 'is', 'royal'], ['she', 'is', 'the', 'royal', 'queen']]


In [8]:
data = []
WINDOW_SIZE = 2
for sentence in sentences:
    for word_index, word in enumerate(sentence):
        for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1] : 
            if nb_word != word:
                data.append([word, nb_word])

print(data)

[['he', 'is'], ['he', 'the'], ['is', 'he'], ['is', 'the'], ['is', 'king'], ['the', 'he'], ['the', 'is'], ['the', 'king'], ['king', 'is'], ['king', 'the'], ['the', 'king'], ['the', 'is'], ['king', 'the'], ['king', 'is'], ['king', 'royal'], ['is', 'the'], ['is', 'king'], ['is', 'royal'], ['royal', 'king'], ['royal', 'is'], ['she', 'is'], ['she', 'the'], ['is', 'she'], ['is', 'the'], ['is', 'royal'], ['the', 'she'], ['the', 'is'], ['the', 'royal'], ['the', 'queen'], ['royal', 'is'], ['royal', 'the'], ['royal', 'queen'], ['queen', 'the'], ['queen', 'royal']]


In [9]:
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

x_train = [] # input word
y_train = [] # output word

for data_word in data:
    x_train.append(to_one_hot(word2int[ data_word[0] ], vocab_size))
    y_train.append(to_one_hot(word2int[ data_word[1] ], vocab_size))

x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [10]:
print(x_train)

[[0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]]


In [11]:
print(x_train.shape, y_train.shape)

(34, 7) (34, 7)


In [12]:
# making placeholders for x_train and y_train
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [13]:
EMBEDDING_DIM = 5 # you can choose your own number
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) #bias
hidden_representation = tf.add(tf.matmul(x,W1), b1)

In [14]:
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation, W2), b2))

In [15]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) #make sure you do this!
# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iters = 100000
# train for n_iter iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

loss is :  3.682039
loss is :  3.4491363
loss is :  3.2734125
loss is :  3.1325712
loss is :  3.0140889
loss is :  2.9111123
loss is :  2.819881
loss is :  2.7382176
loss is :  2.6647403
loss is :  2.5984654
loss is :  2.5386128
loss is :  2.4845092
loss is :  2.4355428
loss is :  2.391143
loss is :  2.3507793
loss is :  2.313959
loss is :  2.2802372
loss is :  2.2492192
loss is :  2.220561
loss is :  2.1939676
loss is :  2.1691883
loss is :  2.1460123
loss is :  2.12426
loss is :  2.1037793
loss is :  2.0844417
loss is :  2.0661354
loss is :  2.048766
loss is :  2.0322492
loss is :  2.0165136
loss is :  2.0014954
loss is :  1.9871389
loss is :  1.9733943
loss is :  1.9602181
loss is :  1.9475708
loss is :  1.9354179
loss is :  1.9237272
loss is :  1.9124705
loss is :  1.9016217
loss is :  1.8911573
loss is :  1.8810558
loss is :  1.8712974
loss is :  1.8618635
loss is :  1.8527374
loss is :  1.8439032
loss is :  1.8353463
loss is :  1.8270532
loss is :  1.8190107
loss is :  1.8112073


In [16]:
print(sess.run(W1))
print('----------')
print(sess.run(b1))
print('----------')

[[-1.3722712   1.2287599   1.6439074  -2.1723652   1.4108359 ]
 [-1.5940633   0.59839803  0.01077582 -0.07865455 -2.975707  ]
 [ 0.30928293 -0.6444132  -0.7906496   0.82736313  1.2338232 ]
 [ 1.7643046   2.1253257   1.9683312   0.33402464 -0.46715736]
 [-0.82404613 -0.5472902  -1.2554909   1.872602   -0.6221113 ]
 [ 1.7654988   0.04719747  0.30092427  0.82913405  0.33071882]
 [ 1.9236926  -0.43503487  0.5916694  -0.5097005  -0.8755283 ]]
----------
[ 0.8087427  -1.1210498  -0.7964143   0.29451057  0.72932833]
----------


In [17]:
vectors = sess.run(W1 + b1)
print(vectors)

[[-0.5635285   0.10771012  0.8474931  -1.8778546   2.1401641 ]
 [-0.7853206  -0.52265173 -0.7856385   0.21585602 -2.2463787 ]
 [ 1.1180257  -1.7654629  -1.5870639   1.1218737   1.9631515 ]
 [ 2.5730474   1.0042759   1.171917    0.6285352   0.26217097]
 [-0.01530343 -1.66834    -2.0519052   2.1671126   0.10721701]
 [ 2.5742414  -1.0738523  -0.49549004  1.1236446   1.0600471 ]
 [ 2.7324352  -1.5560846  -0.20474494 -0.2151899  -0.14619994]]


In [18]:
print(vectors[ word2int['queen'] ])

[-0.01530343 -1.66834    -2.0519052   2.1671126   0.10721701]


In [19]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1-vec2)**2))


def find_closest(word_index, vectors):
    min_dist = 10000 # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index

In [20]:
print(int2word[find_closest(word2int['king'], vectors)])
print(int2word[find_closest(word2int['queen'], vectors)])
print(int2word[find_closest(word2int['royal'], vectors)])

he
king
he
