### word2vec skip-gram model with NCE loss

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf
import time

import utils
import word2vec_utils

# Model hyperparameters
VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 200            # dimension of the word embedding vectors
SKIP_WINDOW = 2             # the context window
NUM_SAMPLED = 10            # number of negative examples to sample
THRESHOLD = 1e-5            # subsampling threshold
LEARNING_RATE = 3e-2
EPOCH = 10
NUM_TRAIN_STEPS = 600000
VISUAL_FLD = 'visualization'
SKIP_STEP = 10000
SAVE_EVERY = 150000

# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
EXPECTED_BYTES = 31344016
NUM_VISUALIZE = 2000        # number of tokens to visualize

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
def gen():
    yield from word2vec_utils.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES, VOCAB_SIZE, 
                                        BATCH_SIZE, SKIP_WINDOW, VISUAL_FLD, THRESHOLD)

In [3]:
utils.safe_mkdir('data')
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32), 
                                    (tf.TensorShape([BATCH_SIZE]), 
                                 tf.TensorShape([BATCH_SIZE, 1])))

In [4]:
def word2vec(dataset):
    iterator = dataset.make_initializable_iterator()
    center_words, target_words = iterator.get_next()
    
    #in_vector
    embed_matrix = tf.get_variable('embed_matrix', 
                                   shape = [VOCAB_SIZE, EMBED_SIZE],
                                   initializer = tf.random_uniform_initializer())
    #embedding layer
    embed = tf.nn.embedding_lookup(embed_matrix, center_words, name = 'embed')
    
    #out_vector
    #for nn.nce_loss weight shape must be [num_classes, dim]
    nce_weight = tf.get_variable('nce_weight', shape = [VOCAB_SIZE, EMBED_SIZE],
                             initializer=tf.contrib.layers.xavier_initializer())
    nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))
    #define loss function to be NCE loss function
    #NCE loss = noise contrastive loss (maximize label and minimize sample)
    loss = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weight,
                                         biases = nce_bias,
                                         labels = target_words,
                                         inputs = embed,
                                         num_sampled = NUM_SAMPLED,
                                         num_classes = VOCAB_SIZE,
                                         name = 'loss'))
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
    utils.safe_mkdir('checkpoints')
    
    saver = tf.train.Saver({'in_vec':embed_matrix, 'out_vec':nce_weight})
    
    with tf.Session() as sess:
        sess.run(iterator.initializer)
        sess.run(tf.global_variables_initializer())
        saver.restore(sess,'./checkpoints/model.ckpt')
        
        total_loss = 0.0
        writer = tf.summary.FileWriter('graphs/word2vec_simple', sess.graph)
        
        start = time.time()
        for index in range(NUM_TRAIN_STEPS):
            try:
                loss_batch, _ = sess.run([loss, optimizer])
                total_loss += loss_batch
                
                if (index + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.2f}'
                          .format(index+1, total_loss / SKIP_STEP))
                    total_loss = 0.0
                if (index + 1) % 150000 == 0: # 1 epoch
                    print('Time spent for 1 epoch: %f' %(time.time() - start))
                if (index + 1) % SAVE_EVERY == 0:
                    save_path = saver.save(sess, './checkpoints/model.ckpt')
            except tf.errors.OutOfRangeError:
                sess.run(iterator.initializer)
        writer.close()

In [5]:
word2vec(dataset)

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt
data/text8.zip already exists
total words: 17005207
subsampled words:  4571561
Average loss at step 10000:  3.03
Average loss at step 20000:  3.02
Average loss at step 30000:  3.01
Average loss at step 40000:  3.00
Average loss at step 50000:  3.01
Average loss at step 60000:  3.01
Average loss at step 70000:  3.02
Average loss at step 80000:  3.03
Average loss at step 90000:  3.02
Average loss at step 100000:  3.01


  This is separate from the ipykernel package so we can avoid doing imports until


data/text8.zip already exists
total words: 17005207
subsampled words:  4570717
Average loss at step 110000:  3.02
Average loss at step 120000:  3.00
Average loss at step 130000:  3.00
Average loss at step 140000:  2.99
Average loss at step 150000:  2.99
Time spent for 1 epoch: 137.368876
Average loss at step 160000:  3.00
Average loss at step 170000:  2.99
Average loss at step 180000:  3.00
Average loss at step 190000:  3.00
Average loss at step 200000:  2.99
Average loss at step 210000:  2.99
data/text8.zip already exists
total words: 17005207
subsampled words:  4570330
Average loss at step 220000:  3.00
Average loss at step 230000:  2.98
Average loss at step 240000:  2.99
Average loss at step 250000:  2.98
Average loss at step 260000:  2.99
Average loss at step 270000:  2.99
Average loss at step 280000:  2.98
Average loss at step 290000:  2.99
Average loss at step 300000:  2.98
Time spent for 1 epoch: 267.262963
Average loss at step 310000:  2.98
Average loss at step 320000:  2.98
da

### Analogy Task

In [2]:
local_dest = 'data/text8.zip'

words = word2vec_utils.read_data(local_dest)
dictionary, index_dictionary = word2vec_utils.build_vocab(words, VOCAB_SIZE, VISUAL_FLD)
del words           # to save memory

In [3]:
embed_matrix = tf.get_variable('embed_matrix', 
                                   shape = [VOCAB_SIZE, EMBED_SIZE],
                                   initializer = tf.zeros_initializer())
nce_weight = tf.get_variable('nce_weight', shape = [VOCAB_SIZE, EMBED_SIZE],
                             initializer=tf.zeros_initializer())
saver = tf.train.Saver({'in_vec':embed_matrix, 'out_vec':nce_weight})
with tf.Session() as sess:
    saver.restore(sess,'./checkpoints/model.ckpt')
    in_vec = embed_matrix.eval(sess)
    out_vec = embed_matrix.eval(sess)
    print('Model restored')
    print(in_vec.shape)
    print(out_vec.shape)

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt
Model restored
(50000, 200)
(50000, 200)


In [12]:
from numpy.linalg import norm

f = open('data/questions-words.txt')

corret_num = 0
total_num = 0

while True:
    line = f.readline().lower().split()
    if not line:
        break
    #New category
    if line[0] == ':':
        print(line)
        continue
    else:
        try:
            pred = (in_vec[dictionary[line[0]]] - in_vec[dictionary[line[1]]] + 
                    in_vec[dictionary[line[2]]])

            max_sim = np.argmax(np.sum((pred * in_vec) / 
                                       norm(in_vec, axis = 1, keepdims = True),1))
            if index_dictionary[max_sim] == pred:
                correct_num += 1
        except:
            continue
f.close()

[':', 'capital-common-countries']
['athens', 'greece', 'baghdad', 'iraq']
athens
baghdad
spectacled
['athens', 'greece', 'bangkok', 'thailand']
bangkok
athens
manley
['athens', 'greece', 'beijing', 'china']
athens
beijing
twister
['athens', 'greece', 'berlin', 'germany']
athens
berlin
highlight
['athens', 'greece', 'bern', 'switzerland']
athens
bern
highlight
['athens', 'greece', 'cairo', 'egypt']
athens
cairo
breakfast
['athens', 'greece', 'canberra', 'australia']
canberra
athens
mala
['athens', 'greece', 'hanoi', 'vietnam']
athens
hanoi
frigg
['athens', 'greece', 'havana', 'cuba']
athens
havana
necessity
['athens', 'greece', 'helsinki', 'finland']
athens
helsinki
skyscrapers
['athens', 'greece', 'islamabad', 'pakistan']
athens
islamabad
reston
['athens', 'greece', 'kabul', 'afghanistan']
athens
kabul
thriller
['athens', 'greece', 'london', 'england']
london
athens
crt
['athens', 'greece', 'madrid', 'spain']
madrid
athens
calorimetry
['athens', 'greece', 'moscow', 'russia']
athens
mos