In [1]:
%matplotlib inline

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [3]:
import sys
sys.path.append('../examples/')
from process_data import process_data

In [4]:
# --- Data / batching ---
VOCAB_SIZE = 50000       # vocabulary size: rows of the embedding matrix and number of NCE classes
BATCH_SIZE = 128         # number of (center, target) pairs per batch
SKIP_WINDOW = 1          # the context window

# --- Model ---
EMBED_SIZE = 128         # dimension of the word embedding vectors
NUM_SAMPLED = 64         # number of negative examples to sample

# --- Optimisation / reporting ---
LEARNING_RATE = 1.0      # step size passed to the gradient-descent optimizer
NUM_TRAIN_STEPS = 10000  # total number of training iterations
SKIP_STEP = 2000         # how many steps to skip before reporting the loss

In [12]:
# Build the batch source; each call to its next() yields a
# (center words, target words) pair, as consumed by the cells below.
batch_gen = process_data(
    VOCAB_SIZE,
    BATCH_SIZE,
    SKIP_WINDOW,
)

Dataset ready


In [11]:
# Sanity check: print a few batches to inspect what the generator produces.
# NOTE: this pulls 5 batches out of batch_gen, so training below starts
# from the batch after these.
for _ in range(5):
    # next(it) works on both Python 2.6+ and Python 3; it.next() is Python 2 only.
    c, t = next(batch_gen)
    # Function-call form of print: same output on Python 2 for a single
    # argument, and consistent with the training cell's print(...) usage.
    print("center word: %s" % c)
    print('target words: %s' % t)

center word: [    1  3417  3417     2     2     1     1  1782  1782   188   188     2
     2     1     1   626   626     1     1 13179 13179     2     2     4
     4    22    22    91    91   123   123   285   285    26    26   244
   244   233   233     7     7    32    32   437   437 26474 26474     2
     2   174   174  5239  5239  7729  7729  2535  2535     5     5    30
    30    96    96     2     2   290   290   603   603  4318  4318    20
    20     1     1 13179 13179 16575 16575    50    50   188   188   118
   118    47    47   360   360    20    20     1     1   385   385   243
   243    97    97    32    32  5235  5235    35    35   332   332  2725
  2725    19    19     1     1   839   839  1378  1378    28    28    33
    33  8763  8763    47    47  6487  6487    35]
target words: [[  3.41700000e+03]
 [  1.00000000e+00]
 [  2.00000000e+00]
 [  3.41700000e+03]
 [  1.00000000e+00]
 [  2.00000000e+00]
 [  1.78200000e+03]
 [  1.00000000e+00]
 [  1.88000000e+02]
 [  1.7820000

In [13]:
""" Build the graph for word2vec model and train it """
# Step 1: placeholders for the model's input and output.
# Center words must be integer ids because they are used for the embedding lookup.

# X: one center-word id per example in the batch.
X = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name='center_word')
# Y: one target-word id per example, kept as a column ([BATCH_SIZE, 1]).
Y = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name='target_word')

In [14]:
# Step 2: the embedding weights. In word2vec these weights are the thing
# we actually care about.
# Shape is [VOCAB_SIZE, EMBED_SIZE]; values start uniformly random in [-1, 1).
embedding_matrix = tf.Variable(
    tf.random_uniform(
        [VOCAB_SIZE, EMBED_SIZE],
        minval=-1,
        maxval=1,
        dtype=tf.float32,
    ),
    name='embed_matrix',
)

In [15]:
# Step 3: inference.
# Fetch the embedding of every center word in X from the embedding matrix.
embedding_lookup = tf.nn.embedding_lookup(
    embedding_matrix,
    X,
    name='embed_lookup',
)

In [18]:
# Step 4: variables and loss for noise-contrastive estimation (NCE).
#   nce_weight: [VOCAB_SIZE, EMBED_SIZE], truncated normal with
#               stddev = 1.0 / sqrt(EMBED_SIZE)
#   nce_bias:   [VOCAB_SIZE], initialized to zero
nce_weight = tf.Variable(
    tf.truncated_normal(
        [VOCAB_SIZE, EMBED_SIZE],
        stddev=1.0 / (EMBED_SIZE ** 0.5),
        dtype=tf.float32,
    ),
    name='nce_weight',
)
nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE], dtype=tf.float32), name='nce_bias')

# The training objective is the NCE loss, averaged across the batch.
loss = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weight,
        biases=nce_bias,
        labels=Y,
        inputs=embedding_lookup,
        num_sampled=NUM_SAMPLED,
        num_classes=VOCAB_SIZE,
    ),
    name='loss',
)

In [19]:
# Step 5: training op. Plain gradient descent on `loss`; running `optimizer`
# in a session performs one update step.
optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=LEARNING_RATE
).minimize(loss)

In [20]:
# Train the model: run NUM_TRAIN_STEPS optimizer steps, printing the mean
# loss over each window of SKIP_STEP steps, and write the graph definition
# to ./my_graph/no_frills/ for TensorBoard.
with tf.Session() as sess:
    # Variables must be initialized before the first training step.
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('./my_graph/no_frills/', sess.graph)
    try:
        # Running sum of the loss over the current reporting window.
        total_loss = 0.0
        # range / next(...) instead of xrange / .next(): works on Python 2 and 3.
        for index in range(NUM_TRAIN_STEPS):
            centers, targets = next(batch_gen)
            # One optimization step; also fetch this batch's loss value.
            _, loss_batch = sess.run([optimizer, loss],
                                     feed_dict={X: centers, Y: targets})
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                # `index` is 0-based, so the reported step is the last index of the window.
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
    finally:
        # Always flush and close the event file, even if a training step raises.
        writer.close()

Average loss at step 1999: 113.7
Average loss at step 3999:  52.8
Average loss at step 5999:  33.4
Average loss at step 7999:  23.4
Average loss at step 9999:  17.7
