In [4]:
import itertools
import os

import numpy as np
import tensorflow as tf
import keras.backend as K
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from keras.layers import Embedding, Dense, Input, Reshape, Flatten, dot, Add
from keras.models import Model

Using TensorFlow backend.


In [7]:
# Load the precomputed matrix from disk. Later cells index it as
# debiased_probs[i, j] with word indices on both axes.
# NOTE(review): presumably produced by the create_debiased_matrix step — confirm.
debiased_probs = np.load('debiased_matrix.npy')

In [240]:
# Number of rows in the debiased matrix; used as the range of valid word indices.
vocab_size = len(debiased_probs)
# vocab_size = 22000
# NOTE(review): not referenced by the cells shown here — presumably the width of
# the pretrained vectors; confirm.
vector_dim = 100
# Number of word indices drawn per batch (targets are batch_size x batch_size).
batch_size = 32
# Small constant added inside log(...) so that log(0) is never evaluated.
eps = 0.0001

In [54]:
# generate batch data
def generate_batch(batch_size):
    """Draw a random batch of word indices and their pairwise target values.

    Reads the notebook-level globals ``vocab_size`` and ``debiased_probs``.

    Returns:
        A tuple ``(indices, real_prods)``: ``indices`` holds ``batch_size``
        distinct integers sampled from ``range(vocab_size)``, and
        ``real_prods[a, b]`` equals ``debiased_probs[indices[a], indices[b]]``.
    """
    picked = np.random.choice(vocab_size, size=batch_size, replace=False)
    # A column vector against a row vector broadcasts to the full
    # (batch_size, batch_size) grid of (row, col) index pairs.
    row_sel = picked[:, np.newaxis]
    col_sel = picked[np.newaxis, :]
    return picked, debiased_probs[row_sel, col_sel]


### Tests

#### Generate Batch

Validate that this method of generating the batches works as expected.

X_train: random word indices for which we will update the weights/vectors

Y_train: the expected normalized dot product between the vectors (see create_debiased_matrix)

In [22]:
# Draw 5 distinct indices from range(10), using the same np.random.choice call
# shape as generate_batch.
x_train = np.random.choice(10,5,replace=False)

In [23]:
x_train

array([9, 0, 2, 5, 6])

In [21]:
# 10x10 stand-in matrix whose entry [r, c] equals 10*r + c, so every sliced
# value reveals the (row, col) it came from.
probs = np.arange(100).reshape(10,10)
probs

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [24]:
# Same np.ix_ selection that generate_batch performs: keep only the rows and
# columns listed in x_train, in that order.
y_train = probs[np.ix_(x_train,x_train)]
y_train

array([[99, 90, 92, 95, 96],
       [ 9,  0,  2,  5,  6],
       [29, 20, 22, 25, 26],
       [59, 50, 52, 55, 56],
       [69, 60, 62, 65, 66]])

#### Negative Sampling

Verify approximation of matrix values using negative sampling

Just to be safe, we'll use a pair of words for which the probabilities should not have been altered (appropriately gendered words).

In [156]:
# Vocabulary indices of the two test words.
# NOTE: `wiki_model` is created by the Word2Vec.load(...) cell that appears
# further down in this notebook; that cell has to be executed before this one.
p_i = wiki_model.wv.vocab['prince'].index
p_j = wiki_model.wv.vocab['queen'].index

In [157]:
p_i

822

In [158]:
p_j

1002

In [159]:
debiased_probs[p_i,p_j]

9.312238735049531e-06

In [161]:
np.log(debiased_probs[p_i,p_j])

-11.584181029948848

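The above will be the value that we're aiming for. From the paper: "We define Negative sampling
(NEG) by the objective

$$\log \sigma\left({v'_{w_O}}^{\top} v_{w_I}\right) + \sum_{i=1}^{k} \mathbb{E}_{w_i \sim P_n(w)}\left[\log \sigma\left(-{v'_{w_i}}^{\top} v_{w_I}\right)\right]$$

which is used to replace every $\log P(w_O \mid w_I)$ term in the Skip-gram objective"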

Negative sampling approximation:

In [219]:
# Pretrained vector stored at row p_i (the index looked up for 'prince').
i_vec = wiki_model.wv.vectors[p_i]

In [220]:
# Pretrained vector stored at row p_j (the index looked up for 'queen').
j_vec = wiki_model.wv.vectors[p_j]

In [225]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)) for a scalar or NumPy array.

    Evaluated as exp(-log(1 + exp(-x))) through ``np.logaddexp``, which never
    forms exp(-x) directly. The naive expression overflows (and emits a
    RuntimeWarning) for large negative ``x``; this form returns 0.0 there and
    1.0 for large positive ``x`` without warnings.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), computed stably.
    return np.exp(-np.logaddexp(0., -x))

In [226]:
np.dot(i_vec,j_vec)

605.4208

In [227]:
# Positive-pair term of the negative-sampling objective: log(sigmoid(i . j)).
# The raw dot product shown above is ~605, so the sigmoid saturates at 1 and
# this evaluates to 0.
first_term = np.log(sigmoid(np.dot(i_vec,j_vec)))

In [228]:
first_term

0.0

From the paper, "Our experiments indicate that values
of k in the range 5–20 are useful for small training datasets, while for large datasets the k can be as
small as 2–5."

In [237]:
# Accumulate k negative-sample terms log(sigmoid(-r . j) + eps), where r is the
# vector of a uniformly drawn word index other than p_i.
second_term = 0.
k = 20
while k > 0:
    r_i = np.random.randint(vocab_size)
    if r_i == p_i:
        # Drew the positive word itself: redraw without using up a sample.
        continue
    r_vec = wiki_model.wv.vectors[r_i]
    new_val = np.log(sigmoid(np.dot(-r_vec, j_vec)) + eps)
    print(new_val)
    second_term += new_val
    k -= 1
        

-9.210340371976182
-9.21033786527982
-8.364109242160499
-0.14256124519539537
-9.210340371976182
9.99950002835643e-05
9.999500033329732e-05
9.999496631480276e-05
-9.210340371976182
9.999500033041103e-05
-0.01238183780225174
-9.210340371976182
-9.13753330352398
6.223938614676579e-05
-9.210340371976182
-9.210340371976182
-8.254129579052506
-9.210340371976182
-9.210340371975958
-9.210340226760936


In [238]:
second_term

-118.01365405623122

In [239]:
first_term + second_term

-118.01365405623122

**Not a good approximation!**

In [5]:
# Load the pretrained gensim model. Its `wv.vocab` and `wv.vectors` are read by
# the negative-sampling check and used to initialise the trainable embeddings.
# NOTE: cells that appear earlier in this notebook already reference
# `wiki_model`, so on a fresh kernel this cell must be run before them.
wiki_model = Word2Vec.load("english-wikipedia-articles-20170820-models/enwiki_2017_08_20_fasttext.model")

In [92]:
# Keep only the first vocab_size rows of the pretrained vectors; these are the
# initial values of the `embeddings` variable in the TensorFlow graph.
original_weights = wiki_model.wv.vectors[:vocab_size,:]


In [12]:
original_weights.shape

(22000, 100)

In [241]:
# Build the TF1 graph that fine-tunes the pretrained embeddings so that, for a
# batch of words, the negative-sampling estimate pred_vals[i, j] matches
# log(target[i, j] + eps).
graph = tf.Graph()

with graph.as_default():
    with tf.name_scope('inputs'):
        # Word indices of the batch and the matching (batch x batch) target slice.
        x_train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        y_train_inputs = tf.placeholder(tf.float32, shape=[batch_size,batch_size])
    with tf.device('/cpu:0'):
        with tf.name_scope('embeddings'):
            # initialize the trainable embeddings with the biased weights
            embeddings = tf.Variable(original_weights)
        embed = tf.nn.embedding_lookup(embeddings,x_train_inputs)
        # Pairwise dot products between all batch vectors.
        dot_prod = tf.tensordot(embed,tf.transpose(embed),1)
        # Each row of the dot-product matrix is divided by that row's L2 norm.
        # NOTE(review): this rescales rows of the Gram matrix rather than the
        # embedding vectors themselves — confirm this is the intended normalisation.
        dot_prod_norm = tf.sqrt(tf.reduce_sum(tf.square(dot_prod), 1, keepdims=True))
        dot_prod_normed = dot_prod / dot_prod_norm

        # Positive term: log(sigmoid(D[i, j]) + eps).
        pos_term = tf.log(tf.sigmoid(dot_prod_normed) + eps)
        # Negative term for every entry: log(sigmoid(-D[m, j]) + eps).
        neg_samples = tf.log(tf.sigmoid(-dot_prod_normed) + eps)
        # For each (i, j) add the sum over all m != i of neg_samples[m, j].
        # A (1 - identity) mask times neg_samples yields exactly those sums in one
        # matmul, instead of emitting batch_size**2 gather/sparse/add op groups
        # from a nested Python loop.
        off_diagonal = tf.constant(1.0 - np.eye(batch_size), dtype=dot_prod_normed.dtype)
        neg_samples_sum = tf.matmul(off_diagonal, neg_samples, name='sum_neg_samples')
        pred_vals = pos_term + neg_samples_sum
    with tf.name_scope('loss'):
        # L1 distance between the log targets and the predicted log values.
        log_val = tf.log(y_train_inputs + eps)
        loss = tf.reduce_sum(tf.abs(log_val - pred_vals))

    tf.summary.scalar('loss', loss)

    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # Row-wise L2-normalised copy of the embeddings, evaluated after training.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm

    merged = tf.summary.merge_all()

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

In [152]:
# Draw one random batch (indices and target slice) to evaluate the untrained loss.
xt, yt = generate_batch(batch_size)

In [153]:
# Evaluate the loss once on the sampled batch with freshly initialised
# variables, i.e. before any optimisation step has been taken.
feed_dict = {
    x_train_inputs: xt,
    y_train_inputs: yt,
}
with tf.Session(graph=graph) as sess:
    sess.run(init)
    res = sess.run(loss, feed_dict=feed_dict)
    print(res)

151823.03


**Note the exceptionally high loss before training...**

In [None]:
num_steps = 100001

# Checkpoint directory. No visible cell defines `log_dir`, so fall back to a
# local default while still honouring a value assigned in another cell.
log_dir = globals().get('log_dir', 'log')
if not os.path.isdir(log_dir):
    os.makedirs(log_dir)

# NOTE(review): the output recorded for this cell shows the average loss
# rising over time; the step size / loss scale probably needs revisiting.
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    # `range` works on both Python 2 and Python 3 (`xrange` is Python 2 only).
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size)
        feed_dict = {x_train_inputs: batch_inputs, y_train_inputs: batch_labels}

        # One optimisation step; also evaluates the merged summaries and the
        # batch loss. No summary writer is opened, so `summary` is unused.
        _, summary, loss_val = session.run([optimizer, merged, loss],
                                           feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000
            # batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

    final_embeddings = normalized_embeddings.eval()

    # Save the model for checkpoints.
    saver.save(session, os.path.join(log_dir, 'model.ckpt'))

Initialized
('Average loss at step ', 0, ': ', 14717.12109375)
('Average loss at step ', 2000, ': ', 14841.26538330078)
('Average loss at step ', 4000, ': ', 14850.408685058594)
('Average loss at step ', 6000, ': ', 14862.140109863281)
('Average loss at step ', 8000, ': ', 14877.74851123047)
('Average loss at step ', 10000, ': ', 14889.898091308594)
('Average loss at step ', 12000, ': ', 14896.11196875)
('Average loss at step ', 14000, ': ', 14908.872233398437)
('Average loss at step ', 16000, ': ', 14923.580012695313)
('Average loss at step ', 18000, ': ', 14936.589776367187)
('Average loss at step ', 20000, ': ', 14942.658600585937)
('Average loss at step ', 22000, ': ', 14961.238532714844)
('Average loss at step ', 24000, ': ', 14969.888884765625)
('Average loss at step ', 26000, ': ', 14978.608094726562)
('Average loss at step ', 28000, ': ', 14996.662852539062)
('Average loss at step ', 30000, ': ', 15009.816293457032)
('Average loss at step ', 32000, ': ', 15015.808973632813)
('A