In [399]:
from keras.models import Model
from keras.layers import Embedding, Dense, Input, Reshape, Flatten, dot, Add
import numpy as np
import keras.backend as K
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import itertools
import tensorflow as tf
import sys

In [416]:
debiased_mat_alt = np.load('debiased_matrix_alt_orig.npy')

In [417]:
vocab_size = len(debiased_mat_alt_2)
# vocab_size = 22000
vector_dim = 100
batch_size = 8
eps = 0.0001
k=20


In [418]:
wiki_model = Word2Vec.load("english-wikipedia-articles-20170820-models/enwiki_2017_08_20_fasttext.model")

In [419]:
gender_word_pairs = [('he','she'),('man','woman'),('his','her'),('himself','herself'), ('him','her'),('men','women'),('husband','wife'),('girl','boy'),('men','women'),('brother','sister'),('mother','father'),('aunt','uncle'),('grandfather','grandmother'),('son','daughter'),('waiter','waitress'),('niece','nephew')]
gender_word_pairs_simple = [w for p in gender_word_pairs for w in p] 
gw_dict = {w:i for (i,w) in enumerate(gender_word_pairs_simple)}

gender_inds = np.array([wiki_model.wv.vocab[w].index for w in gender_word_pairs_simple])


In [420]:
gender_inds

array([   13,    42,   276,   789,    17,    40,   551,  2127,    94,
          40,   402,   435,  1250,   526,  1020,  1218,   402,   435,
         627,  1077,   591,   386,  6642,  3005,  4021,  7034,   306,
         629, 21467, 20573, 10387,  5092])

In [421]:
gw_dict

{'aunt': 22,
 'boy': 15,
 'brother': 18,
 'daughter': 27,
 'father': 21,
 'girl': 14,
 'grandfather': 24,
 'grandmother': 25,
 'he': 0,
 'her': 9,
 'herself': 7,
 'him': 8,
 'himself': 6,
 'his': 4,
 'husband': 12,
 'man': 2,
 'men': 16,
 'mother': 20,
 'nephew': 31,
 'niece': 30,
 'she': 1,
 'sister': 19,
 'son': 26,
 'uncle': 23,
 'waiter': 28,
 'waitress': 29,
 'wife': 13,
 'woman': 3,
 'women': 17}

In [422]:
word_counts = np.array([wiki_model.wv.vocab[wiki_model.wv.index2word[i]].count for i in range(vocab_size)])
word_counts_power = np.power(word_counts,.75)
sampling_dist = np.true_divide(word_counts_power,np.sum(word_counts_power))

def sigmoid(x):
    return 1 / (1 + np.e ** -x)

wvs = np.load('english-wikipedia-articles-20170820-models/enwiki_2017_08_20_fasttext.model.wv.vectors.npy')
wvs = wvs[:vocab_size]

cvs = np.load('english-wikipedia-articles-20170820-models/enwiki_2017_08_20_fasttext.model.trainables.syn1neg.npy')
cvs = cvs[:vocab_size]



In [423]:
# generate batch data
def generate_batch(batch_size):
    indices_wv = np.random.choice(vocab_size,batch_size,replace=False)
    indices_cv = np.random.choice(vocab_size,batch_size,replace=False)
    y_probs = np.full((batch_size,batch_size),-1.)
    samples = np.random.choice(vocab_size,size=k,replace=False,p=sampling_dist)
    samples_2d = np.array([samples for i in range(batch_size)])
    for i,ind_i in enumerate(indices_wv):
        for j, ind_j in enumerate(indices_cv):
            if ind_j in gender_inds:
#                 ind_gw = gw_dict[wiki_model.wv.index2word[ind_j]]
#                 y_probs[i,j] = debiased_mat_alt[ind_i,ind_gw]
                y_probs[i,j] = debiased_mat_alt_2[ind_i,ind_j]

            else:
                v_prime_wo = wvs[ind_i]
                v_wi = cvs[ind_j]
                first_term = np.log(sigmoid((np.matmul(v_prime_wo,v_wi))))
                second_term = np.sum([np.log(sigmoid(-1*np.matmul(wvs[s],v_wi))) for s in samples])
                y_probs[i,j] = np.exp(first_term + second_term)
    return indices_wv, indices_cv, samples_2d, y_probs


In [424]:
indices_wv, indices_cv, samples_2d, yt = generate_batch(batch_size)

In [425]:
original_weights = wiki_model.wv.vectors[:vocab_size,:]
original_cv_weights = cvs[:vocab_size,:]

In [426]:
trained_wv_weights = np.load('wvs_embed_trained.npy')

In [427]:
trained_cv_weights = np.load('cvs_embed_trained.npy')

Per Mikolov's paper "Distributed Representations of Words and Phrases and their Compositionality," we use the following definition of each log conditional probability.

$$ \log P(w_O|w_I) \approx \log \sigma ({v'_{wo}}^T v_{wI}) + \sum_{i=1}^{k} [\log {\sigma ({{-v'_{wi}}^T v_{wI}})}] $$

https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf

In [428]:
graph = tf.Graph()

with graph.as_default():
    with tf.name_scope('inputs'):
        indices_wv_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        indices_cv_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        samples_inputs = tf.placeholder(tf.int32, shape=[batch_size,k])
        y_train_inputs = tf.placeholder(tf.float32, shape=[batch_size,batch_size])
    with tf.device('/cpu:0'):
        with tf.name_scope('embeddings'):
            # initialize the the biased embedding weights
            embeddings_wv = tf.Variable(trained_wv_weights)
            embeddings_cv = tf.Variable(trained_cv_weights)
        embed_wv = tf.nn.embedding_lookup(embeddings_wv,indices_wv_inputs)
        embed_cv = tf.nn.embedding_lookup(embeddings_cv,indices_cv_inputs)
        embed_sv = tf.nn.embedding_lookup(embeddings_wv,samples_inputs)
        prod = tf.matmul(embed_wv, tf.transpose(embed_cv))
        first_term = tf.math.log_sigmoid(prod)
        embed_sv = tf.reshape(embed_sv,[vector_dim,batch_size,k])
        prod_2 = tf.tensordot(embed_cv, -1*embed_sv, axes=[[1],[0]])
        prod_2_log_sig = tf.math.log_sigmoid(prod_2)
        second_term = tf.reduce_sum(prod_2_log_sig,2)
        pred_vals = first_term + second_term
    with tf.name_scope('loss'):
        log_val = tf.log(y_train_inputs + eps)
        loss = tf.reduce_sum(tf.abs(log_val - pred_vals))
    
    tf.summary.scalar('loss', loss)
    
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(0.0003).minimize(loss)
    
#     norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
#     normalized_embeddings = embeddings / norm
    
    merged = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()     

In [429]:
indices_wv, indices_cv, samples, yt = generate_batch(batch_size)

In [430]:
feed_dict = {indices_wv_inputs: indices_wv, indices_cv_inputs: indices_cv, samples_inputs: samples, y_train_inputs: yt}
with tf.Session(graph=graph) as sess:
    init.run()
    res = sess.run(loss, feed_dict)
    print(res)

167.31906


In [431]:
num_steps = 1000001

with tf.Session(graph=graph) as session:
# Open a writer to write summaries.

# We must initialize all variables before we use them.
    init.run()
    print('Initialized')
    min_loss = sys.maxsize
    average_loss = 0
    for step in range(num_steps):
        indices_wv, indices_cv, samples, yt = generate_batch(batch_size)
        feed_dict = {indices_wv_inputs: indices_wv, indices_cv_inputs: indices_cv, samples_inputs: samples, y_train_inputs: yt}



      # Define metadata variable.
        run_metadata = tf.RunMetadata()

        _, summary, loss_val = session.run([optimizer, merged, loss],
                                         feed_dict=feed_dict,
                                         run_metadata=run_metadata)
        average_loss += loss_val

      # Add returned summaries to writer in each step.
      # Add metadata to visualize the graph for the last run.

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
        # The average loss is an estimate of the loss over the last 2000
        # batches.
            print('Average loss at step ', step, ': ', average_loss)
            if average_loss < min_loss:
                print('New min loss, saving embedding')
                wvs_embed_trained = embeddings_wv.eval()

                cvs_embed_trained = embeddings_cv.eval()
                
            average_loss = 0

    wvs_embed_trained = embeddings_wv.eval()

    cvs_embed_trained = embeddings_cv.eval()

#     final_embeddings = normalized_embeddings.eval()

    # Save the model for checkpoints.
    saver.save(session, './model_0')
    

Initialized
('Average loss at step ', 0, ': ', 118.79475402832031)
New min loss, saving embedding
('Average loss at step ', 2000, ': ', 166.43611096572877)
New min loss, saving embedding
('Average loss at step ', 4000, ': ', 159.29424289131165)
New min loss, saving embedding


KeyboardInterrupt: 

In [None]:
np.save('wvs_embed_trained_3', wvs_embed_trained)

In [None]:
np.save('cvs_embed_trained_3', cvs_embed_trained)

In [None]:
wvs_embed_trained_norm_3 = np.sqrt(np.sum(np.square(wvs_embed_trained_3), 1, keepdims=True))
wvs_embed_trained_normalized_embeddings_3 = wvs_embed_trained_2 / wvs_embed_trained_norm_3


In [None]:
np.save('wvs_embed_trained_normalized_embeddings_3', wvs_embed_trained_normalized_embeddings_3)
