In [3]:
from keras.models import Model
from keras.layers import Embedding, Dense, Input, Reshape, Flatten, dot
import numpy as np
import keras.backend as K
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import itertools
import tensorflow as tf

Using TensorFlow backend.


In [4]:
debiased_probs = np.load('debiased_matrix.npy')

In [5]:
vocab_size = len(debiased_probs)
# vocab_size = 22000

In [6]:
vector_dim = 100

In [7]:
input_size = 22

In [8]:
if not input_size or vocab_size % input_size:
    print('ERROR: input size must be divisible by vocab size')

In [9]:
xs = np.arange(vocab_size)
x_train = np.array([xs[i * input_size:(i + 1) * input_size] for i in range((len(xs) + input_size - 1) // input_size )])


In [10]:
x_train.shape

(1000, 22)

In [11]:
y_train = np.array([debiased_probs[x[0]:x[-1]+1,x[0]:x[-1]+1] for x in x_train])


In [12]:
y_train.shape

(1000, 22, 22)

In [13]:
wiki_model = Word2Vec.load("english-wikipedia-articles-20170820-models/enwiki_2017_08_20_fasttext.model")



1. change this so it takes in a batch (either by passing in batches or changing the dimension of the input)

2. Have the output be the dot product of the embeddings of the batch (EX: if the list of words is x, and the embedding of each of the words is X, then the output is C = X dot X)

3. Have the loss function accumulate the loss for each value normalized with negative sampling (double four loop)

In [14]:
K.clear_session()
input_i = Input((input_size,))
embedding = Embedding(vocab_size, vector_dim, input_length=input_size, name='embedding')
vec_i = embedding(input_i)
vec_i = Reshape((vector_dim, input_size))(vec_i)
dot_product = dot([vec_i, vec_i], axes=1)
# dot_product = Reshape((1,))(dot_product)
output = Dense(1, activation='linear')(dot_product)

In [15]:
model = Model(input=[input_i], output=output)


  """Entry point for launching an IPython kernel.


In [18]:
def custom_loss(y_pred,y_true):
    loss = 0
    indices = np.arange(input_size)
    for i in indices:
        for j in indices:
            indices_tensor = tf.convert_to_tensor([[0,m,j] for m in np.where(indices!=i)[0]])
#             print(tf.shape(indices_tensor))
#             print(tf.shape(y_pred))
#             print(sess.run(indices_tensor[0]))
            neg_samples = K.sigmoid(-1*tf.gather_nd(y_pred,indices_tensor))
            neg_sample_sum = K.sum(K.log(neg_samples))
            curr_ind = tf.convert_to_tensor([[0,i,j]])
            pred = K.log(K.sigmoid(tf.gather_nd(y_pred,curr_ind))) + neg_sample_sum
            loss += (K.log(y_true) - pred)
    return loss

In [19]:
model.compile(optimizer='rmsprop', loss=custom_loss, metrics=['accuracy'])


In [20]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 22)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 22, 100)      2200000     input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 100, 22)      0           embedding[0][0]                  
__________________________________________________________________________________________________
dot_1 (Dot)                     (None, 22, 22)       0           reshape_1[0][0]                  
                                                                 reshape_1[0][0]                  
__________

In [21]:
original_weights = wiki_model.wv.vectors[:vocab_size,:]


In [22]:
layer_dict = dict([(layer.name, layer) for layer in model.layers])
layer_dict['embedding'].set_weights([original_weights])

In [23]:
model.save('untrained_reconstruction_model.h5')

In [24]:
model.save_weights('untrained_reconstruction_model_weights.h5')

In [25]:
model.fit(x_train,y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1b4f0e3f28>

In [26]:
model.save('trained_reconstruction_model.h5')

In [27]:
model.save_weights('trained_reconstruction_model_weights.h5')

In [28]:
weights = np.array(model.get_weights()[0])

In [29]:
with open('debiased_embedding.txt', 'w') as we:
    we.write('{} {}\n'.format(vocab_size,vector_dim))
    for i in range(vocab_size):
        w = wiki_model.wv.index2word[i]
        vec = weights[i]
        we.write('{} '.format(w))
        for v in vec:
            we.write(str(v) + ' ')
        we.write('\n')

In [30]:
model_debiased = KeyedVectors.load_word2vec_format('debiased_embedding.txt', binary=False)


In [187]:
# model_debiased.wv.save_word2vec_format('debiased_model_100.bin', binary=True)


  """Entry point for launching an IPython kernel.


In [191]:
# wiki_model.wv.save_word2vec_format('biased_model_full.txt', binary=False)


In [195]:
# !head -n 22001 'biased_model_full.txt' > 'biased_model.txt'

In [196]:
# model_biased = KeyedVectors.load_word2vec_format('biased_model.txt', binary=False)


In [197]:
# model_biased.wv.save_word2vec_format('fast_text_small.bin', binary=True)


  """Entry point for launching an IPython kernel.


In [198]:
# model_biased.most_similar(positive=['woman', 'king'], negative=['man'])               


[('queen', 0.7756180763244629),
 ('monarch', 0.7246657609939575),
 ('princess', 0.7197414040565491),
 ('prince', 0.7065383195877075),
 ('empress', 0.6887034177780151),
 ('regent', 0.6676155924797058),
 ('consort', 0.6602832078933716),
 ('marriage', 0.6249816417694092),
 ('constantine', 0.6138389110565186),
 ('emperor', 0.6067585945129395)]

In [31]:
model_debiased.most_similar(positive=['woman', 'king'], negative=['man'])               


[('queen', 0.7746890187263489),
 ('monarch', 0.7205826640129089),
 ('princess', 0.7195833325386047),
 ('prince', 0.7071539759635925),
 ('empress', 0.6903455257415771),
 ('regent', 0.6701014041900635),
 ('consort', 0.6536059975624084),
 ('marriage', 0.6225910186767578),
 ('constantine', 0.6129751801490784),
 ('emperor', 0.6064161658287048)]

In [200]:
# model_biased.most_similar(positive=['baghdad', 'england'], negative=['london'])               


[('mosul', 0.7500249147415161),
 ('syria', 0.7329857349395752),
 ('iraq', 0.7099663019180298),
 ('yemen', 0.703923761844635),
 ('libya', 0.6781851649284363),
 ('afghanistan', 0.6734411716461182),
 ('iraqi', 0.6579854488372803),
 ('aleppo', 0.654990017414093),
 ('tripoli', 0.6461622714996338),
 ('damascus', 0.64577716588974)]

In [32]:
model_debiased.most_similar(positive=['baghdad', 'england'], negative=['london'])               


[('mosul', 0.7513368129730225),
 ('syria', 0.7309308052062988),
 ('iraq', 0.7116736173629761),
 ('yemen', 0.7084739208221436),
 ('libya', 0.676697850227356),
 ('afghanistan', 0.6735158562660217),
 ('iraqi', 0.6566969156265259),
 ('aleppo', 0.651322066783905),
 ('tripoli', 0.6423690915107727),
 ('damascus', 0.6417893171310425)]

In [202]:
# model_biased.most_similar(positive=['woman', 'doctor'], negative=['man'])               


[('psychiatrist', 0.6780833601951599),
 ('nurse', 0.6779659986495972),
 ('dentist', 0.6075595617294312),
 ('teacher', 0.6025106906890869),
 ('psychologist', 0.5949655771255493),
 ('mistress', 0.5942846536636353),
 ('physician', 0.5904124975204468),
 ('counselor', 0.5747247338294983),
 ('tutor', 0.5684062838554382),
 ('professor', 0.5608705282211304)]

In [33]:
model_debiased.most_similar(positive=['woman', 'doctor'], negative=['man'])               


[('psychiatrist', 0.6768365502357483),
 ('nurse', 0.6762781143188477),
 ('dentist', 0.601752758026123),
 ('teacher', 0.6015279293060303),
 ('psychologist', 0.5945874452590942),
 ('mistress', 0.5942332148551941),
 ('physician', 0.5875041484832764),
 ('counselor', 0.5739505290985107),
 ('tutor', 0.5650268793106079),
 ('professor', 0.5599614381790161)]