In [173]:
from keras.models import Model
from keras.layers import Embedding, Dense, Input, Reshape, Flatten
import numpy as np
import keras.backend as K
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Load the precomputed debiased matrix from disk; it supplies the training
# targets for the embedding model built below.
debiased_probs = np.load('debiased_matrix.npy')

In [3]:
# Vocabulary size is the length of the matrix's first axis.
vocab_size = debiased_probs.shape[0]
vocab_size

22000

In [69]:
# One training input per vocabulary entry: the integer indices 0..vocab_size-1.
x_train = np.arange(vocab_size)

In [78]:
# Display the index array as a quick sanity check.
x_train

array([    0,     1,     2, ..., 21997, 21998, 21999])

In [79]:
# Target for input index i is row i of the transposed matrix, i.e. column i of
# debiased_probs.
# NOTE(review): presumably each column of debiased_probs is a probability
# distribution over the vocabulary — confirm against the code that built the matrix.
y_train = debiased_probs.T

In [80]:
# Size of each learned word vector (output dimension of the Embedding layer).
vector_dim = 100

In [143]:
# Reset the backend session so re-running this cell starts from a fresh graph
# and layer names are not suffixed by earlier runs.
K.clear_session()

# Each sample is a single integer word index.
input_word = Input(shape=(1,))

# Index -> dense vector lookup; the layer is named so it can be found later.
embedding = Embedding(
    input_dim=vocab_size,
    output_dim=vector_dim,
    input_length=1,
    name='embedding',
)(input_word)

# (batch, 1, vector_dim) -> (batch, vector_dim), as shown by model.summary().
flatten = Flatten()(embedding)

# Softmax over the whole vocabulary.
output = Dense(units=vocab_size, activation='softmax')(flatten)


In [144]:
# Wrap the layer graph in a trainable model. The functional API names these
# arguments `inputs`/`outputs`; the singular `input`/`output` spellings are
# legacy aliases that emit a deprecation warning in Keras 2 and are not accepted
# by later releases.
model = Model(inputs=input_word, outputs=output)


  """Entry point for launching an IPython kernel.


In [145]:
# 'kld' selects Kullback-Leibler divergence as the loss, comparing the softmax
# output with the target rows of y_train.
# NOTE(review): the 'accuracy' metric is of limited meaning when targets are
# soft distributions rather than one-hot labels — confirm it is wanted.
model.compile(optimizer='rmsprop', loss='kld', metrics=['accuracy'])

In [146]:
# Print the layer-by-layer shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
embedding (Embedding)        (None, 1, 100)            2200000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 22000)             2222000   
Total params: 4,422,000
Trainable params: 4,422,000
Non-trainable params: 0
_________________________________________________________________


In [179]:
# Fit the index -> distribution mapping: 50 passes over the data in
# mini-batches of 32 indices.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=50,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0xc23507f98>

In [180]:
# Persist the whole model to a single HDF5 file.
# NOTE(review): the file name says 100 epochs, but the visible fit call uses
# epochs=50 — confirm whether fit was run twice before saving.
model.save('model_100_epochs.h5')

In [181]:
# Also store the weights on their own in a separate HDF5 file.
model.save_weights('model_100_epochs_weights.h5')

In [182]:
# Pull the learned embedding matrix (vocab_size x vector_dim) out of the model.
# The layer is looked up by the explicit name it was given ('embedding') rather
# than by its position in model.get_weights(), so the result does not silently
# change if layers are added or reordered.
weights = np.array(model.get_layer('embedding').get_weights()[0])

In [149]:
# Predict the output distribution for word index 0. An explicit ndarray is
# passed as a batch of one sample; a bare Python list is ambiguous because
# Keras can also read a list as "one array per model input".
pred = model.predict(np.array([0]))

In [151]:
# Sanity check: the softmax output for one sample should sum to ~1.
pred[0].sum()

0.9999999

In [142]:
# Sanity check: sum of the first target row, to compare with the prediction sum.
y_train[0].sum()

0.9999999999999896

In [123]:
# Load the pretrained Wikipedia model; it supplies the index -> word mapping
# and serves as the "biased" baseline for the comparisons below.
# NOTE(review): the file name suggests a fastText model while it is loaded with
# Word2Vec.load — confirm this is the intended loader for that file.
wiki_model = Word2Vec.load("english-wikipedia-articles-20170820-models/enwiki_2017_08_20_fasttext.model")


In [184]:
# Export the learned embedding matrix in word2vec text format: a
# "<vocab_size> <vector_dim>" header line followed by one
# "<word> <v1> ... <vN>" line per row of `weights`.
# NOTE(review): assumes row i of `weights` belongs to
# wiki_model.wv.index2word[i] — confirm the debiased matrix was built in that
# same vocabulary order.
# The encoding is set explicitly because gensim reads word2vec text files as
# UTF-8 by default, whereas open() would otherwise use the platform's locale
# encoding and could fail or mis-encode non-ASCII words.
with open('debiased_embedding.txt', 'w', encoding='utf-8') as we:
    we.write('{} {}\n'.format(vocab_size, vector_dim))
    for i in range(vocab_size):
        w = wiki_model.wv.index2word[i]
        vec = weights[i]
        # Single-space separated values, without a trailing separator.
        we.write('{} {}\n'.format(w, ' '.join(str(v) for v in vec)))

In [185]:
# Read the exported text file back as gensim KeyedVectors so the debiased
# vectors can be queried with most_similar.
model_debiased = KeyedVectors.load_word2vec_format('debiased_embedding.txt', binary=False)


In [187]:
# Save the debiased vectors in binary word2vec format. `model_debiased` is
# already a KeyedVectors instance, so the method is called on it directly; the
# `.wv` attribute on KeyedVectors is only a deprecated self-alias that warns in
# gensim 3.x and is removed in gensim 4.
model_debiased.save_word2vec_format('debiased_model_100.bin', binary=True)


  """Entry point for launching an IPython kernel.


In [191]:
# Dump every vector of the original Wikipedia model in word2vec text format;
# the file is truncated to the shared vocabulary in a later cell.
wiki_model.wv.save_word2vec_format('biased_model_full.txt', binary=False)


In [189]:
# Count lines in the exported file: one header line plus one line per word.
! wc -l 'debiased_embedding.txt'

   22001 debiased_embedding.txt


In [195]:
!head -n 22001 'biased_model_full.txt' > 'biased_model.txt'

In [196]:
# Load the baseline vectors for the shared vocabulary. `limit` caps how many
# vectors are read, so loading works even if the header of the truncated file
# still declares the vocabulary size of the full model.
model_biased = KeyedVectors.load_word2vec_format('biased_model.txt', binary=False, limit=vocab_size)


In [197]:
# Save the baseline vectors in binary word2vec format. `model_biased` is
# already a KeyedVectors instance, so the method is called on it directly; the
# `.wv` attribute on KeyedVectors is only a deprecated self-alias that warns in
# gensim 3.x and is removed in gensim 4.
model_biased.save_word2vec_format('fast_text_small.bin', binary=True)


  """Entry point for launching an IPython kernel.


In [198]:
# Analogy query on the baseline vectors: king - man + woman.
model_biased.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)


[('queen', 0.7756180763244629),
 ('monarch', 0.7246657609939575),
 ('princess', 0.7197414040565491),
 ('prince', 0.7065383195877075),
 ('empress', 0.6887034177780151),
 ('regent', 0.6676155924797058),
 ('consort', 0.6602832078933716),
 ('marriage', 0.6249816417694092),
 ('constantine', 0.6138389110565186),
 ('emperor', 0.6067585945129395)]

In [199]:
# Same analogy on the debiased vectors: king - man + woman.
model_debiased.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)


[('isabella', 0.7068111300468445),
 ('deposed', 0.6896980404853821),
 ('constantine', 0.687233567237854),
 ('sigismund', 0.6663229465484619),
 ('prince', 0.6559014320373535),
 ('regent', 0.654486894607544),
 ('crowned', 0.6519841551780701),
 ('philip', 0.6483753323554993),
 ('iii', 0.6431190371513367),
 ('ferdinand', 0.6400526762008667)]

In [200]:
# Analogy query on the baseline vectors: england - london + baghdad.
model_biased.most_similar(
    positive=['baghdad', 'england'],
    negative=['london'],
)


[('mosul', 0.7500249147415161),
 ('syria', 0.7329857349395752),
 ('iraq', 0.7099663019180298),
 ('yemen', 0.703923761844635),
 ('libya', 0.6781851649284363),
 ('afghanistan', 0.6734411716461182),
 ('iraqi', 0.6579854488372803),
 ('aleppo', 0.654990017414093),
 ('tripoli', 0.6461622714996338),
 ('damascus', 0.64577716588974)]

In [201]:
# Same analogy on the debiased vectors: england - london + baghdad.
model_debiased.most_similar(
    positive=['baghdad', 'england'],
    negative=['london'],
)


[('syrian', 0.6814754009246826),
 ('governorate', 0.6789872646331787),
 ('arab', 0.6772698163986206),
 ('mahmoud', 0.6739144921302795),
 ('jordanian', 0.6720398664474487),
 ('masjid', 0.6697014570236206),
 ('sunni', 0.6684684753417969),
 ('wal', 0.6664949655532837),
 ('amr', 0.6647167205810547),
 ('amin', 0.6625057458877563)]

In [202]:
# Analogy query on the baseline vectors: doctor - man + woman.
model_biased.most_similar(
    positive=['woman', 'doctor'],
    negative=['man'],
)


[('psychiatrist', 0.6780833601951599),
 ('nurse', 0.6779659986495972),
 ('dentist', 0.6075595617294312),
 ('teacher', 0.6025106906890869),
 ('psychologist', 0.5949655771255493),
 ('mistress', 0.5942846536636353),
 ('physician', 0.5904124975204468),
 ('counselor', 0.5747247338294983),
 ('tutor', 0.5684062838554382),
 ('professor', 0.5608705282211304)]

In [203]:
# Same analogy on the debiased vectors: doctor - man + woman.
model_debiased.most_similar(
    positive=['woman', 'doctor'],
    negative=['man'],
)


[('scientist', 0.6410385370254517),
 ('psychiatrist', 0.6191622614860535),
 ('dr', 0.6027106046676636),
 ('dean', 0.5591259598731995),
 ('physician', 0.5469638109207153),
 ('prof', 0.5324706435203552),
 ('professor', 0.5117653608322144),
 ('researcher', 0.5073388814926147),
 ('psychologist', 0.5047338604927063),
 ('loren', 0.5018652081489563)]