In [None]:
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import

In [None]:
import sys, os, io
import numpy as np
import pandas as pd

In [None]:
from gensim.models import Word2Vec

In [None]:
from keras import backend as K
from keras.models import load_model

In [None]:
# Placeholder: replace with the directory the notebook should run from.
wd = "[Work Directory]"
# Switch the process working directory.
# NOTE(review): presumably needed so the `model_template` package imported in the next cell is importable — confirm.
os.chdir(wd)

In [None]:
from model_template.losses import squared_error
from model_template.noise_layers import GumbelNoise

## configuration

### path to the original embedding

In [None]:
# Placeholder: location of the saved gensim Word2Vec model, read by Word2Vec.load() below.
path_w2v = "[path to the original word2vec embedding by gensim]"

### path to the output model

In [None]:
# Placeholder: location of the saved Keras model, read by load_model() below.
path_out_model = "[path to the output model by Keras]"

## load w2v model and keras model

In [None]:
# Load the original word2vec model from `path_w2v`; it is the source of the word vectors used in the tests below.
model_w2v = Word2Vec.load(path_w2v)

In [None]:
# Load the saved Keras model. The project-defined loss and layer are handed over through
# `custom_objects` under the names "squared_error" and "GumbelNoise".
model = load_model(path_out_model, custom_objects={"squared_error":squared_error, "GumbelNoise":GumbelNoise})

## SandBox

### Encoder

In [None]:
# Fetch, by name, the two layers that delimit the encoder:
# `input_x` supplies the input tensor and `gumbel_softmax` supplies the outputs read back below.
input_layer = model.get_layer(name="input_x")
output_layer = model.get_layer(name="gumbel_softmax")

In [None]:
# N_m controls how many outputs are collected from `output_layer` and how many
# `decoder_%d` layers are looked up further down.
# NOTE(review): this counts the layer's *outbound* nodes through a private Keras attribute,
# whereas get_output_at() is indexed by node — confirm both counts agree for this model.
N_m = len(output_layer._outbound_nodes)

In [None]:
# Inputs of the backend function: the model input plus the learning-phase flag.
input_tensor = [input_layer.input, K.learning_phase()]
# Outputs of the backend function: the output tensor of `output_layer` at node 0 .. N_m-1.
output_tensor = list(map(output_layer.get_output_at, range(N_m)))

In [None]:
# Backend function mapping [input batch, learning phase] to the list of `output_tensor` values;
# encoder() below wraps it.
encoder_core = K.function(inputs=input_tensor, outputs=output_tensor)
def encoder(vec_original):
    """Encode an embedding vector into discrete code indices.

    Runs ``encoder_core`` with the learning phase set to 0 (prediction) and
    takes the argmax of every array it returns.

    Parameters
    ----------
    vec_original : numpy.ndarray
        Vector to encode. A 1-D input is promoted to a batch of one before
        it is passed to ``encoder_core``.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per output of ``encoder_core``: the argmax
        of that output, taken over the flattened array.
    """
    PREDICTION_PHASE = 0
    if vec_original.ndim == 1:
        vec_original = np.expand_dims(vec_original, axis=0)
    lst_vec_w_enc = encoder_core(inputs=[vec_original, PREDICTION_PHASE])
    # Materialise the indices in a list: on Python 3 `map` is lazy, so
    # np.array(map(...)) would wrap the iterator in a 0-d object array.
    # NOTE(review): np.argmax without `axis` flattens each output, which is
    # only a per-code-book index when the batch holds a single vector.
    w_enc = np.array([np.argmax(vec_enc) for vec_enc in lst_vec_w_enc])
    return w_enc

### Decoder

In [None]:
# Fetch the layers named decoder_0 .. decoder_{N_m-1} from the loaded model.
lst_decoder_layer = [model.get_layer(name="decoder_%d" % i) for i in range(N_m)]

In [None]:
# Stack the first weight array of every decoder layer; decoder() indexes the result as [m, k].
# presumably shaped (N_m, number of codes, embedding dimension) — TODO confirm against the model definition
arry_decoder_params = np.array([layer.get_weights()[0] for layer in lst_decoder_layer])

In [None]:
def decoder(vec_encoded):
    """Restore a vector from its code indices.

    Parameters
    ----------
    vec_encoded : iterable of int
        Code indices; the entry at position ``m`` selects
        ``arry_decoder_params[m, k]``.

    Returns
    -------
    numpy.ndarray
        Sum of the selected rows, accumulated in a float array of length
        ``arry_decoder_params.shape[-1]``.
    """
    n_dim = arry_decoder_params.shape[-1]
    vec_restored = np.zeros(n_dim)
    for idx_book, idx_code in enumerate(vec_encoded):
        vec_restored += arry_decoder_params[idx_book, idx_code]
    return vec_restored

## test with arbitrary wordset

In [None]:
# Comma-separated list of words to inspect; it is split on "," in the next cell.
WORDS = "男,女,王,女王"

In [None]:
# Membership is checked on the KeyedVectors (`.wv`) when the model has them:
# model-level `word in model` was deprecated in gensim 3.x and removed in 4.0.
# The getattr fallback keeps very old gensim models (no `.wv`) working.
w2v_vectors = getattr(model_w2v, "wv", model_w2v)
lst_word = WORDS.split(",")
# Keep only the words that exist in the embedding vocabulary.
lst_word = [word for word in lst_word if word in w2v_vectors]

### display original embedding

In [None]:
# Vectors are read from the KeyedVectors (`.wv`) when the model has them:
# model-level `model[word]` was deprecated in gensim 3.x and removed in 4.0.
# The getattr fallback keeps very old gensim models (no `.wv`) working.
w2v_vectors = getattr(model_w2v, "wv", model_w2v)
lst_vec_w = [w2v_vectors[word] for word in lst_word]
# One row per word, one column per embedding dimension.
pd.DataFrame(index=lst_word, data=lst_vec_w)

### encode specified words into code-book representation

In [None]:
# Run every original vector through encoder(), keeping the order of `lst_word`.
lst_vec_w_enc = list(map(encoder, lst_vec_w))
# One row per word, one column per entry of its code vector.
pd.DataFrame(lst_vec_w_enc, index=lst_word)

### decode(=restore) code-book representation

In [None]:
# Restore a dense vector from every code vector with decoder(), in the same order as `lst_word`.
lst_vec_w_dec = [decoder(vec_w_enc) for vec_w_enc in lst_vec_w_enc]

### How similar are the original embeddings and the restored ones?

In [None]:
def cosine_similarity(vec_x, vec_y):
    """Return the cosine similarity of two vectors.

    The inputs are left untouched: normalisation happens on new arrays, so
    callers can pass views into an embedding matrix, read-only arrays or
    integer arrays safely.

    Parameters
    ----------
    vec_x, vec_y : array-like
        Vectors of equal length.

    Returns
    -------
    float
        Dot product of the two L2-normalised vectors. A zero vector yields
        ``nan``, as dividing by its zero norm does in numpy.
    """
    vec_x = np.asarray(vec_x)
    vec_y = np.asarray(vec_y)
    # Out-of-place division: `/=` would overwrite the caller's data.
    unit_x = vec_x / np.sqrt(np.sum(vec_x**2))
    unit_y = vec_y / np.sqrt(np.sum(vec_y**2))
    return np.sum(unit_x*unit_y)

In [None]:
# Cosine similarity between each original vector and its restored counterpart, paired by position.
lst_sim = [cosine_similarity(vec_w, vec_w_dec) for vec_w, vec_w_dec in zip(lst_vec_w, lst_vec_w_dec)]

In [None]:
# Show the per-word similarities as a one-column table indexed by word.
pd.DataFrame(index=lst_word, data={"similarity":lst_sim})

### can we compute word similarity in code-book dimension?

In [None]:
def discrete_similarity(vec_x, vec_y):
    """Return the fraction of positions at which two code vectors agree.

    Parameters
    ----------
    vec_x, vec_y : numpy.ndarray
        Code vectors compared element by element.

    Returns
    -------
    float
        Number of equal positions divided by the size of ``vec_x``.
    """
    is_same = vec_x == vec_y
    return is_same.sum() / vec_x.size

In [None]:
from itertools import product

In [None]:
n_word = len(lst_word)
# Score every ordered pair of code vectors (row-major: the first vector varies slowest),
# then fold the flat list into an n_word x n_word matrix.
mat_sim = np.array([
    discrete_similarity(vec_w1, vec_w2)
    for vec_w1 in lst_vec_w_enc
    for vec_w2 in lst_vec_w_enc
]).reshape((n_word, n_word))

In [None]:
# Label both axes of the similarity matrix with the words and display it.
df_ = pd.DataFrame(mat_sim, index=lst_word, columns=lst_word)
df_

### compare with word similarity in original space...

In [None]:
# Pairwise cosine similarity of the original embeddings.
# np.vstack copies the vectors, and the rows are L2-normalised here explicitly,
# so the matrix is a cosine similarity regardless of whether the vectors in
# `lst_vec_w` have already been normalised elsewhere.
mat_w = np.vstack(lst_vec_w)
mat_w = mat_w / np.linalg.norm(mat_w, axis=1, keepdims=True)
mat_sim = mat_w.dot(mat_w.T)
# Label both axes with the words and display the matrix.
df_ = pd.DataFrame(index=lst_word, data=mat_sim)
df_.columns = lst_word
df_