In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten
from keras.layers import GRU, Conv2D, MaxPooling2D
from keras.layers import Input, Reshape, Dot
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
# from keras.utils.vis_utils import plot_model
import keras
import keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def conv_model(n_input, n_output, n_enc_units, n_dec_units):
    """Build a CNN-encoder / GRU-decoder network plus its two inference halves.

    The encoder takes a ``(13, 309, 1)`` input, runs it through two
    Conv2D + MaxPooling2D stages, flattens the result and projects it with a
    ReLU ``Dense`` layer to ``n_dec_units`` values. That vector is the initial
    state of the decoder GRU, which reads a ``(None, 309)`` sequence and emits
    a 309-way softmax at every step.

    Args:
        n_input: Not read by this function.
        n_output: Not read by this function.
        n_enc_units: Not read by this function.
        n_dec_units: Width of the encoder output and of the decoder GRU.

    Returns:
        A tuple ``(model, encoder_model, decoder_model)``:
        ``model`` maps ``[root_word_input, target_word_input]`` to the softmax
        sequence; ``encoder_model`` maps ``root_word_input`` to the encoded
        state; ``decoder_model`` maps ``[target_word_input, state]`` to
        ``[softmax sequence, new state]`` and shares the GRU and Dense layers
        with ``model``.
    """
    # ---- encoder -----------------------------------------------------------
    root_word_input = Input(shape=(13, 309, 1), name="root_word_input")

    conv_1 = Conv2D(16, (3, 3), padding='same', activation='relu')(root_word_input)
    pool_1 = MaxPooling2D(2, 2)(conv_1)
    conv_2 = Conv2D(8, (3, 3), padding='same', activation='relu')(pool_1)
    pool_2 = MaxPooling2D(2, 2)(conv_2)
    flattened = Flatten()(pool_2)

    encoded_state = Dense(n_dec_units, activation='relu')(flattened)

    # ---- decoder used during training --------------------------------------
    decoder_inputs = Input(shape=(None, 309), name="target_word_input")
    decoder_gru = GRU(n_dec_units, return_sequences=True, return_state=True, name="decoder_gru")
    train_sequence, _ = decoder_gru(decoder_inputs, initial_state=encoded_state)

    decoder_dense = Dense(309, activation='softmax', name="train_output")
    train_probabilities = decoder_dense(train_sequence)

    model = Model([root_word_input, decoder_inputs], train_probabilities)
    encoder_model = Model(root_word_input, encoded_state)

    # ---- decoder used at inference time: the state is fed in explicitly -----
    decoder_state_input_h = Input(shape=(n_dec_units,))
    step_sequence, next_state = decoder_gru(decoder_inputs, initial_state=decoder_state_input_h)
    step_probabilities = decoder_dense(step_sequence)
    decoder_model = Model(
        [decoder_inputs, decoder_state_input_h],
        [step_probabilities, next_state],
    )

    return model, encoder_model, decoder_model


In [3]:
def embedding_model(input_size, output_size, embed_size):
    """Build a three-layer dense network and a sub-model exposing its embedding.

    Args:
        input_size: Width of the ``context_word`` input vector.
        output_size: Width of the final ReLU output layer.
        embed_size: Width of the tanh embedding layer.

    Returns:
        ``(model, em_model)``: ``model`` maps the input to the final output
        layer, ``em_model`` maps the same input to the tanh embedding layer.
        Both share the same layers.
    """
    context_word = Input(shape=(input_size,), name="context_word")
    hidden = Dense(256, activation='relu')(context_word)
    embedding = Dense(embed_size, activation='tanh')(hidden)
    prediction = Dense(output_size, activation='relu')(embedding)
    return Model(context_word, prediction), Model(context_word, embedding)

In [4]:
def cosine_loss(yTrue, yPred):
    """Return the sum of squared element-wise differences of the two tensors.

    Despite its name, this function does not compute a cosine similarity;
    it reduces ``(yTrue - yPred) ** 2`` with ``K.sum`` over all elements.
    """
    squared_error = K.square(yTrue - yPred)
    return K.sum(squared_error)

In [54]:
def embedding_model2(input_size, output_size, embed_size):
    """Build a pair-scoring network over (context, target) input vectors.

    Both inputs go through the *same* 200-unit sigmoid ``Dense`` layer; the two
    hidden vectors are concatenated, reduced by a 20-unit tanh layer and
    scored by a single tanh unit.

    Args:
        input_size: Width of both the ``context_word`` and ``target_word`` inputs.
        output_size: Not read by this function.
        embed_size: Not read by this function.

    Returns:
        ``(model, con_model, tar_model)``: ``model`` maps
        ``[context_word, target_word]`` to the scalar score; ``con_model`` and
        ``tar_model`` map their respective input to the shared hidden layer.
    """
    context_word = Input(shape=(input_size,), name="context_word")
    target_word = Input(shape=(input_size,), name="target_word")

    shared_dense = Dense(200, activation='sigmoid')
    # Instantiated but never applied to any tensor in this function.
    unused_linear = Dense(200, activation='linear')

    # The same layer object (hence the same weights) encodes both inputs.
    context_hidden = shared_dense(context_word)
    target_hidden = shared_dense(target_word)

    merged = Concatenate(axis=1)([context_hidden, target_hidden])
    print(merged.shape)

    merged = Dense(20, activation='tanh')(merged)
    # NOTE(review): the score unit is tanh, so it can be negative — confirm
    # this matches the loss the caller compiles the model with.
    score = Dense(1, activation='tanh')(merged)

    model = Model([context_word, target_word], score)
    con_model = Model(context_word, context_hidden)
    tar_model = Model(target_word, target_hidden)
    return model, con_model, tar_model

In [6]:
def evaluate(final_embedding, word2int, embed_size):
    """Load embeddings into a ``GensimWrapper`` and print its evaluation scores.

    Args:
        final_embedding: Embeddings handed to ``GensimWrapper.set_embeddings``.
        word2int: Word-to-index mapping handed to ``set_embeddings``.
        embed_size: First constructor argument of ``GensimWrapper``.

    Prints every key of the evaluation result with its value formatted as a
    two-decimal percentage, all on one line, followed by a newline.
    """
    wrapper = GensimWrapper(embed_size, 0, log=False)
    wrapper.set_embeddings(word2int, final_embedding)
    scores = wrapper.evaluate()
    for name in scores:
        entry = "{0}: {1:.2f}%".format(name, scores[name])
        print(entry, end=' ')
    print()

In [7]:
def normalize(embeddings):
    """Scale every row of ``embeddings`` to unit L2 norm.

    Args:
        embeddings: 2-D array whose rows are the vectors to normalise.

    Returns:
        An array of the same shape in which every non-zero row has L2 norm 1.
        Rows that are entirely zero are returned unchanged (all zeros) rather
        than being turned into NaNs by a division by zero.
    """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # `norms` is a fresh array, so patching it in place is safe; dividing a
    # zero row by 1 leaves it at zero instead of producing NaN.
    norms[norms == 0] = 1
    return embeddings / norms

In [8]:
# Load the corpus and derive the word-level and character-level lookup tables.
words = read_file()
vocab, word2int, int2word = build_vocab(words)
word2freq = get_frequency(words, word2int, int2word)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()
# NOTE(review): .75 is presumably the smoothing exponent applied to the unigram
# frequencies for negative sampling — confirm against ns_sample.
ns_unigrams = ns_sample(word2freq, word2int, int2word, .75)
n_chars = 11 + 2  # 13; presumably the same 13 hard-coded in conv_model and word2vec — confirm
n_features = len(char2int)
batch_size = 128
embed_size = 128

In [9]:
# Batch generator consumed by train.fit_generator when fitting the conv model.
gen = generate_word_images(vocab, char2int, batch_size)

In [10]:
# Build the training model and its encoder/decoder inference halves.
# conv_model does not read its first three arguments; only the last one
# (n_dec_units) sets the width of the encoder output and of the GRU.
train, infenc, infdec = conv_model(13, 13, embed_size, embed_size)
train.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [11]:
# One epoch is defined as len(vocab) // batch_size generator steps.
n_batches = len(vocab) // batch_size
history = train.fit_generator(gen, steps_per_epoch=n_batches, epochs = 2)

Epoch 1/2
Epoch 2/2


In [12]:
# Encode every vocabulary word with the inference encoder, `buffer_size` words
# at a time, and store the results row-by-row in `embeddings`
# (row index == word index in int2word).
embeddings = np.empty((len(vocab), embed_size))
i = 0  # number of rows of `embeddings` filled so far
buffer = []
buffer_size = 10000
last_index = len(vocab) - 1
for i_word in range(len(vocab)):
    word = int2word[i_word]
    buffer.append(word2vec(char2int, word, 13))
    # Flush when the buffer is full, and once more for the final partial
    # buffer. Testing "fewer than buffer_size words remain" instead would
    # flush after every single word of the tail, i.e. one predict() call per
    # word for up to buffer_size words.
    if len(buffer) == buffer_size or i_word == last_index:
        buffer_np = np.stack(buffer).reshape((-1, 13, 309, 1))
        result = infenc.predict(buffer_np)
        embeddings[i:i+len(buffer)] = result
        i += len(buffer)
        buffer = []
        if i % (4 *buffer_size) == 0:
            print("Predicting: {0:.2f}%".format((i * 100.0 / len(vocab))))
print("finished")

Predicting: 13.43%
Predicting: 26.86%
Predicting: 40.29%
Predicting: 53.72%
Predicting: 67.15%
Predicting: 80.58%
Predicting: 94.01%
finished


In [20]:
# Row-wise L2 normalisation of the encoder outputs computed for the vocabulary.
embedding_normal = normalize(embeddings)

In [21]:
def get_batch_words(words, start, length):
    """Return ``length`` items of ``words`` starting at ``start``, wrapping once.

    Args:
        words: Sequence supporting slicing and ``+`` concatenation.
        start: Index of the first item to return.
        length: Number of items requested.

    Returns:
        ``(batch, end)`` where ``end`` is the index just past the last item
        taken. When the window runs past the end of ``words`` the batch
        continues from the beginning and ``end`` is the wrapped-around index.
    """
    stop = start + length
    overflow = stop - len(words)
    if overflow <= 0:
        # The whole window fits before the end of the sequence.
        return words[start:stop], stop
    # Take the tail, then continue from the head for the missing items.
    return words[start:] + words[:overflow], overflow

In [60]:
def generate(data, embeds, word2int, int2word, unigrams, batch_size, skip_window):
    """Endlessly yield (context, target) embedding pairs with 0/1 labels.

    A window of ``skip_window`` words on each side slides over ``data``. Every
    target word contributes ``2 * skip_window`` consecutive rows: the
    embedding of one context word in ``batch_inputs`` and the embedding of the
    target in ``batch_labels``. Afterwards every even row gets its target
    replaced by a uniformly random row of ``embeds`` and its label set to 0;
    odd rows keep the true target and label 1.

    Args:
        data: Sequence of words (the corpus, in order).
        embeds: 2-D array of embeddings, indexed by ``word2int`` values.
        word2int: Mapping from word to row index in ``embeds``.
        int2word: Not read by this function.
        unigrams: Not read by this function.
        batch_size: Rows per batch; must be a multiple of ``2 * skip_window``.
        skip_window: Number of context words taken on each side of the target.

    Yields:
        ``([batch_inputs, batch_labels], batch_y)`` with shapes
        ``(batch_size, embed_size)``, ``(batch_size, embed_size)`` and
        ``(batch_size, 1)``, all ``float32``.

    Raises:
        AssertionError: if ``batch_size`` is not a multiple of ``2 * skip_window``.
        ValueError: if ``data`` is too short to fill a single batch.
    """
    embed_size = embeds.shape[1]
    rows_per_target = 2 * skip_window
    # Each target fills exactly `rows_per_target` rows, so the batch has to be
    # a multiple of that (a multiple of `skip_window` alone overruns the batch).
    assert batch_size % rows_per_target == 0
    targets_per_batch = batch_size // rows_per_target
    if len(data) < rows_per_target + targets_per_batch:
        raise ValueError(
            "data has {0} words; at least {1} are needed for one batch".format(
                len(data), rows_per_target + targets_per_batch))

    ci = skip_window  # index of the current target word in `data`
    while True:
        batch_inputs = np.empty(shape=(batch_size, embed_size), dtype=np.float32)
        batch_labels = np.empty(shape=(batch_size, embed_size), dtype=np.float32)
        # Fresh label array per batch so consumers never share a buffer.
        batch_y = np.ones(shape=(batch_size, 1), dtype=np.float32)

        for batch_index in range(0, batch_size, rows_per_target):
            context = list(data[ci - skip_window:ci + skip_window + 1])
            # Remove the target from the context words.
            target = context.pop(skip_window)
            target_vec = embeds[word2int[target]]
            for offset, context_word in enumerate(context):
                row = batch_index + offset
                batch_inputs[row] = embeds[word2int[context_word]]
                batch_labels[row] = target_vec
            ci += 1

        # Restart from the beginning when the remaining corpus is short.
        if len(data) - ci - skip_window < batch_size:
            ci = skip_window

        # Negative samples: every even row gets a random target and label 0.
        for ri in range(0, batch_size, 2):
            batch_labels[ri] = embeds[np.random.randint(len(embeds))]
            batch_y[ri][0] = 0

        yield [batch_inputs, batch_labels], batch_y


In [23]:
def generate2(data, embeds, word2int, batch_size, skip_window):
    """Endlessly yield (concatenated context, target) embedding batches.

    For every target word the embeddings of its ``2 * skip_window``
    neighbours are concatenated into one input row, and the target's own
    embedding is the label row.

    Args:
        data: Sequence of words (the corpus, in order).
        embeds: 2-D array of embeddings, indexed by ``word2int`` values.
        word2int: Mapping from word to row index in ``embeds``.
        batch_size: Number of target words per batch.
        skip_window: Number of context words taken on each side of the target.

    Yields:
        ``(batch_inputs, batch_labels)`` with shapes
        ``(batch_size, 2 * skip_window * embed_size)`` and
        ``(batch_size, embed_size)``, both ``float32``.

    Raises:
        AssertionError: if ``batch_size`` is not a multiple of ``skip_window``.
        ValueError: if ``data`` is too short to fill a single batch.
    """
    embed_size = embeds.shape[1]
    # Retained from the original contract of this function.
    assert batch_size % skip_window == 0
    if len(data) < 2 * skip_window + batch_size:
        raise ValueError(
            "data has {0} words; at least {1} are needed for one batch".format(
                len(data), 2 * skip_window + batch_size))

    ci = skip_window  # index of the current target word in `data`
    input_width = embed_size * 2 * skip_window
    while True:
        batch_inputs = np.empty(shape=(batch_size, input_width), dtype=np.float32)
        batch_labels = np.empty(shape=(batch_size, embed_size), dtype=np.float32)
        for batch_index in range(batch_size):
            context = list(data[ci - skip_window:ci + skip_window + 1])
            target = context.pop(skip_window)
            context_vec = np.hstack([embeds[word2int[word]] for word in context])
            # Store the row; without these two writes the generator would
            # yield whatever happened to be in the freshly allocated buffers.
            batch_inputs[batch_index] = context_vec
            batch_labels[batch_index] = embeds[word2int[target]]
            ci += 1
        # Restart from the beginning when fewer than a batch of targets remain.
        if len(data) - ci - skip_window < batch_size:
            ci = skip_window
        yield batch_inputs, batch_labels


In [61]:
# Context size (words on each side of the target) and rows per training batch
# for the second-stage model.
window = 3
semantic_batch_size = 120
# gg =  generate2(words, embeds_norm, word2int, batch_size=semantic_batch_size, skip_window=window) 

In [62]:
# Pass `window` rather than a second literal 3, so the generator's context size
# cannot drift from the value configured for the rest of the notebook.
g = generate(words, embedding_normal, word2int, int2word, ns_unigrams,
             batch_size=semantic_batch_size, skip_window=window)

In [63]:
# Width of each input vector of the pair model. The trailing comment is the
# earlier definition, the width of a concatenated context window.
input_size = 128#window * 2* embed_size
# em_train, em_out = embedding_model(input_size, 128, embed_size)
# adam = keras.optimizers.Nadam(lr=0.002)
# em_train.compile(optimizer=adam, loss='mean_squared_error')
# em_train.summary()

In [72]:
# Build the pair classifier and the two sub-models exposing its shared hidden layer.
# embedding_model2 does not read its second and third arguments.
train_model, con_model, tar_model = embedding_model2(input_size, 128, embed_size)
# NOTE(review): despite the variable name, this is an SGD optimizer; 0.001 is
# its first positional argument (presumably the learning rate — confirm).
adam = keras.optimizers.SGD(0.001)
# NOTE(review): the model's output unit is tanh while the loss is binary
# cross-entropy against 0/1 labels — confirm this pairing is intended.
train_model.compile(optimizer=adam, loss="binary_crossentropy", metrics=['mse', 'acc'])
train_model.summary()

(?, 400)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context_word (InputLayer)       (None, 128)          0                                            
__________________________________________________________________________________________________
target_word (InputLayer)        (None, 128)          0                                            
__________________________________________________________________________________________________
dense_34 (Dense)                (None, 200)          25800       context_word[0][0]               
                                                                 target_word[0][0]                
__________________________________________________________________________________________________
concatenate_12 (Concatenate)    (None, 400)          0           dense_34[0][0]                   
 

In [73]:
# One epoch is defined as len(words) // semantic_batch_size generator steps.
# NOTE(review): `generate` advances by one target word per 2 * skip_window rows,
# so this many steps presumably covers only part of the corpus — confirm.
n_batches = len(words) // semantic_batch_size
history = train_model.fit_generator(g, steps_per_epoch=n_batches, epochs = 2)

Epoch 1/2
 2512/34078 [=>............................] - ETA: 2:07 - loss: 0.6922 - mean_squared_error: 0.2495 - acc: 0.5667

KeyboardInterrupt: 

In [68]:
# Collect the normalised first-stage embedding of every vocabulary word, in
# int2word order, and push them through the shared hidden layer.
context_vecs = []
for i_word in range(len(vocab)):
    word = int2word[i_word]
    context_vecs.append(embedding_normal[word2int[word]])
context_vecs = np.stack(context_vecs)
# con_model and tar_model both end at the same Dense layer object in
# embedding_model2, so the two predictions below come from the same weights.
context_embed = con_model.predict(context_vecs)
target_embed = tar_model.predict(context_vecs)
em = context_embed + target_embed

In [69]:
# Only context_embed is normalised and evaluated here; the summed `em` is not used.
em_normal = normalize(context_embed)
evaluate(em_normal, word2int, embed_size=em_normal.shape[1])

  if np.issubdtype(vec.dtype, np.int):


anomaly: 66.06% semantic: 0.00% syntactic: 9.09% 


In [70]:
# Inspect the words ranked most similar to a query word under em_normal.
utils = Utils(word2int, em_normal)
utils.sorted_sim("ኢትዮጵያ")

[('ኢትዮጵያ', 1.0),
 ('ኢትዮጵያያ', 0.9999993),
 ('ኢትዮጵያዬ', 0.99999917),
 ('አትዮጵያ', 0.9999989),
 ('ኢትዮጵያስ', 0.99999666),
 ('ለኢትዮጵያ', 0.9999964),
 ('የኢትዮጵያ', 0.99999607),
 ('ዘኢትዮጵያ', 0.9999957),
 ('ኢትዮጵያም', 0.99999565),
 ('የአትዮጵያ', 0.9999956)]

In [71]:
# Score one word pair with the trained pair model: the first array is fed to
# the context_word input, the second to the target_word input.
con_input = [
    embedding_normal[word2int['ዶላር']].reshape((1, 128)),
    embedding_normal[word2int['ብር']].reshape((1, 128)),
]
train_model.predict(con_input)

array([[0.7536136]], dtype=float32)

In [None]:
# Build one concatenated context vector for the first occurrence of every
# target word in the corpus.
# Uses `embedding_normal` (the normalised embeddings defined in this notebook)
# instead of `embeds_norm`, which no cell defines.
# Each row concatenates 2 * window embeddings, so that is the row width.
context_width = 2 * window * embedding_normal.shape[1]
vecs = []
discovered = {}
for i in range(window, len(words) - window):
    context = words[i - window: i + window + 1]
    target = context.pop(window)
    if target not in discovered:
        discovered[target] = len(discovered)
        c_vec = []
        for cword in context:
            vec = embedding_normal[word2int[cword]]
            c_vec.append(vec)
        context_vec = np.hstack(c_vec)
        vecs.append(context_vec)
    # Stop early once every vocabulary word has been seen as a target.
    if len(discovered) == len(vocab):
        print("discovered")
        break

semantic = np.stack(vecs).reshape(-1, context_width)
print(len(vecs), embedding_normal.shape)
# Holds only if every vocabulary word occurs as a target inside the scanned range.
assert semantic.shape[0] == embedding_normal.shape[0]
