In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten
from keras.layers import GRU, Conv2D, MaxPooling2D
from keras.layers import Input, Reshape, Dot, Add
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
# from keras.utils.vis_utils import plot_model
import keras
import keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *
import gensim
import random
import numpy as np
import tensorflow as tf
# Seed the Python, NumPy and TensorFlow random number generators from a single value.
seed_val = 1000
random.seed(seed_val)
np.random.seed(seed_val)
tf.set_random_seed(seed_val)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
def seq2sem(embed_size):
    """Build a dense network mapping one ``embed_size`` vector to another.

    The input (named "syn") goes through three 256-unit hidden layers with
    tanh, relu and tanh activations, followed by a linear projection back to
    ``embed_size`` units.

    Args:
        embed_size: width of both the input vector and the output vector.

    Returns:
        An uncompiled Keras ``Model``.
    """
    syntax_input = Input(shape=(embed_size, ), name="syn")
    hidden = syntax_input
    # Hidden stack: same widths and activation order as a hand-unrolled chain.
    for activation in ('tanh', 'relu', 'tanh'):
        hidden = Dense(256, activation=activation)(hidden)
    projected = Dense(embed_size, activation='linear')(hidden)
    return Model(syntax_input, projected)

In [None]:
def autoencoder(embed_size, input_size):
    """Build a symmetric dense autoencoder and the matching encoder-only model.

    Layout: input_size -> 400 -> 256 -> embed_size -> 256 -> 400 -> input_size,
    all layers using tanh.

    Args:
        embed_size: width of the bottleneck (embedding) layer.
        input_size: width of the flat input vector and of the reconstruction.

    Returns:
        (model, em_model): the full autoencoder, and a model sharing the same
        encoder layers that outputs the bottleneck activations.
    """
    word_input = Input(shape=(input_size,), name="word_input")
    x = Dense(400, activation='tanh')(word_input)
    x = Dense(256, activation='tanh')(x)
    # The bottleneck width now follows `embed_size`; it used to be fixed at 128,
    # which silently ignored the argument.
    embed = Dense(embed_size, activation="tanh")(x)
    x = Dense(256, activation='tanh')(embed)
    x = Dense(400, activation='tanh')(x)
    x = Dense(input_size, activation='tanh')(x)
    model = Model(word_input, x)
    em_model = Model(word_input, embed)
    return model, em_model

In [None]:
def conv_model(n_input, n_output, n_enc_units, n_dec_units):
    """Build a CNN-encoder / GRU-decoder model and its two inference models.

    The encoder consumes a fixed (13, 309, 1) input; its flattened features are
    projected to ``n_dec_units`` and used as the decoder GRU's initial state.
    The decoder consumes sequences of 309-wide vectors and emits a 309-way
    softmax per step.

    Args:
        n_input: unused; kept for interface compatibility.
        n_output: unused; kept for interface compatibility.
        n_enc_units: unused; kept for interface compatibility.
        n_dec_units: width of the decoder GRU and of the encoder state vector.

    Returns:
        (model, encoder_model, decoder_model):
        * model: [root_word_input, target_word_input] -> per-step softmax.
        * encoder_model: root_word_input -> initial decoder state.
        * decoder_model: [target_word_input, state] -> [softmax, new state],
          sharing the GRU and output layer with ``model``.
    """
    root_word_input = Input(shape=(13, 309, 1), name="root_word_input")

    x = Conv2D(16, (3, 3), padding='same', activation='relu')(root_word_input)
    x = MaxPooling2D(1, 1)(x)
    # Fixed: the result used to be bound to `X` (capital), so dropout was never
    # part of the graph.
    x = Dropout(.2)(x)
    x = Conv2D(8, (3, 3), padding='same', activation='relu')(x)
    x = MaxPooling2D(2, 2)(x)
    x = Flatten()(x)
#     x = Dense(300, activation='relu')(x)
    x = Dropout(.2)(x)
    state_h = Dense(n_dec_units, activation='linear')(x)

    decoder_inputs = Input(shape=(None, 309), name="target_word_input")
    decoder_gru = GRU(n_dec_units, return_sequences=True, return_state=True, name="decoder_gru")
    decoder_outputs, _ = decoder_gru(decoder_inputs, initial_state=state_h)

    decoder_dense = Dense(309, activation='softmax', name="train_output")
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([root_word_input, decoder_inputs], decoder_outputs)
    encoder_model = Model(root_word_input, state_h)

    # Inference decoder: same layers, but the initial state is an explicit input
    # so decoding can be driven step by step.
    decoder_state_input_h = Input(shape=(n_dec_units,))
    decoder_outputs, state_h = decoder_gru(decoder_inputs, initial_state=decoder_state_input_h)

    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs, decoder_state_input_h], [decoder_outputs, state_h])

    return model, encoder_model, decoder_model


In [21]:
def conv_model_multi(n_chars, n_consonant, n_vowels, n_units):
    """Build a CNN encoder with two GRU decoders (consonant and vowel streams).

    The encoder reads an input of shape (n_chars, n_consonant + n_vowels, 1) and
    projects it to an ``n_units`` vector that initialises both decoder GRUs.

    Args:
        n_chars: first spatial dimension of the encoder input.
        n_consonant: width of the consonant decoder input and softmax output.
        n_vowels: width of the vowel decoder input and softmax output.
        n_units: width of both GRUs and of the encoder state vector.

    Returns:
        (main_model, encoder_model, decoder_model):
        * main_model: [root_word_input, consonant inputs, vowel inputs]
          -> [consonant softmax, vowel softmax].
        * encoder_model: root_word_input -> state vector.
        * decoder_model: [consonant inputs, vowel inputs, state]
          -> [consonant softmax, vowel softmax, state]; reuses the layers
          of ``main_model``.
    """
    root_word_input = Input(shape=(n_chars, (n_consonant + n_vowels), 1), name="root_word_input")
    
    x = Conv2D(16, (3, 3), padding='same', activation='relu')(root_word_input)
    x = MaxPooling2D(2, 2)(x)
#     x = Conv2D(8, (3, 3), padding='same', activation='relu')(x)
#     x = MaxPooling2D(2, 2)(x)
#     x = Conv2D(8, (5, 5), padding='same', activation='relu')(x)
#     x = MaxPooling2D(2, 2)(x)
    x = Flatten()(x)
#     x = Dense(300, activation='relu')(x)
#     X = Dropout(.2)(x)
    # Encoder output; used as the initial state of both decoder GRUs below.
    state_h = Dense(n_units, activation='linear')(x)
    
    consonant_decoder_inputs = Input(shape=(None, n_consonant), name="target_consonant")
    consonant_decoder_gru = GRU(n_units, return_sequences=True, return_state=True,  name="consonant_decoder_gru")
    consonant_decoder_outputs, _= consonant_decoder_gru(consonant_decoder_inputs, initial_state=state_h)
    
    vowel_decoder_inputs = Input(shape=(None, n_vowels), name="vowel_input")
    vowel_decoder_gru = GRU(n_units, return_sequences=True, return_state=True, name="vowl_decoder_gru")
    vowel_decoder_outputs, _= vowel_decoder_gru(vowel_decoder_inputs, initial_state=state_h)
#     print(vowel_decoder_outputs.shape, consonant_decoder_outputs.shape)
#     x = Concatenate(axis=1)([vowel_decoder_outputs, consonant_decoder_outputs])
#     print(x.shape)
    consonant_decoder_dense = Dense(n_consonant, activation='softmax', name="consonant_output")
    consonant_decoder_outputs = consonant_decoder_dense(consonant_decoder_outputs)
    
    vowel_decoder_dense = Dense(n_vowels, activation='softmax', name="vowel_output")
    vowel_decoder_outputs = vowel_decoder_dense(vowel_decoder_outputs)
    
    main_model = Model([root_word_input, consonant_decoder_inputs, vowel_decoder_inputs], [consonant_decoder_outputs, vowel_decoder_outputs])
    
    encoder_model = Model(root_word_input, state_h)
    
    # Inference decoder: the same GRU/Dense layers re-applied with the state
    # supplied as an explicit input.
    decoder_state_input_h = Input(shape=(n_units,))
    
    consonant_decoder_outputs, state_h= consonant_decoder_gru(consonant_decoder_inputs, initial_state=decoder_state_input_h)
    consonant_decoder_outputs = consonant_decoder_dense(consonant_decoder_outputs)
    
    # NOTE(review): this rebinds `state_h`, so the consonant GRU's new state from the
    # call above is discarded and decoder_model only returns the vowel GRU's state.
    vowel_decoder_outputs, state_h= vowel_decoder_gru(vowel_decoder_inputs, initial_state=decoder_state_input_h)
    vowel_decoder_outputs = vowel_decoder_dense(vowel_decoder_outputs)
    
    decoder_model = Model([consonant_decoder_inputs, vowel_decoder_inputs, decoder_state_input_h], [consonant_decoder_outputs, vowel_decoder_outputs, state_h])

    return main_model, encoder_model, decoder_model


In [None]:
def dense_nce(input_size, embed_size):
    """Unfinished sketch: project a context and a target vector to ``embed_size``.

    The function builds two 128-wide inputs and one Dense projection for each,
    but does not assemble or return a model yet (it returns ``None``).

    Args:
        input_size: currently unused; the input width is fixed at 128.
        embed_size: number of units in each projection layer.
    """
    # Fixed: the keyword was misspelled `shae`, which made Input() raise TypeError.
    context_word = Input(shape=(128,), name="context_input")
    target_word = Input(shape=(128,), name="target_input")
    context_vec = Dense(embed_size)(context_word)
    target_vec = Dense(embed_size)(target_word)
    

In [None]:
def conv_model_multi_v2(n_chars, n_char_class, n_consonant, n_vowels, n_units):
    """Build a CNN encoder with two GRU decoders feeding one shared softmax.

    The consonant and vowel GRU output sequences are concatenated along axis 1
    and passed through a single ``n_char_class``-way softmax layer.

    Args:
        n_chars: first spatial dimension of the encoder input.
        n_char_class: number of classes of the shared output softmax.
        n_consonant: width of the consonant decoder input.
        n_vowels: width of the vowel decoder input.
        n_units: width of both GRUs and of the encoder state vector.

    Returns:
        (main_model, encoder_model, decoder_model):
        * main_model: [root_word_input, consonant inputs, vowel inputs] -> softmax.
        * encoder_model: root_word_input -> state vector.
        * decoder_model: [consonant inputs, vowel inputs, state] -> [softmax, state].
          The returned state is the vowel GRU's; the consonant GRU's is discarded.
    """
    root_word_input = Input(shape=(n_chars, (n_consonant + n_vowels), 1), name="root_word_input")

    x = Conv2D(16, (3, 3), padding='same', activation='relu')(root_word_input)
    x = MaxPooling2D(1, 1)(x)
    # Fixed: the result used to be bound to `X` (capital), so dropout was never
    # part of the graph.
    x = Dropout(.2)(x)
    x = Conv2D(8, (3, 3), padding='same', activation='relu')(x)
    x = MaxPooling2D(1, 1)(x)
    x = Flatten()(x)
#     x = Dense(300, activation='relu')(x)
#     X = Dropout(.2)(x)
    state_h = Dense(n_units, activation='linear')(x)

    consonant_decoder_inputs = Input(shape=(None, n_consonant), name="target_consonant")
    consonant_decoder_gru = GRU(n_units, return_sequences=True, return_state=True, name="consonant_decoder_gru")
    consonant_decoder_outputs, _ = consonant_decoder_gru(consonant_decoder_inputs, initial_state=state_h)

    vowel_decoder_inputs = Input(shape=(None, n_vowels), name="vowel_input")
    vowel_decoder_gru = GRU(n_units, return_sequences=True, return_state=True, name="vowl_decoder_gru")
    vowel_decoder_outputs, _ = vowel_decoder_gru(vowel_decoder_inputs, initial_state=state_h)

    # Both GRUs emit (batch, steps, n_units); axis=1 stacks the two sequences end to end.
    decoders_outputs = Concatenate(axis=1)([consonant_decoder_outputs, vowel_decoder_outputs])

    decoder_dense = Dense(n_char_class, activation='softmax', name="decoder_output")
    decoders_outputs = decoder_dense(decoders_outputs)

    main_model = Model([root_word_input, consonant_decoder_inputs, vowel_decoder_inputs], decoders_outputs)
    encoder_model = Model(root_word_input, state_h)

    # Inference decoder: same layers, state supplied as an explicit input.
    decoder_state_input_h = Input(shape=(n_units,))

    consonant_decoder_outputs, state_h = consonant_decoder_gru(consonant_decoder_inputs, initial_state=decoder_state_input_h)
    # NOTE(review): `state_h` is rebound here, so only the vowel GRU state is returned.
    vowel_decoder_outputs, state_h = vowel_decoder_gru(vowel_decoder_inputs, initial_state=decoder_state_input_h)

    decoders_outputs = Concatenate(axis=1)([consonant_decoder_outputs, vowel_decoder_outputs])
    decoders_outputs = decoder_dense(decoders_outputs)

    decoder_model = Model([consonant_decoder_inputs, vowel_decoder_inputs, decoder_state_input_h], [decoders_outputs, state_h])

    return main_model, encoder_model, decoder_model


In [None]:
def conv_model2(n_input, n_output, n_enc_units, n_dec_units):
    """Build a CNN-encoder / GRU-decoder model that also takes a 128-d word feature.

    The CNN features of the fixed (13, 309, 1) input are reduced to 128 units,
    added element-wise to the ``word_feature`` input, and projected to
    ``n_dec_units`` to initialise the decoder GRU.

    Args:
        n_input: unused; kept for interface compatibility.
        n_output: unused; kept for interface compatibility.
        n_enc_units: unused; kept for interface compatibility.
        n_dec_units: width of the decoder GRU and of the encoder state vector.

    Returns:
        (model, encoder_model, decoder_model):
        * model: [root_word_input, target_word_input, word_feature] -> per-step softmax.
        * encoder_model: [root_word_input, word_feature] -> initial decoder state.
        * decoder_model: [target_word_input, state] -> [softmax, new state].
    """
    root_word_input = Input(shape=(13, 309, 1), name="root_word_input")
    word_feature = Input(shape=(128,), name="word_feature")

    x = Conv2D(32, (3, 3), padding='same', activation='relu')(root_word_input)
    x = MaxPooling2D(2, 2)(x)
    # Fixed: the result used to be bound to `X` (capital), so dropout was never
    # part of the graph.
    x = Dropout(.2)(x)
    x = Conv2D(16, (3, 3), padding='same', activation='relu')(x)
    x = MaxPooling2D(2, 2)(x)
    x = Conv2D(8, (3, 3), padding='same', activation='relu')(x)
    x = MaxPooling2D(2, 2)(x)
    x = Flatten()(x)
    x = Dense(128, activation='linear')(x)
#     X = Dropout(.2)(x)
    # Element-wise sum, so the CNN projection must match word_feature's width (128).
    x = Add()([x, word_feature])
    state_h = Dense(n_dec_units, activation='linear')(x)

    decoder_inputs = Input(shape=(None, 309), name="target_word_input")
    decoder_gru = GRU(n_dec_units, return_sequences=True, return_state=True, name="decoder_gru")
    decoder_outputs, _ = decoder_gru(decoder_inputs, initial_state=state_h)

    decoder_dense = Dense(309, activation='softmax', name="train_output")
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([root_word_input, decoder_inputs, word_feature], decoder_outputs)
    encoder_model = Model([root_word_input, word_feature], state_h)

    # Inference decoder: same layers, state supplied as an explicit input.
    decoder_state_input_h = Input(shape=(n_dec_units,))
    decoder_outputs, state_h = decoder_gru(decoder_inputs, initial_state=decoder_state_input_h)

    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs, decoder_state_input_h], [decoder_outputs, state_h])

    return model, encoder_model, decoder_model

In [None]:
def decode_sequence(model, int2char, state):
    """Run 13 decoding steps and return the characters picked at each step.

    Decoding starts from a one-hot vector for '&' (looked up in the module-level
    ``char2int``). At every step the raw output of ``model.predict`` is fed back
    as the next input; it is not re-encoded as a one-hot vector.

    Args:
        model: object whose ``predict([seq, state])`` returns ``(seq, state)``.
        int2char: mapping from class index to character.
        state: initial decoder state.

    Returns:
        List of 13 decoded characters.
    """
    step_input = np.zeros([1, 1, 309])
    step_input[0, 0, char2int['&']] = 1
    chars = []
    for _ in range(13):
        step_input, state = model.predict([step_input, state])
        best = np.argmax(step_input.flatten())
        chars.append(int2char[best])
    return chars


In [None]:
def one_hot_sep(con, con_max, vow, vow_max):
    """Return separate one-hot vectors for a consonant index and a vowel index.

    Args:
        con: position to set in the consonant vector.
        con_max: length of the consonant vector.
        vow: position to set in the vowel vector.
        vow_max: length of the vowel vector.

    Returns:
        (con_vec, vow_vec): two 1-D float arrays with a single 1 each.
    """
    def _unit(index, size):
        vec = np.zeros((size, ))
        vec[index] = 1
        return vec

    return _unit(con, con_max), _unit(vow, vow_max)
    
def decode_multi_sequence(model, char2tup, tup2char, state, n_consonant, n_vowels, max_len=13):
    """Greedily decode a word from the two-stream (consonant/vowel) decoder.

    Decoding starts from the one-hot encoding of '&'. After every step the
    arg-max consonant and vowel are re-encoded as one-hot vectors and fed back.

    Args:
        model: object whose ``predict([con, vow, state])`` returns
            ``(con_probs, vow_probs, state)`` with (1, 1, n) shaped outputs.
        char2tup: mapping from character to ``(consonant_index, vowel_index)``.
        tup2char: mapping from ``"<consonant>-<vowel>"`` strings to characters.
        state: initial decoder state.
        n_consonant: length of the consonant one-hot vector.
        n_vowels: length of the vowel one-hot vector.
        max_len: number of decoding steps (defaults to the previous fixed 13).

    Returns:
        List of ``max_len`` characters; index pairs missing from ``tup2char``
        are emitted as a single space.
    """
    con, vow = char2tup['&']
    con_vec, vow_vec = one_hot_sep(con, n_consonant, vow, n_vowels)
    con_vec = con_vec.reshape((1, 1, -1))
    vow_vec = vow_vec.reshape((1, 1, -1))
    decoded_chars = []
    for _ in range(max_len):
        con_probs, vow_probs, state = model.predict([con_vec, vow_vec, state])
        con_index = np.argmax(con_probs[0, 0, :])
        vow_index = np.argmax(vow_probs[0, 0, :])

        # Feed the hard decisions (not the probabilities) back into the decoder.
        con_vec = np.zeros_like(con_probs)
        con_vec[0, 0, con_index] = 1
        vow_vec = np.zeros_like(vow_probs)
        vow_vec[0, 0, vow_index] = 1

        name = "{0}-{1}".format(con_index, vow_index)
        try:
            char = tup2char[name]
        except KeyError:
            # Only an unknown pair is expected here; a bare `except` would also
            # hide real errors and KeyboardInterrupt.
            char = ' '
        decoded_chars.append(char)

    return decoded_chars

In [None]:
def embedding_model(input_size, output_size, embed_size):
    """Build a small dense predictor together with its embedding sub-model.

    Layout: input_size -> 256 (relu) -> embed_size (tanh) -> output_size (relu).

    Args:
        input_size: width of the "context_word" input.
        output_size: width of the final prediction layer.
        embed_size: width of the intermediate embedding layer.

    Returns:
        (model, em_model): the full predictor, and a model that stops at the
        embedding layer.
    """
    context_word = Input(shape=(input_size,), name="context_word")
    hidden = Dense(256, activation='relu')(context_word)
    embedding = Dense(embed_size, activation='tanh')(hidden)
    prediction = Dense(output_size, activation='relu')(embedding)
    return Model(context_word, prediction), Model(context_word, embedding)

In [None]:
def cosine_loss(yTrue, yPred):
    """Return the sum of squared differences between ``yTrue`` and ``yPred``.

    Despite the name, no cosine similarity is computed here.
    """
    return K.sum(K.square(yTrue - yPred))

In [None]:
def embedding_model2(input_size, output_size, embed_size):
    """Build a siamese model scoring two inputs by cosine similarity.

    Both inputs go through the same 200-unit tanh layer; the model output is the
    normalised dot product of the two projections.

    Args:
        input_size: width of both the context and the target input.
        output_size: unused.
        embed_size: unused (the projection width is fixed at 200).

    Returns:
        (model, con_model, tar_model): the similarity model and the two
        single-input projection models, all sharing the same weights.
    """
    context_word = Input(shape=(input_size,), name="context_word")
    target_word = Input(shape=(input_size,), name="target_word")

    shared_dense = Dense(200, activation='tanh')
    # Instantiated but never connected to the graph; kept so layer creation
    # (and therefore automatic layer naming) is unchanged.
    unused_dense = Dense(200, activation='tanh')

    context_vec = shared_dense(context_word)
    target_vec = shared_dense(target_word)
    similarity = Dot(normalize=True, axes=1)([context_vec, target_vec])

    model = Model([context_word, target_word], similarity)
    con_model = Model(context_word, context_vec)
    tar_model = Model(target_word, target_vec)

    return model, con_model, tar_model

In [None]:
def embeder3(input_size, embed_size):
    """Build a two-layer tanh network and a model exposing its first layer.

    Args:
        input_size: width of the "context_word" input.
        embed_size: width of both dense layers.

    Returns:
        (model, em_model): the two-layer model, and a model that outputs the
        first layer's activations.
    """
    context_word = Input(shape=(input_size,), name="context_word")

    first = Dense(embed_size, activation='tanh')(context_word)
    second = Dense(embed_size, activation='tanh')(first)

    return Model(context_word, second), Model(context_word, first)

In [None]:
def embedding_model3(input_size, output_size, embed_size):
    """Build a model predicting an embedding from a left and a right neighbour.

    Both neighbours pass through the same two dense layers (128 relu, then 128
    linear); the two results are concatenated and mapped to ``embed_size``
    units with tanh.

    Args:
        input_size: width of the "left_word" and "right_word" inputs.
        output_size: unused.
        embed_size: width of the final output layer.

    Returns:
        (model, con_model, tar_model): the two-input model, plus single-input
        models exposing the shared projection of the left and right word.
    """
    left_word = Input(shape=(input_size,), name="left_word")
    right_word = Input(shape=(input_size,), name="right_word")

    shared_relu = Dense(128, activation='relu')
    shared_linear = Dense(128, activation='linear')

    left_hidden = shared_relu(left_word)
    right_hidden = shared_relu(right_word)

    left_proj = shared_linear(left_hidden)
    right_proj = shared_linear(right_hidden)

    merged = Concatenate(axis=1)([left_proj, right_proj])
    output = Dense(embed_size, activation='tanh')(merged)

    model = Model([left_word, right_word], output)
    con_model = Model(left_word, left_proj)
    tar_model = Model(right_word, right_proj)

    return model, con_model, tar_model

In [3]:
def evaluate(final_embedding, word2int, embed_size):
    """Score an embedding matrix with ``GensimWrapper`` and print the results.

    Every entry of the result mapping is printed on one line as
    ``"<key>: <value>% "`` with the value shown to two decimals.

    Args:
        final_embedding: embedding matrix passed to the wrapper.
        word2int: word-to-row mapping passed to the wrapper.
        embed_size: embedding width passed to the wrapper constructor.
    """
    wrapper = GensimWrapper(embed_size, 0, log=False)
    wrapper.set_embeddings(word2int, final_embedding)
    result = wrapper.evaluate()
    line = ''.join("{0}: {1:.2f}% ".format(key, result[key]) for key in result)
    print(line)

In [None]:
def normalize(embeddings):
    """Scale every row of ``embeddings`` to unit L2 norm.

    Rows with zero norm are divided by zero (no guard is applied).
    """
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

def normalize2(embeddings):
    """Scale every row so that its largest absolute entry becomes 1.

    All-zero rows are divided by zero (no guard is applied).
    """
    row_peaks = np.abs(embeddings).max(axis=1, keepdims=True)
    return embeddings / row_peaks


In [4]:
# Load the corpus and build the lookup tables used by the rest of the notebook.
# NOTE(review): read_file, build_vocab, words_to_ints, get_frequency, build_charset and
# ns_sample are not defined in this notebook; presumably they come from the star imports
# of data_handle/utils — confirm.
words = read_file()
vocab, word2int, int2word = build_vocab(words)
int_words = words_to_ints(word2int, words)
word2freq = get_frequency(words, word2int, int2word)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()
ns_unigrams = ns_sample(word2freq, word2int, int2word, .75)
# 13 character slots per word. NOTE(review): the "+ 2" presumably reserves room for
# start/end markers — confirm.
n_chars = 11 + 2 
n_features = len(char2int)
# Shared hyper-parameters. batch_size and skip_window are reassigned by a later cell.
batch_size = 120
embed_size = 128
skip_window = 5

In [None]:
# Build the dense autoencoder over flattened word encodings and compile it (MSE loss).
gen = generate_word_images_flat(vocab, char2tup, batch_size, n_chars, n_consonant, n_vowel)
# x, y = next(gen)
model, em_model = autoencoder(embed_size, n_chars*(n_consonant + n_vowel))
# NOTE: the variable is called `adam` but holds a Nadam optimizer.
adam = keras.optimizers.Nadam(0.0001)
model.compile(optimizer=adam, loss='mse')
# One epoch covers the vocabulary once, in batches of batch_size.
n_batches = len(vocab) // batch_size
model.summary()

In [None]:
# Train the model currently bound to `model` for 2 epochs from the generator `gen`.
history = model.fit_generator(gen, steps_per_epoch=n_batches, epochs = 2)

In [None]:
# Encode every vocabulary word as a flat vector and run it through the encoder half
# of the autoencoder.
# Row i must hold the encoding of the word whose id is i, because the rows are selected
# below by word2int ids. The loop used to read `words[i]` (the i-th corpus token), which
# does not line up with those ids. The width also uses n_chars instead of a literal 13.
word_in = np.zeros((len(vocab), n_chars * (n_consonant + n_vowel)))
for i in range(len(vocab)):
    con_vec, vow_vec = word2vec_seperated(
                char2tup, int2word[i], n_chars, n_consonant, n_vowel)
    word_in[i] = np.concatenate([con_vec, vow_vec], axis=1).flatten()
indexes = [word2int[word] for word in vocab]
embeddings = em_model.predict(word_in[indexes])


In [None]:
# Evaluate the row-normalised embeddings currently bound to `embeddings`.
evaluate(normalize(embeddings), word2int, embed_size)

In [22]:
# Batch generator for the two-stream (consonant/vowel) model.
# NOTE(review): generate_word_images_multi is not defined in this notebook; presumably it
# comes from the data_handle star import — confirm what each batch contains.
multi_gen = generate_word_images_multi(words, char2tup, batch_size, n_consonant, n_vowel)
# multi_gen = generate_word_images_multi_v2(words, char2int, char2tup, batch_size, n_consonant, n_vowel)

# [x1, x2, x3], y = next(multi_gen)
# print(y.shape)

In [23]:
# Reset the Keras session, then build and compile the two-stream model.
keras.backend.clear_session()
multi_train, multi_enc, multi_dec = conv_model_multi(n_chars, n_consonant, n_vowel, embed_size)
# NOTE: the variable is called `adam` but holds a Nadam optimizer.
adam = keras.optimizers.Nadam(0.001)
multi_train.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
multi_train.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
root_word_input (InputLayer)    (None, 13, 50, 1)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 13, 50, 16)   160         root_word_input[0][0]            
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 6, 25, 16)    0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 2400)         0           max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
target_con

In [24]:
# multi_train2, multi_enc2, multi_dec2 = conv_model_multi_v2(n_chars, len(char2int), n_consonant, n_vowel, embed_size)
# adam = keras.optimizers.Nadam(0.001)
# multi_train2.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
# multi_train2.summary()

In [25]:
# Train the two-stream model for 3 epochs.
# NOTE(review): steps per epoch are derived from len(vocab) while multi_gen was created
# from `words` — confirm this is the intended epoch length.
n_batches = len(vocab) // batch_size
history = multi_train.fit_generator(multi_gen, steps_per_epoch=n_batches, epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# 70/30 split of the vocabulary, keeping its existing order.
split_at = int(len(vocab) * .7)
train_vocab, test_vocab = vocab[:split_at], vocab[split_at:]
# gen = generate_word_images_feat(vocab, word2int, char2int, sem_embed, batch_size)
# gen2 = generate_batch_image_v3(words, word2int, char2int, batch_size, skip_window)

In [None]:
# Load two previously saved embedding matrices from results/.
# These are the files written elsewhere in this notebook by
# np.save("results/word2vec_embedding", ...) and np.save("results/char_embedding", ...).
sem_emb = np.load('results/word2vec_embedding.npy')
seq_emb = np.load('results/char_embedding.npy')
# def gen_mapping(data, seq, sem, batch_size):
#     ci = 0
#     while True:
#         batch_indexes, ci = get_context_words(data, ci, batch_size)
#         ci += batch_size
#         batch_inputs = seq[batch_indexes]
#         batch_labels = sem[batch_indexes]
#         yield batch_inputs, batch_labels

# data = [word2int[word] for word in train_vocab]
# full = [word2int[word] for word in vocab]
# gen = gen_mapping(full, sem_emb, seq_emb, batch_size)

In [None]:
# Build and compile the seq2sem mapping network (MSE loss). This rebinds `model`.
model = seq2sem(embed_size)
# NOTE: the variable is called `adam` but holds a Nadam optimizer.
adam = keras.optimizers.Nadam(0.0001)
model.compile(optimizer=adam, loss='mse')
model.summary()

In [None]:
# Train the seq2sem network for 3 epochs.
# NOTE(review): the only live assignment to `gen` in this notebook is the flat word-image
# generator built for the autoencoder; the gen_mapping generator is commented out. This
# call therefore likely feeds data of the wrong shape — confirm which generator is meant.
n_batches = len(vocab) // batch_size
# NOTE(review): this optimizer is created after model.compile and is not passed to anything.
adam = keras.optimizers.Nadam(0.0001)
history = model.fit_generator(gen, steps_per_epoch=n_batches, epochs=3)

In [None]:
# Row ids of the held-out and training words.
test_indexs = [word2int[word] for word in test_vocab]
train_indexs = [word2int[word] for word in train_vocab]
# pred = np.random.randn(sem_emb.shape[0], sem_emb.shape[1])
# pred[train_indexs] = sem_emb[train_indexs]
# pred[test_indexs] = model.predict(seq_emb[test_indexs])
# NOTE(review): the model is fed sem_emb here, whereas the commented-out lines feed it
# seq_emb — confirm which input is intended.
pred = model.predict(sem_emb)
# dot_prods = np.einsum('ij,ij->i', normalize(sem_emb), normalize(pred))
# print(dot_prods[:10])

In [None]:

def parseVec(file, delimiter):
    """Load a text embedding file into a matrix indexed by ``word2int`` ids.

    The first line of the file holds ``"<vocab_size> <embed_size>"``. Each of the
    following lines holds a word followed by its vector components, separated
    by ``delimiter``; a trailing delimiter at the end of the line is tolerated.

    Args:
        file: path of the UTF-8 encoded embedding file.
        delimiter: separator between the fields of a vector line.

    Returns:
        float32 array of shape (vocab_size, embed_size). Row ``word2int[word]``
        holds the vector of ``word``; rows without a matching word are zero.

    Notes:
        Uses the module-level ``word2int``. A word whose id is not smaller than
        the file's ``vocab_size`` raises ``IndexError``.
    """
    with open(file, encoding='utf8') as handle:
        vocab_size, embed_size = [int(s) for s in handle.readline().split()]
        # Zero-initialised so rows for words missing from the file are well defined.
        embeddings = np.zeros((vocab_size, embed_size), dtype=np.float32)
        # zip() stops after vocab_size lines, or earlier if the file is shorter.
        for _, line in zip(range(vocab_size), handle):
            # Strip the newline and any trailing delimiter, then take exactly
            # embed_size values, so lines work with or without a trailing separator.
            fields = line.rstrip().split(delimiter)
            word = fields[0]
            if word in word2int:
                embeddings[word2int[word]] = [float(v) for v in fields[1:embed_size + 1]]
    return embeddings
# Load two externally produced embedding files (space- and tab-delimited respectively).
em = parseVec('results/model.vec', ' ')
em2 = parseVec('results/vec.txt', '\t')

In [None]:
# embeddings = normalize(np.load('results/wi.npy'))
# Nearest-neighbour lookup over the normalised embeddings.
# Fixed: `int2word` used to receive the character table `int2char`; the word table is
# what the other Utils(...) call in this notebook passes.
utils = Utils(embedding=normalize(embeddings), word2int=word2int, int2word=int2word)
utils.sorted_sim("ኢትዮጵያ")

In [27]:
# seq_norm = normalize(seq_emb)
# sem_norm = normalize(sem_emb)
# e = np.concatenate([sem_emb, embeddings], axis=1)
# Evaluate the row-normalised embeddings currently bound to `embeddings`.
evaluate(embed_size=embed_size,final_embedding=normalize(embeddings), word2int=word2int)

  if np.issubdtype(vec.dtype, np.int):


anomaly: 62.39% semantic: 0.54% syntactic: 16.67% 


In [None]:
# Build the feature-augmented CNN/GRU model and compile it for per-step classification.
# The first three arguments of conv_model2 are not used by it.
train, infenc, infdec = conv_model2(13, 13, embed_size, embed_size)
# NOTE: the variable is called `adam` but holds a Nadam optimizer.
adam = keras.optimizers.Nadam(0.001)
# train.compile(optimizer=adam, loss='mse')
train.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
# train.summary()

In [None]:
# Train the feature-augmented model for 2 epochs.
# NOTE(review): `train` takes three inputs, but the only live assignment to `gen` in this
# notebook is the flat autoencoder generator (the generate_word_images_feat call is
# commented out) — confirm which generator is meant.
n_batches = len(vocab) // batch_size
history = train.fit_generator(gen, steps_per_epoch=n_batches, epochs = 2)

In [None]:
# Persist the inference decoder to models/decoder.h5.
infdec.save('models/decoder.h5')

In [None]:
def pred_embeddings(vocab, infenc, buffer_size=10000):
    """Encode every vocabulary word with ``infenc`` in batches.

    Each word is turned into a (13, 309, 1) input via ``word2vec`` and paired
    with its row of the module-level ``sem_embed`` matrix. Row ``k`` of the
    result belongs to ``vocab[k]``.

    Args:
        vocab: sequence of words to encode.
        infenc: object whose ``predict([images, features])`` returns one
            ``embed_size`` vector per input row.
        buffer_size: number of words sent to ``predict`` per call.

    Returns:
        Array of shape (len(vocab), embed_size).

    Notes:
        Reads the module-level ``embed_size``, ``int2word``, ``word2int``,
        ``char2int``, ``word2vec`` and ``sem_embed``.
    """
    total = len(vocab)
    embeddings = np.empty((total, embed_size))
    filled = 0
    images = []
    features = []
    for wi, word in enumerate(vocab):
        word = int2word[word2int[word]]
        images.append(word2vec(char2int, word, 13))
        features.append(sem_embed[word2int[word]])
        # Flush on a full buffer or on the last word. The previous condition
        # (`len(vocab) - wi < buffer_size`) flushed after every single word in
        # the tail, i.e. one predict() call per word.
        if len(images) == buffer_size or wi == total - 1:
            image_batch = np.stack(images).reshape((-1, 13, 309, 1))
            feature_batch = np.stack(features)
            embeddings[filled:filled + len(images)] = infenc.predict([image_batch, feature_batch])
            filled += len(images)
            images = []
            features = []
            if filled % (4 * buffer_size) == 0:
                print("Predicting: {0:.2f}%".format((filled * 100.0 / total)))
    print("finished")
    return embeddings

# Encode the whole vocabulary with the feature-augmented encoder.
# NOTE(review): pred_embeddings reads the global `sem_embed`, which is only assigned in a
# later cell of this notebook.
embeddings = pred_embeddings(vocab, infenc)

In [26]:
def pred_embeddings_multi(vocab, encoder, char2tup, buffer_size=10000):
    """Encode every vocabulary word with the two-stream encoder in batches.

    Each word is converted by ``word2vec_seperated`` into a consonant and a
    vowel matrix, which are concatenated along axis 1 and reshaped to
    (n_chars, n_consonant + n_vowel, 1). Row ``k`` of the result belongs to
    ``vocab[k]``.

    Args:
        vocab: sequence of words to encode.
        encoder: object whose ``predict(batch)`` returns one ``embed_size``
            vector per input row.
        char2tup: character table forwarded to ``word2vec_seperated``.
        buffer_size: number of words sent to ``predict`` per call.

    Returns:
        Array of shape (len(vocab), embed_size).

    Notes:
        Reads the module-level ``embed_size``, ``int2word``, ``word2int``,
        ``n_chars``, ``n_consonant``, ``n_vowel`` and ``word2vec_seperated``.
    """
    total = len(vocab)
    embeddings = np.empty((total, embed_size))
    filled = 0
    buffer = []
    for wi, word in enumerate(vocab):
        word = int2word[word2int[word]]
        convec, vowvec = word2vec_seperated(char2tup, word, n_chars, n_consonant, n_vowel)
        buffer.append(np.concatenate([convec, vowvec], axis=1))
        # Flush on a full buffer or on the last word. The previous condition
        # (`len(vocab) - wi < buffer_size`) flushed after every single word in
        # the tail, i.e. one predict() call per word.
        if len(buffer) == buffer_size or wi == total - 1:
            buffer_np = np.stack(buffer).reshape((-1, n_chars, (n_consonant + n_vowel), 1))
            embeddings[filled:filled + len(buffer)] = encoder.predict(buffer_np)
            filled += len(buffer)
            buffer = []
            if filled % (4 * buffer_size) == 0:
                print("Predicting: {0:.2f}%".format((filled * 100.0 / total)))
    print("finished")
    return embeddings

# Encode the whole vocabulary with the two-stream encoder. This rebinds `embeddings`.
embeddings = pred_embeddings_multi(vocab, multi_enc, char2tup)

Predicting: 13.43%
Predicting: 26.86%
Predicting: 40.29%
Predicting: 53.72%
Predicting: 67.15%
Predicting: 80.58%
Predicting: 94.01%
finished


In [None]:

# for i in range(len(test_vocab)):
#     word = test_vocab[i]
#     result = decode_sequence(infdec, int2char, embeddings[word2int[word]].reshape((1, -1)))
#     result = ''.join(result).strip()#[1:-1]
#     print(word, result)
#     if i ==  10:
#         break
# Decode 10 randomly chosen vocabulary words from their embeddings and print each word
# next to its reconstruction. The old `if i == 10: break` guard could never fire inside
# `range(10)` and has been removed.
rand_vocab = np.random.choice(vocab, 10)
for word in rand_vocab:
    result = decode_multi_sequence(multi_dec, char2tup, tup2char, 
                                   embeddings[word2int[word]].reshape((1, -1)), n_consonant, n_vowel)
    result = ''.join(result).strip()#[1:-1]
    print(word, result)

In [None]:
# Unit-length copy of the embeddings; many later cells read `embedding_normal`.
embedding_normal = normalize(embeddings)

In [None]:
# Evaluate the normalised embeddings.
evaluate(embedding_normal, embed_size=embed_size, word2int=word2int)

In [None]:
# Save the un-normalised embeddings (np.save appends the .npy extension).
np.save("results/char_embedding", embeddings)

In [None]:
# Read the news corpus, split it into sentences on '*', and tokenise on whitespace.
# The file is opened in a `with` block so the handle is closed deterministically.
with open('data/news.txt', encoding='utf-8') as news_file:
    sentenses = news_file.read().split('*')
sentenses = [s.strip().split() for s in sentenses]


In [None]:
# Train a skip-gram (sg=1) Word2Vec model with 10 negative samples on the tokenised
# sentences. This rebinds `model`, which earlier cells use for Keras models.
# NOTE(review): `size` and `iter` are keyword names of older gensim releases — confirm
# against the installed gensim version.
model = gensim.models.Word2Vec(sentenses, 
                            size=128, 
                            iter=20, 
                            min_count=1,
                            negative=10,
                            sg=1,
                            seed=seed_val
                            )

In [None]:
# Score the Word2Vec model on the syntactic and semantic question files.
# NOTE(review): `accuracy` presumably is gensim's analogy evaluation — confirm it exists
# in the installed gensim version.
result1 = model.accuracy('data/syntax.txt')
result2 = model.accuracy('data/semantic.txt')

In [None]:
# Copy the Word2Vec vectors into a matrix indexed by word2int ids and save it.
# Zero-initialised so the skipped '*' row is well defined rather than leftover memory.
sem_embed = np.zeros((len(vocab), 128))
for voc in vocab:
    # Fixed: `is not '*'` compared object identity, which is not a reliable string test.
    if voc != '*':
        sem_embed[word2int[voc]] = model.wv[voc]
np.save("results/word2vec_embedding", sem_embed)

In [None]:
# Combine the two embedding spaces by adding their row-normalised matrices.
sem_embed_normal = normalize(sem_embed)
# emb_norm = normalize(embeddings)
# ee = sem_embed + embeddings*.1366
# full_embed = np.concatenate([5*sem_embed_normal, 2*emb_norm], axis=1)
full_embed_normal = sem_embed_normal + embedding_normal

In [None]:
# NOTE(review): this overwrites the sum computed in the previous cell with a
# re-normalisation of `embedding_normal` alone; normalising `full_embed_normal` may have
# been intended — confirm.
full_embed_normal = normalize(embedding_normal)

In [None]:
# Evaluate the combined embeddings, taking the width from the matrix itself.
evaluate(full_embed_normal, word2int, embed_size=full_embed_normal.shape[1])

In [None]:
# Build and compile the two-layer embedder, then create its training generator.
emu_model, emu_pred = embeder3(128, 128)
# NOTE: the variable is called `adam` but holds a Nadam optimizer.
adam = keras.optimizers.Nadam(0.0001)
# NOTE(review): `sgd` is created but not passed to compile below.
sgd = keras.optimizers.SGD(.01)
emu_model.compile(optimizer=adam, loss='mse')
# These rebind the notebook-wide batch_size (previously 120) and skip_window.
batch_size = 500
skip_window = 5
# NOTE(review): generate4 is defined in a later cell of this notebook, so this cell fails
# on a fresh top-to-bottom run.
gen4 = generate4(int_words, embeddings, word2int, batch_size, skip_window)

In [None]:
# Train the embedder for 4 epochs; an epoch is len(words) // batch_size steps.
n_batches = len(words) // batch_size
history = emu_model.fit_generator(gen4, steps_per_epoch=n_batches, epochs = 4)

In [None]:
# Pass the normalised embeddings through the trained two-layer model.
# NOTE(review): gen4 was built from the un-normalised `embeddings`, while prediction uses
# `embedding_normal` — confirm the mismatch is intended.
myembedding_norm=emu_model.predict(embedding_normal)

In [None]:
# Re-normalise the model output and evaluate it.
myembeddings = normalize(myembedding_norm)
# sem_embed_normal = normalize(sem_embed)
ee = myembeddings #+ sem_embed_normal
evaluate(ee, word2int, embed_size=myembeddings.shape[1])

In [None]:
# min = 1
# b = embedding_normal[0]
# for i in range(len(embedding_normal)):
#     a = embedding_normal[i]
#     d = a.dot(b)
#     if d < min:
#         print(d)
#         min = d
# embedding_normal[0]

In [None]:
def get_batch_words(words, start, length):
    """Return ``length`` items of ``words`` from ``start``, wrapping at the end.

    Args:
        words: sequence that supports slicing and ``+`` concatenation.
        start: index of the first item.
        length: number of items requested.

    Returns:
        (batch, end): the selected items and the index just past the last one
        taken (counted from the start of ``words`` when the batch wrapped).
    """
    stop = start + length
    if stop <= len(words):
        return words[start:stop], stop
    wrapped = stop - len(words)
    return words[start:] + words[0:wrapped], wrapped

def get_context_words(words, start, length):
    """Return a window of ``length`` items, restarting at 0 if it would overrun.

    Args:
        words: sequence that supports slicing.
        start: requested index of the first item.
        length: window size.

    Returns:
        (window, start): the slice and the start index actually used (0 when
        the requested window did not fit).
    """
    begin = 0 if start + length > len(words) else start
    return words[begin:begin + length], begin


In [None]:
def generate(data, embeds, word2int, int2word, unigrams, batch_size, skip_window):
    """Yield (context, target) embedding pairs with random negatives forever.

    For every centre word the ``2 * skip_window`` surrounding words each form a
    pair with it. Afterwards every even-indexed row has its target replaced by a
    randomly drawn embedding, and its label set to the dot product of that random
    vector with the context vector. Odd-indexed rows keep the true target and the
    label 1.

    Args:
        data: list of words (the corpus, in order).
        embeds: 2-D array of embeddings, indexed by ``word2int`` ids.
        word2int: mapping from word to row of ``embeds``.
        int2word: unused.
        unigrams: unused.
        batch_size: rows per batch; must be a multiple of ``2 * skip_window``.
        skip_window: number of context words taken on each side.

    Yields:
        ``([batch_inputs, batch_labels], batch_y)`` with shapes
        (batch_size, embed_size), (batch_size, embed_size) and (batch_size, 1).
    """
    # Fixed: this used to be assigned to the misspelled `embed_szie`, so the
    # arrays below silently used a global `embed_size`.
    embed_size = embeds.shape[1]
    pairs_per_target = skip_window * 2
    # Every centre word fills 2 * skip_window rows. The old check
    # (`batch_size % skip_window`) allowed sizes that overran the batch.
    assert batch_size % pairs_per_target == 0
    ci = skip_window  # current_index
    while True:
        batch_inputs = np.ndarray(shape=(batch_size, embed_size), dtype=np.float32)
        batch_labels = np.ndarray(shape=(batch_size, embed_size), dtype=np.float32)
        # A fresh label array per batch, so earlier batches are not mutated later.
        batch_y = np.ones(shape=(batch_size, 1), dtype=np.float32)
        for batch_index in range(0, batch_size, pairs_per_target):  # fill the batch inputs
            context = list(data[ci - skip_window:ci + skip_window + 1])
            # remove the target from context words
            target = context.pop(skip_window)
            target_vec = embeds[word2int[target]]
            for offset in range(pairs_per_target):
                batch_inputs[batch_index + offset] = embeds[word2int[context[offset]]]
                batch_labels[batch_index + offset] = target_vec
            ci += 1
        if len(data) - ci - skip_window < batch_size:
            ci = skip_window
        # Negative sampling on the even rows.
        for ri in range(0, batch_size, 2):
            batch_labels[ri] = embeds[np.random.randint(len(embeds))]
            batch_y[ri][0] = batch_labels[ri].dot(batch_inputs[ri])
        yield [batch_inputs, batch_labels], batch_y


In [None]:
def generate2(data, embeds, word2int, batch_size, skip_window):
    """Yield (concatenated context embeddings, target embedding) batches forever.

    For every centre word the embeddings of its ``2 * skip_window`` neighbours are
    concatenated into one input row; the label row is the centre word's embedding.

    Args:
        data: list of words (the corpus, in order).
        embeds: 2-D array of embeddings, indexed by ``word2int`` ids.
        word2int: mapping from word to row of ``embeds``.
        batch_size: rows per batch; must be a multiple of ``skip_window``.
        skip_window: number of context words taken on each side.

    Yields:
        ``(batch_inputs, batch_labels)`` with shapes
        (batch_size, 2 * skip_window * embed_size) and (batch_size, embed_size).
    """
    embed_size = embeds.shape[1]
    assert batch_size % skip_window == 0
    ci = skip_window  # current_index
    input_width = embed_size * 2 * skip_window
    while True:
        batch_inputs = np.ndarray(shape=(batch_size, input_width), dtype=np.float32)
        batch_labels = np.ndarray(shape=(batch_size, embed_size), dtype=np.float32)
        for batch_index in range(batch_size):  # fill the batch inputs
            context = list(data[ci - skip_window:ci + skip_window + 1])
            target = context.pop(skip_window)
            # Fixed: these two assignments were commented out, so the generator
            # yielded uninitialised arrays.
            batch_inputs[batch_index] = np.hstack([embeds[word2int[word]] for word in context])
            batch_labels[batch_index] = embeds[word2int[target]]

            ci += 1
        if len(data) - ci - skip_window < batch_size:
            ci = skip_window
        yield batch_inputs, batch_labels


In [None]:
def generate3(data, embeds, word2int, batch_size, skip_window):
    """Yield (left, right) neighbour embeddings with the middle word as label.

    The corpus is walked with a three-word window that advances by one word per
    row; ``get_context_words`` restarts the window at 0 when it would overrun.

    Args:
        data: sequence of words (the corpus, in order).
        embeds: 2-D array of embeddings, indexed by ``word2int`` ids.
        word2int: mapping from word to row of ``embeds``.
        batch_size: rows per batch.
        skip_window: not used for windowing (the window is always 3 words).

    Yields:
        ``([left_batch, right_batch], target_batch)``, each of shape
        (batch_size, embed_size).
    """
    embed_size = embeds.shape[1]
    batch_shape = (batch_size, embed_size)
    cursor = 0  # start of the current three-word window
    while True:
        left_batch = np.ndarray(shape=batch_shape, dtype=np.float32)
        right_batch = np.ndarray(shape=batch_shape, dtype=np.float32)
        target_batch = np.ndarray(shape=batch_shape, dtype=np.float32)
        for row in range(batch_size):
            trigram, cursor = get_context_words(data, cursor, 3)
            cursor += 1
            left_batch[row] = embeds[word2int[trigram[0]]]
            target_batch[row] = embeds[word2int[trigram[1]]]
            right_batch[row] = embeds[word2int[trigram[2]]]

        yield [left_batch, right_batch], target_batch


In [None]:
def generate4(data, embeds, word2int, batch_size, skip_window):
    """Yield (sampled context embedding, target embedding) batches forever.

    For each centre word, ``skip_window`` context words are drawn with
    ``np.random.choice`` from its ``2 * skip_window`` neighbours; each drawn word
    gives one input row, and the centre word's embedding is the label for all of
    them.

    Args:
        data: list of integer word ids (used directly as rows of ``embeds``).
        embeds: 2-D array of embeddings.
        word2int: unused (``data`` already holds ids).
        batch_size: rows per batch; expected to be a multiple of ``skip_window``.
        skip_window: context words per side, and samples drawn per centre word.

    Yields:
        ``(batch_inputs, batch_labels)``, each of shape (batch_size, embed_size).
    """
    embed_size = embeds.shape[1]
    ci = 0  # current_word_index
    while True:
        batch_inputs = np.ndarray(shape=(batch_size, embed_size), dtype=np.float32)
        batch_labels = np.ndarray(shape=(batch_size, embed_size), dtype=np.float32)
        for batch_index in range(0, batch_size, skip_window):  # fill the batch inputs
            context, ci = get_context_words(data, ci, skip_window * 2 + 1)
            ci = ci + 1
            context = list(context)
            target = context.pop(skip_window)
            context = np.random.choice(context, skip_window)

            rows = slice(batch_index, batch_index + skip_window)
            # Fixed: these lookups used the notebook-global `embeddings` rather
            # than the `embeds` argument.
            batch_inputs[rows] = embeds[context]
            batch_labels[rows] = embeds[target]

        yield batch_inputs, batch_labels


In [None]:
# Left/right-neighbour generator over the corpus, using the normalised embeddings.
gen3 = generate3(words, embedding_normal, word2int, batch_size, 3)

In [None]:
# Settings for the semantic-model experiments in the cells below.
window = 3
semantic_batch_size = 120
input_size = 128
# gg =  generate2(words, embeds_norm, word2int, batch_size=semantic_batch_size, skip_window=window) 

In [None]:
# Build and compile the left/right neighbour model (MSE loss).
train_model, con_model, tar_model = embedding_model3(input_size, 128, embed_size)
# NOTE: the variable is called `adam` but holds a Nadam optimizer.
adam = keras.optimizers.Nadam(0.0001)
train_model.compile(optimizer=adam, loss="mse")
train_model.summary()

In [None]:
# Train for 2 epochs.
# NOTE(review): gen3 was created with `batch_size`, while the step count here is derived
# from `semantic_batch_size` — confirm the two are meant to differ.
n_batches = len(words) // semantic_batch_size
history = train_model.fit_generator(gen3, steps_per_epoch=n_batches, epochs = 2)

In [None]:
# Create the negative-sampling generator and print the labels of one batch as a check.
g = generate(words, embedding_normal, word2int, int2word, ns_unigrams, batch_size=semantic_batch_size, skip_window=3)
[a, b], y = next(g)
print(y)

In [None]:
# Input width for the models below. The trailing comment records an alternative
# (concatenated context window) that is not in use.
input_size = 128#window * 2* embed_size
# em_train, em_out = embedding_model(input_size, 128, embed_size)
# adam = keras.optimizers.Nadam(lr=0.002)
# em_train.compile(optimizer=adam, loss='mean_squared_error')
# em_train.summary()

In [None]:
# Build and compile the cosine-similarity model. This rebinds train_model, con_model
# and tar_model.
train_model, con_model, tar_model = embedding_model2(input_size, 128, embed_size)
# NOTE: despite its name, `adam` holds a plain SGD optimizer here.
adam = keras.optimizers.SGD(0.001)
train_model.compile(optimizer=adam, loss="mse", metrics=['mse', 'acc'])
train_model.summary()

In [None]:
# Train the cosine-similarity model for 2 epochs from the negative-sampling generator `g`.
n_batches = len(words) // semantic_batch_size
history = train_model.fit_generator(g, steps_per_epoch=n_batches, epochs = 2)

In [None]:
# Stack the normalised embedding of every word id in order, then project the stack
# through the context branch of the trained model.
context_vecs = np.stack([
    embedding_normal[word2int[int2word[i_word]]]
    for i_word in range(len(vocab))
])
context_embed = con_model.predict(context_vecs)
# target_embed = tar_model.predict(context_vecs)
# em = context_embed + target_embed

In [None]:
# NOTE(review): `full_embed` is only assigned in commented-out code in this notebook, so
# this cell depends on leftover kernel state. `context_embed` from the previous cell or
# `full_embed_normal` may have been intended — confirm.
em_normal = normalize(full_embed)
evaluate(em_normal, word2int, embed_size=em_normal.shape[1])

In [None]:
# Nearest-neighbour lookup for one query word over the normalised embeddings.
# NOTE(review): Utils is called positionally here and by keyword elsewhere in the
# notebook — confirm the positional order (word2int, int2word, embedding).
utils = Utils(word2int,int2word, embedding_normal)
# v = -em_normal[word2int['ገንዘብ']] + em_normal[word2int['ብር']]
# dots = em_normal.dot(v).flatten()
# int2word[np.argmax(dots)]
utils.sorted_sim("ዶላር")
# utils.sorted_sim("ብር")

In [None]:
# Probe the trained two-input model with one pair of word embeddings.
con_input = [
    embedding_normal[word2int['ነበር']].reshape((1, 128)),
    embedding_normal[word2int['ነው']].reshape((1, 128)),
]
vec = train_model.predict(con_input).flatten()
# Dot product of the two (normalised) input embeddings, for comparison.
print(embedding_normal[word2int['ነበር']].dot(embedding_normal[word2int['ነው']]))
# NOTE(review): this dot product needs `vec` to have the embedding width. The model most
# recently bound to train_model (embedding_model2) outputs one similarity value per
# sample, so this line probably dates from the embedding_model3 variant — confirm.
int2word[np.argmax(embedding_normal.dot(vec))]

In [None]:
# Scan all rows and print each new minimum of the dot product with row 0.
# The running minimum used to be stored in `min`, which shadowed the builtin for the
# rest of the kernel session.
min_dot = 1
b = em_normal[0]
for i in range(len(em_normal)):
    a = em_normal[i]
    d = a.dot(b)
    if d < min_dot:
        print(d)
        min_dot = d
# embedding_normal[0]

In [None]:
# semantic = em_out.predict(embeds_norm)
# gensim = GensimWrapper(embed_size, 0, log=True)
# embeds = embeds.reshape((-1, 128))
# norms = np.linalg.norm(semantic, axis=1, keepdims=True)
# semantic_norm = semantic / norms
# For the first occurrence of every word in the corpus, build one vector by
# concatenating the embeddings of its `window` neighbours on each side.
# NOTE(review): `embeds_norm` is never assigned in this notebook, so this cell depends on
# leftover kernel state.
vecs = []
discovered = {}
for i in range(window, len(words) - window):
    context = words[i - window: i + window + 1]
    target = context.pop(window)
    if target not in discovered:
        discovered[target] = len(discovered)
        c_vec = []
        for cword in context:
            vec = embeds_norm[word2int[cword]]
            c_vec.append(vec)
        context_vec = np.hstack(c_vec)
        vecs.append(context_vec)
    # Stop once every vocabulary word has been seen as a target.
    if len(discovered) == len(vocab):
        print("discovered")
        break
    
# NOTE(review): each context vector holds 2 * window embeddings. With input_size equal to
# one embedding width, the reshape yields 2 * window rows per word and the assert below
# fails; it only holds when input_size is the full concatenated width — confirm.
semantic = np.stack(vecs).reshape(-1, input_size)
print(len(vecs), embeds_norm.shape)
assert semantic.shape[0] == embeds_norm.shape[0]
# semantic = em_out.predict(embeds_norm)
# gensim = GensimWrapper(embed_size, 0, log=True)
# embeds = embeds.reshape((-1, 128))
# norms = np.linalg.norm(semantic, axis=1, keepdims=True)
# semantic_norm = semantic / norms
