In [40]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten
from keras.layers import GRU, Conv2D, MaxPooling2D, Embedding
from keras.layers import Input, Reshape, Dot, Add
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
# from keras.utils.vis_utils import plot_model
import keras
import keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *
import gensim
import random
import numpy as np
import tensorflow as tf
import gc
# Seed Python's `random`, NumPy and TensorFlow with one shared value so that
# shuffling and weight initialisation start from the same state on every run.
seed_val = 1000
random.seed(seed_val)
np.random.seed(seed_val)
tf.set_random_seed(seed_val)

In [5]:
def create_model(num_tokens, seq_length, embed_size, n_units):
    """Build a GRU encoder mapping a sequence of word ids to one vector.

    Args:
        num_tokens: vocabulary size, used as the Embedding layer's ``input_dim``.
        seq_length: number of word ids in each input sequence.
        embed_size: width of the learned embedding (``output_dim``).
        n_units: width of the final linear Dense layer, i.e. the output size.

    Returns:
        An uncompiled Keras ``Model`` with one int32 input of shape
        ``(seq_length,)`` per sample and an ``n_units``-wide output.
    """
    token_ids = Input(shape=(seq_length, ), dtype='int32')
    embedded = Embedding(
        input_dim=num_tokens,
        output_dim=embed_size,
        input_length=seq_length,
    )(token_ids)

    # The recurrent width is fixed at 128; only the projection is configurable.
    encoded = GRU(128, name="first_gru")(embedded)
    projected = Dense(n_units, activation='linear')(encoded)
    return Model(token_ids, projected)
    

In [96]:
def create_coocur(embed_size, window_size, n_units):
    """Build the co-occurrence model: (context vector, position) -> target vector.

    Args:
        embed_size: width of the word-vector input and of the model output.
        window_size: width of the position input vector.
        n_units: width of the hidden sigmoid layer.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are
        ``[word_encoding, word_pos]`` (both float32) and whose output has
        ``embed_size`` units with a linear activation.
    """
    encoding_in = Input(shape=(embed_size, ), dtype='float32')
    position_in = Input(shape=(window_size, ), dtype='float32')

    # Squash the position vector down to two tanh features before it is
    # joined to the word vector.
    position_feats = Dense(2, activation="tanh")(position_in)
    joined = Concatenate(axis=1)([encoding_in, position_feats])

    hidden = Dense(n_units, activation='sigmoid')(joined)
    predicted = Dense(embed_size, activation='linear')(hidden)
    return Model([encoding_in, position_in], predicted)
    

In [97]:
# Corpus, vocabulary and character-set tables. The helpers used here are not
# defined in this notebook; presumably they come from the star imports above.
words = read_file()
vocab, word2int, int2word = build_vocab(words)
int_words = words_to_ints(word2int, words)
word2freq = get_frequency(words, word2int, int2word)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowels = build_charset()
# NOTE(review): .75 is presumably a unigram smoothing exponent -- confirm against ns_sample.
ns_unigrams = ns_sample(word2freq, word2int, int2word, .75)
# Hyper-parameters shared by the cells below.
n_chars = 11 + 2 
n_features = len(char2int)
# gen_co asserts batch_size % window_size == 0, so keep this a multiple of skip_window.
batch_size = 120
# Used as the input/output width of create_coocur; it has to equal
# embeddings.shape[1] because the generators emit rows of that width.
embed_size = 50
# Context words on each side of the target; the position input is 2 * skip_window + 1 wide.
skip_window = 5
seq_len = 5
# Pre-computed word vectors, passed through the project's normalize() helper.
embeddings = normalize(np.load('results/seq_encoding.npy'))

In [138]:
def generate(int_words, int2word, char2tup, embeddings, seq_length, batch_size,n_chars, n_cons, n_vows):
    """Endlessly yield (word-id sequences, target vectors) batches.

    Each row pairs the first ``seq_length`` ids of a window returned by
    ``get_context_words`` with the embedding of that window's last id.

    Args:
        int_words: corpus as a sequence of word ids.
        int2word, char2tup, n_chars, n_cons, n_vows: accepted but not used here.
        embeddings: 2-D array of word vectors, indexed by word id.
        seq_length: number of ids per input row.
        batch_size: number of rows per yielded batch.

    Yields:
        ``(batch_inputs, batch_output)`` with shapes ``(batch_size, seq_length)``
        (int32) and ``(batch_size, embeddings.shape[1])`` (float32).
    """
    vec_dim = embeddings.shape[1]
    cursor = 0
    while True:
        id_rows = np.empty((batch_size, seq_length), dtype=np.int32)
        target_rows = np.empty((batch_size, vec_dim), dtype=np.float32)
        for row in range(batch_size):
            # NOTE(review): gen_co passes a span length as the third argument,
            # whereas this call passes `cursor + seq_length + 1`. The helper is
            # not visible here -- confirm which meaning it expects.
            window, cursor = get_context_words(int_words, cursor, cursor + seq_length + 1)
            cursor += 1
            target_id = window[-1]
            id_rows[row] = window[:seq_length]
            target_rows[row] = embeddings[target_id]
        yield id_rows, target_rows
        
def gen_co(int_words, int2word, embeddings, batch_size, window_size):
    """Endlessly yield co-occurrence batches: (context vector, position) -> target vector.

    For each window of ``2 * window_size + 1`` ids obtained from
    ``get_context_words``, ``window_size`` distinct non-centre offsets are drawn
    at random. Each drawn offset produces one row: the embedding of the word at
    that offset, a one-hot vector marking the offset, and the embedding of the
    centre word as the regression target.

    Args:
        int_words: corpus as a sequence of word ids.
        int2word: accepted but not used here.
        embeddings: 2-D array of word vectors, indexed by word id.
        batch_size: rows per batch; must be a multiple of ``window_size``.
        window_size: number of context positions on each side of the centre.

    Yields:
        ``([batch_inputs, batch_poses], batch_output)`` as float32 arrays of
        shapes ``(batch_size, dim)``, ``(batch_size, 2 * window_size + 1)`` and
        ``(batch_size, dim)``, where ``dim`` is ``embeddings.shape[1]``.
    """
    assert batch_size % window_size == 0
    vec_dim = embeddings.shape[1]
    span = window_size * 2 + 1
    cursor = 0
    # Every offset inside the window except the centre one.
    context_offsets = [offset for offset in range(span) if offset != window_size]
    while True:
        context_vecs = np.empty((batch_size, vec_dim), dtype=np.float32)
        position_onehots = np.empty((batch_size, span), dtype=np.float32)
        target_vecs = np.empty((batch_size, vec_dim), dtype=np.float32)
        for base in range(0, batch_size, window_size):
            window, cursor = get_context_words(int_words, cursor, span)
            target_id = window[window_size]
            # Shuffle in place and keep the first `window_size` offsets:
            # a sample without replacement from the context positions.
            np.random.shuffle(context_offsets)
            centre_vec = embeddings[target_id]
            for row, offset in enumerate(context_offsets[:window_size], start=base):
                context_vecs[row] = embeddings[window[offset]]
                onehot = np.zeros((span,), dtype=np.int32)
                onehot[offset] = 1
                position_onehots[row] = onehot
                target_vecs[row] = centre_vec
            cursor = cursor + 1
        yield [context_vecs, position_onehots], target_vecs

In [139]:
def evaluate(final_embedding, word2int, embed_size):
    """Load the embeddings into a ``GensimWrapper`` and print its evaluation scores.

    Args:
        final_embedding: embedding matrix handed to ``set_embeddings``.
        word2int: word -> id mapping handed to ``set_embeddings``.
        embed_size: embedding width handed to the wrapper's constructor.

    Prints every entry of the wrapper's evaluation result as ``key: value%``
    (two decimals), all on one line, followed by a newline. Returns nothing.
    """
    # Named `wrapper` so the local does not shadow the imported `gensim` module.
    wrapper = GensimWrapper('data/news.txt', embed_size, 0, log=False)
    wrapper.set_embeddings(word2int, final_embedding)
    scores = wrapper.evaluate()
    for metric in scores:
        entry = "{0}: {1:.2f}%".format(metric, scores[metric])
        print(entry, end=' ')
    print()

In [148]:
def cosine_loss(yTrue, yPred):
    """Cosine-distance loss, summed over the batch.

    For every row the loss is ``1 - cos(yTrue_row, yPred_row)``; the rows are
    then summed into a single scalar.

    The earlier version used ``K.dot``, which is a matrix product and is not
    shape-compatible for two ``(batch, dim)`` tensors. It also called
    ``K.l2_normalize`` without an axis, which scales by the norm of the whole
    tensor instead of each row's norm. Both are replaced by a row-wise
    computation along the last axis.

    Args:
        yTrue: target vectors; assumed to be ``(batch, dim)``, as emitted by the
            generators in this notebook.
        yPred: predicted vectors of the same shape.

    Returns:
        Scalar tensor: 0 when every prediction points in the same direction as
        its target, ``2 * batch`` when every prediction points the opposite way.
    """
    true_unit = K.l2_normalize(yTrue, axis=-1)
    pred_unit = K.l2_normalize(yPred, axis=-1)
    # Element-wise product summed over the feature axis is the per-row cosine.
    cosine = K.sum(true_unit * pred_unit, axis=-1)
    loss = K.sum(1 - cosine)
    return loss

In [140]:
# Batch generator feeding the co-occurrence model. The commented-out line is the
# alternative generator for the sequence model built by create_model.
# gen = generate(int_words, int2word, char2tup, embeddings, seq_len, batch_size, n_chars, n_consonant, n_vowels)
gen = gen_co(int_words, int2word, embeddings, batch_size, skip_window)

In [153]:
# Alternative: the GRU sequence model (kept for reference, currently disabled).
# model = create_model(len(vocab), seq_length=seq_len, embed_size=embed_size, n_units=embeddings.shape[1])
# adam = keras.optimizers.Nadam(0.01)
# model.compile(optimizer=adam, loss='mse', metrics=['mse'])
# model.summary()

# When this cell is re-run, discard the previous model and its backend session
# before building a new one. Only the "no previous model" case is expected and
# ignored; any other failure (or an interrupt) is allowed to surface instead of
# being silently swallowed.
try:
    del model
    keras.backend.clear_session()
    gc.collect()
except NameError:
    # First run after a kernel restart: there is nothing to clean up.
    pass

# Co-occurrence model: the position input is 2 * skip_window + 1 wide, matching
# the one-hot position vectors produced by gen_co.
model = create_coocur(embed_size, skip_window * 2 + 1, 128)
# The variable is called `adam`, but the optimizer is Nadam.
adam = keras.optimizers.Nadam(0.001)
model.compile(optimizer=adam, loss="cosine_proximity")
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 11)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 2)            24          input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 52)           0           input_1[0][0]                    
                                                                 dense_1[0][0]                    
__________

In [154]:
# One "epoch" here is len(words) // batch_size generator steps.
# NOTE(review): gen_co itself advances its corpus cursor once per group of
# window_size rows, so this step count presumably covers only part of the
# corpus per epoch -- confirm that is intended.
n_batches = len(words) // batch_size
history = model.fit_generator(gen, steps_per_epoch=n_batches, epochs = 2)

Epoch 1/2
Epoch 2/2


In [None]:
# Take the first weight array of the model's second layer as the embedding matrix.
# NOTE(review): this indexing looks written for create_model, whose second layer
# is the Embedding. In the co-occurrence model summary printed above, the second
# listed layer is an InputLayer with 0 parameters, so get_weights()[0] would
# presumably raise IndexError there -- confirm which model this cell targets.
lm_embeddings = model.layers[1].get_weights()[0]

In [None]:
# Print the evaluation scores for the extracted embeddings (see evaluate() above).
evaluate(lm_embeddings, word2int, embed_size)

In [None]:
# Wrap the normalized embeddings and both vocabulary mappings in the project's
# Utils helper for interactive inspection.
util = Utils(embedding=normalize(lm_embeddings), int2word=int2word, word2int=word2int)

In [None]:
# Spot-check one word; presumably sorted_sim ranks the vocabulary by similarity
# to the query -- see Utils.sorted_sim for the exact output.
util.sorted_sim("ነበር")