In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten
from keras.layers import GRU, Conv2D, MaxPooling2D, Embedding
from keras.layers import Input, Reshape, Dot, Add
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
# from keras.utils.vis_utils import plot_model
import keras
import keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *
import gensim
import random
import numpy as np
import tensorflow as tf
# Seed Python's `random`, NumPy and TensorFlow with the same value so that
# runs started from a fresh kernel draw the same random numbers.
seed_val = 1000
random.seed(seed_val)
np.random.seed(seed_val)
tf.set_random_seed(seed_val)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def create_model(num_tokens, seq_length, embed_size, n_units, gru_units=128):
    """Build an uncompiled Embedding -> GRU -> linear Dense model.

    Parameters
    ----------
    num_tokens : int
        Passed to the Embedding layer as ``input_dim``.
    seq_length : int
        Number of integer ids in each input sequence.
    embed_size : int
        Passed to the Embedding layer as ``output_dim``.
    n_units : int
        Number of units in the final Dense layer, i.e. the model's output
        width. It does NOT control the GRU size.
    gru_units : int, optional
        Number of units in the GRU layer. Defaults to 128, the value that
        used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    keras.models.Model
        Model taking int32 input of shape ``(batch, seq_length)`` and
        producing output of shape ``(batch, n_units)``.
    """
    int_words_input = Input(shape=(seq_length, ), dtype='int32')
    embedding = Embedding(input_dim=num_tokens, output_dim=embed_size, input_length=seq_length)(int_words_input)

    # Only the GRU's final state is kept (return_sequences is left at its
    # default), so the Dense layer sees one vector per input sequence.
    x = GRU(gru_units, name="first_gru")(embedding)
    # Linear activation: the output is an unconstrained real-valued vector.
    x = Dense(n_units, activation='linear')(x)
    main_model = Model(int_words_input, x)
    return main_model
    

In [4]:
# Corpus and vocabulary setup. The helpers called here are not defined in this
# notebook; presumably they come from the star imports above (data_handle /
# utils) -- TODO confirm which module owns each one.
words = read_file()
vocab, word2int, int2word = build_vocab(words)
int_words = words_to_ints(word2int, words)
word2freq = get_frequency(words, word2int, int2word)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowels = build_charset()
# NOTE(review): the trailing .75 looks like a unigram smoothing exponent for
# negative sampling -- confirm against ns_sample's definition.
ns_unigrams = ns_sample(word2freq, word2int, int2word, .75)
# NOTE(review): the meaning of "11 + 2" is not documented here -- TODO confirm.
n_chars = 11 + 2 
n_features = len(char2int)
# Hyper-parameters used by the cells below.
batch_size = 120
embed_size = 100
skip_window = 5
seq_len = 50
# Pre-computed array loaded from disk. generate() indexes its rows by word id
# to obtain regression targets, and its second dimension sets the model's
# output width.
embeddings = np.load('results/seq_encoding.npy')

In [5]:
def generate(int_words, int2word, char2tup, embeddings, seq_length, batch_size,n_chars, n_cons, n_vows):
    """Endlessly yield ``(inputs, targets)`` training batches.

    Each row pairs the first ``seq_length`` ids of a window with the
    ``embeddings`` row of the window's last id. Windows come from
    ``get_context_words`` (defined outside this notebook; presumably it returns
    the requested slice of ``int_words`` plus an adjusted position -- TODO
    confirm).

    ``int2word``, ``char2tup``, ``n_chars``, ``n_cons`` and ``n_vows`` are
    accepted but never read by this function.

    Yields
    ------
    tuple of np.ndarray
        ``inputs`` with shape ``(batch_size, seq_length)`` and dtype int32,
        ``targets`` with shape ``(batch_size, embeddings.shape[1])`` and dtype
        float32. Fresh arrays are allocated for every batch.
    """
    target_dim = embeddings.shape[1]
    cursor = 0
    while True:
        # Uninitialised buffers are fine: every row is written below.
        inputs = np.empty((batch_size, seq_length), dtype=np.int32)
        targets = np.empty((batch_size, target_dim), dtype=np.float32)
        for row in range(batch_size):
            window, cursor = get_context_words(int_words, cursor, cursor + seq_length + 1)
            # Advance by one on top of whatever position the helper returned.
            cursor += 1
            target_id = window[-1]
            inputs[row] = window[:seq_length]
            targets[row] = embeddings[target_id]
        yield inputs, targets


In [6]:
def evaluate(final_embedding, word2int, embed_size):
    """Score an embedding matrix via ``GensimWrapper`` and print the results.

    A wrapper is created for 'data/news.txt', given the supplied embeddings,
    and asked to evaluate them. Every entry of the returned mapping is printed
    on a single line as ``key: value%`` with two decimals, followed by a
    newline. Nothing is returned.
    """
    # Named `wrapper` rather than `gensim` so the imported module of that name
    # is not shadowed inside this function.
    wrapper = GensimWrapper('data/news.txt', embed_size, 0, log=False)
    wrapper.set_embeddings(word2int, final_embedding)
    scores = wrapper.evaluate()
    for name in scores:
        print("{0}: {1:.2f}%".format(name, scores[name]), end=' ')
    print()

In [7]:
# Create the batch generator. generate() is a generator function, so no batch
# is produced until something iterates over `gen`.
gen = generate(int_words, int2word, char2tup, embeddings, seq_len, batch_size, n_chars, n_consonant, n_vowels)

In [8]:
# Build the network with an output width equal to the second dimension of the
# loaded `embeddings` array, so predictions can be regressed onto its rows.
model = create_model(len(vocab), seq_length=seq_len, embed_size=embed_size, n_units=embeddings.shape[1])
# NOTE: despite the variable name, this is a Nadam optimizer (first positional
# argument 0.01), not Adam.
adam = keras.optimizers.Nadam(0.01)
# Mean squared error is used both as the training loss and as the reported metric.
model.compile(optimizer=adam, loss='mse', metrics=['mse'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 100)           29783600  
_________________________________________________________________
first_gru (GRU)              (None, 128)               87936     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6450      
Total params: 29,877,986
Trainable params: 29,877,986
Non-trainable params: 0
_________________________________________________________________


In [None]:
# NOTE(review): steps_per_epoch is derived from the vocabulary size, not from
# len(int_words). If int_words is the full token sequence, one "epoch" here is
# not one pass over the corpus -- confirm this is intended.
n_batches = len(vocab) // batch_size
# `gen` never terminates, so steps_per_epoch is what bounds each epoch.
history = model.fit_generator(gen, steps_per_epoch=n_batches, epochs = 12)

Epoch 1/12

In [None]:
# layers[1] is the Embedding layer (second entry in the model summary, after
# the input layer); its first weight array is taken as the word-embedding matrix.
lm_embeddings = model.layers[1].get_weights()[0]

In [None]:
# Print the evaluation scores for the embeddings taken from the trained model.
evaluate(lm_embeddings, word2int, embed_size)

In [None]:
# Utils and normalize are not defined in this notebook; presumably they come
# from the star-imported `utils` module -- TODO confirm.
util = Utils(embedding=normalize(lm_embeddings), int2word=int2word, word2int=word2int)

In [None]:
# Spot-check one query word; presumably sorted_sim ranks vocabulary words by
# similarity to it -- verify against Utils.
util.sorted_sim("ነበር")