In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten
from keras.layers import GRU, Conv2D, MaxPooling2D
from keras.layers import Input, Reshape
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
# from keras.utils.vis_utils import plot_model
import keras
from data_handle import *
from gensim_wrapper import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def conv_model(n_input, n_output, n_enc_units, n_dec_units):
    """Build the CNN-encoder / GRU-decoder networks for training and inference.

    The encoder reads a fixed ``(13, 309, 1)`` input, runs it through two
    Conv2D + MaxPooling2D stages, flattens the result and projects it to
    ``n_dec_units`` values. That vector is the initial hidden state of a GRU
    decoder, which consumes ``(timesteps, 309)`` sequences and emits a
    309-way softmax per timestep.

    Args:
        n_input: Accepted for interface compatibility; not used in the body.
        n_output: Accepted for interface compatibility; not used in the body.
        n_enc_units: Accepted for interface compatibility; not used in the body.
        n_dec_units: Width of the encoder's output vector and of the GRU state.

    Returns:
        Tuple ``(model, encoder_model, decoder_model)``:
        ``model`` maps ``[root_word_input, target_word_input]`` to per-step
        softmax outputs; ``encoder_model`` maps ``root_word_input`` to the
        initial decoder state; ``decoder_model`` maps
        ``[target_word_input, state]`` to ``[softmax outputs, new state]`` and
        shares the GRU and output layers with ``model``.
    """
    # ---- encoder: two conv/pool stages, then a dense projection ----
    word_image = Input(shape=(13, 309, 1), name="root_word_input")

    features = word_image
    for n_filters in (16, 8):
        features = Conv2D(n_filters, (3, 3), padding='same', activation='relu')(features)
        features = MaxPooling2D(2, 2)(features)

    features = Flatten()(features)
    encoded_state = Dense(n_dec_units, activation='relu')(features)

    # ---- training decoder: GRU seeded with the encoder's vector ----
    target_seq = Input(shape=(None, 309), name="target_word_input")
    shared_gru = GRU(n_dec_units, return_sequences=True, return_state=True, name="decoder_gru")
    train_seq, _ = shared_gru(target_seq, initial_state=encoded_state)

    shared_softmax = Dense(309, activation='softmax', name="train_output")
    train_probs = shared_softmax(train_seq)

    model = Model([word_image, target_seq], train_probs)
    encoder_model = Model(word_image, encoded_state)

    # ---- inference decoder: same layers, but the state is fed in explicitly ----
    state_in = Input(shape=(n_dec_units,))
    step_seq, state_out = shared_gru(target_seq, initial_state=state_in)

    step_probs = shared_softmax(step_seq)
    decoder_model = Model([target_seq, state_in], [step_probs, state_out])

    return model, encoder_model, decoder_model


In [4]:
def embedding_model(embed_size):
    """Build a two-layer dense network that maps one embedding to another.

    Args:
        embed_size: Width of the input vector, the hidden layer and the output.

    Returns:
        Tuple ``(model, em_model)``. ``model`` maps the ``context_word`` input
        through a tanh layer and then a relu layer; ``em_model`` shares the
        input and the tanh layer and stops at the tanh layer's output.
    """
    source_vec = Input(shape=(embed_size,), name="context_word")
    hidden = Dense(embed_size, activation='tanh')(source_vec)
    predicted = Dense(embed_size, activation='relu')(hidden)
    return Model(source_vec, predicted), Model(source_vec, hidden)

In [5]:
# Load the corpus and derive the vocabulary and character tables.
# NOTE(review): read_file, build_vocab, get_frequency and build_charset are not
# defined in this notebook; presumably they come from the star imports — confirm.
words = read_file()
vocab, word2int, int2word = build_vocab(words)
word2freq = get_frequency(words, word2int, int2word)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()

# Settings shared by the cells below.
n_chars = 11 + 2  # = 13, the first dimension of conv_model's input; presumably 11 characters plus 2 markers — TODO confirm
n_features = len(char2int)
batch_size = 128
embed_size = 128  # passed to conv_model as the encoder-output / GRU-state width

In [6]:
# Batch generator later passed to train.fit_generator.
# generate_word_images is defined outside this notebook.
gen = generate_word_images(vocab, char2int, batch_size)

In [7]:
# Build the training model plus the inference encoder/decoder that share its
# layers, then compile only the training model.
train, infenc, infdec = conv_model(
    n_input=n_chars,
    n_output=n_chars,
    n_enc_units=embed_size,
    n_dec_units=embed_size,
)
train.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [8]:
# Print the layer-by-layer structure of the training model.
train.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
root_word_input (InputLayer)    (None, 13, 309, 1)   0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 13, 309, 16)  160         root_word_input[0][0]            
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 6, 154, 16)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 6, 154, 8)    1160        max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
max_poolin

In [None]:
# One epoch is len(vocab) // batch_size generator steps.
n_batches = len(vocab) // batch_size
history = train.fit_generator(gen, steps_per_epoch=n_batches, epochs = 2)

Epoch 1/2


In [None]:
# Encode every vocabulary word with the inference encoder and collect the
# resulting vectors, rebuilding word2int so index k is the k-th collected vector.
embeddings = []
word2int = {}  # replaces the mapping returned earlier by build_vocab
i = 0
print(len(vocab))
for word in vocab:
    word2int[word] = len(word2int)
    # word2vec is defined outside this notebook; its result is reshaped into a
    # single-sample batch matching the encoder input (1, 13, 309, 1).
    vec = word2vec(char2int, word, 13).reshape((1, 13, 309, 1))
    emb = infenc.predict(vec)  # one predict call per word
    embeddings.append(emb)
    if i % 10000 == 0:
        # progress as a percentage of the vocabulary
        print(i * 100.0 / len(vocab))
    i+=1

In [None]:
# Turn the per-word encoder outputs into one (n_words, embed_size) matrix and
# L2-normalise every row before handing it to the evaluation wrapper.
gensim = GensimWrapper(embed_size, 0, log=True)
# `embeds` is built here from the `embeddings` list; previously it was read
# before ever being assigned, which fails on a fresh kernel.
embeds = np.asarray(embeddings).reshape((-1, embed_size))
norms = np.linalg.norm(embeds, axis=1, keepdims=True)
# The encoder ends in a ReLU layer, so an all-zero row is possible; leave such
# rows as zeros instead of dividing by zero and producing NaN.
norms[norms == 0] = 1.0
embeds = embeds / norms

In [None]:
# Register the normalised word vectors with the wrapper and run its evaluation.
# GensimWrapper is defined outside this notebook.
gensim.set_embeddings(word2int, embeds)
gensim.evaluate()

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the top import cell.
from utils import *

# Utils is not defined in this notebook (presumably provided by the star import above — confirm).
utils = Utils(word2int, embeds)

In [None]:
# Spot check on a single query word; presumably lists vocabulary words ranked by
# similarity to it — confirm against Utils.sorted_sim.
utils.sorted_sim("ኢትዮጵያን")

In [None]:
def generate(data, embeds, word2int, batch_size, skip_window):
    """Endlessly yield (context, target) embedding batches from a sliding window.

    A window of ``skip_window`` words on each side slides over ``data``. Each
    centre word contributes ``2 * skip_window`` rows to the batch: every
    neighbour's embedding is an input row and the centre word's embedding is
    the matching label row. When too few words remain for another full batch,
    the window restarts at the beginning of ``data``.

    Args:
        data: Sequence of words in corpus order.
        embeds: 2-D numpy array of word vectors, indexed through ``word2int``.
        word2int: Mapping from word to its row index in ``embeds``.
        batch_size: Rows per batch; must be a multiple of ``2 * skip_window``.
        skip_window: Number of context words taken on each side of the centre.

    Yields:
        ``(batch_inputs, batch_labels)``: two float32 arrays of shape
        ``(batch_size, embeds.shape[1])``.

    Raises:
        AssertionError: if ``skip_window`` is not positive or ``batch_size``
            is not a multiple of ``2 * skip_window``.
        ValueError: if ``data`` is too short to fill a single batch.
    """
    # Take the vector width from the array itself rather than from a global.
    embed_size = embeds.shape[1]
    window = 2 * skip_window  # rows written per centre word

    # The fill loop strides by `window`, so that is the divisor that must fit;
    # checking only `skip_window` lets the last group run past the batch end.
    assert skip_window > 0 and batch_size % window == 0, (
        "batch_size must be a multiple of 2 * skip_window")

    centres_per_batch = batch_size // window
    if len(data) < window + centres_per_batch:
        raise ValueError(
            "data has %d items; at least %d are needed to fill one batch"
            % (len(data), window + centres_per_batch))

    ci = skip_window  # index of the current centre word
    while True:
        batch_inputs = np.empty((batch_size, embed_size), dtype=np.float32)
        batch_labels = np.empty((batch_size, embed_size), dtype=np.float32)
        for start in range(0, batch_size, window):
            # Neighbours on both sides of the centre word, centre excluded.
            context = (list(data[ci - skip_window:ci])
                       + list(data[ci + 1:ci + skip_window + 1]))
            rows = [word2int[word] for word in context]
            batch_inputs[start:start + window] = embeds[rows]
            # The same target vector is broadcast over all rows of this group.
            batch_labels[start:start + window] = embeds[word2int[data[ci]]]
            ci += 1
        # Wrap around once fewer than batch_size words remain ahead.
        if len(data) - ci - skip_window < batch_size:
            ci = skip_window
        yield batch_inputs, batch_labels


In [None]:
# Context/target batch generator over the corpus: 5 words on each side of the
# centre word, 120 rows per batch.
g = generate(words, embeds, word2int, batch_size=120, skip_window=5)

In [None]:
# em_train is the full two-layer network; em_out stops at its tanh layer.
em_train, em_out = embedding_model(embed_size)
# NOTE: despite the variable name, this is the Nadam optimizer.
adam = keras.optimizers.Nadam(lr=0.002)
em_train.compile(optimizer=adam, loss='mean_squared_error')
em_train.summary()

In [None]:
# 120 is the batch_size that was passed to generate() when `g` was created.
n_batches = len(words) // 120
history = em_train.fit_generator(g, steps_per_epoch=n_batches, epochs = 2)

In [None]:
# Run every word vector through the tanh layer of the trained embedding model,
# then L2-normalise each resulting row.
semantic = em_out.predict(embeds)
gensim = GensimWrapper(embed_size, 0, log=True)
# NOTE(review): `embeds` is reshaped here but not used again in this cell.
embeds = embeds.reshape((-1, 128))
norms = np.linalg.norm(semantic, axis=1, keepdims=True)
semantic = semantic / norms

In [None]:
# Evaluate the transformed ("semantic") vectors using the same wrapper calls as
# the earlier evaluation of `embeds`.
gensim.set_embeddings(word2int, semantic)
gensim.evaluate()