In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten
from keras.layers import GRU, Conv2D, MaxPooling2D
from keras.layers import Input, Reshape
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
# from keras.utils.vis_utils import plot_model
import keras
from data_handle import *
from gensim_wrapper import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def conv_model(n_input, n_output, n_enc_units, n_dec_units):
    """Build the training model plus the inference encoder and decoder.

    A small convolutional stack turns a (13, 309, 1) input into a single
    state vector of width ``n_dec_units``; that vector is used as the initial
    state of a GRU that reads sequences of 309-wide vectors and emits a
    309-way softmax per time step.

    Args:
        n_input: accepted for interface compatibility; not used by the body
            (the input shape is fixed at (13, 309, 1)).
        n_output: accepted for interface compatibility; not used by the body.
        n_enc_units: accepted for interface compatibility; not used by the body.
        n_dec_units: width of the encoder state and number of GRU units.

    Returns:
        (model, encoder_model, decoder_model) where
        - model maps [root_word_input, target_word_input] to the softmax
          sequence,
        - encoder_model maps root_word_input to the encoder state,
        - decoder_model maps [target_word_input, state] to
          [softmax sequence, new state], sharing the GRU and Dense layers of
          the training model.
    """
    # ---- encoder: two conv/pool stages, then a dense projection ----
    root_word_input = Input(shape=(13, 309, 1), name="root_word_input")

    features = root_word_input
    for n_filters in (16, 8):
        features = Conv2D(n_filters, (3, 3), padding='same', activation='relu')(features)
        features = MaxPooling2D(2, 2)(features)
    features = Flatten()(features)

    encoder_state = Dense(n_dec_units, activation='relu')(features)

    # ---- decoder used for training: GRU seeded with the encoder state ----
    decoder_inputs = Input(shape=(None, 309), name="target_word_input")
    decoder_gru = GRU(n_dec_units, return_sequences=True, return_state=True, name="decoder_gru")
    train_sequence, _ = decoder_gru(decoder_inputs, initial_state=encoder_state)

    decoder_dense = Dense(309, activation='softmax', name="train_output")
    train_probabilities = decoder_dense(train_sequence)

    model = Model([root_word_input, decoder_inputs], train_probabilities)
    encoder_model = Model(root_word_input, encoder_state)

    # ---- decoder used for inference: same layers, state fed in explicitly ----
    decoder_state_input_h = Input(shape=(n_dec_units,))
    infer_sequence, next_state = decoder_gru(decoder_inputs, initial_state=decoder_state_input_h)
    infer_probabilities = decoder_dense(infer_sequence)
    decoder_model = Model(
        [decoder_inputs, decoder_state_input_h],
        [infer_probabilities, next_state],
    )

    return model, encoder_model, decoder_model


In [60]:
def embedding_model(input_size, output_size, embed_size):
    """Build a dense network with a tanh bottleneck and expose the bottleneck.

    Args:
        input_size: width of the flat input vector.
        output_size: width of the final (ReLU) output layer.
        embed_size: width of the tanh bottleneck layer.

    Returns:
        (model, em_model): ``model`` maps the input to the final output;
        ``em_model`` shares the same layers but stops at the bottleneck.
    """
    context_word = Input(shape=(input_size,), name="context_word")

    hidden = Dense(256, activation='relu')(context_word)
    embedding = Dense(embed_size, activation='tanh')(hidden)
    # NOTE(review): the output activation is ReLU, so predictions are never
    # negative - confirm this matches the range of the training targets.
    prediction = Dense(output_size, activation='relu')(embedding)

    full_model = Model(context_word, prediction)
    bottleneck_model = Model(context_word, embedding)
    return full_model, bottleneck_model

In [4]:
# Corpus and vocabulary lookups (helpers come from the star imports above).
words = read_file()
vocab, word2int, int2word = build_vocab(words)
word2freq = get_frequency(words, word2int, int2word)

# Character set tables.
(
    char2int,
    int2char,
    char2tup,
    tup2char,
    n_consonant,
    n_vowel,
) = build_charset()

# Hyperparameters shared by the cells below.
n_chars = 11 + 2                # evaluates to 13
n_features = len(char2int)      # one feature per entry in char2int
batch_size = 128
embed_size = 128

In [5]:
# Batch generator later passed to train.fit_generator.
gen = generate_word_images(vocab, char2int, batch_size)

In [6]:
# Build the training model and the inference encoder/decoder. Only the last
# argument (n_dec_units) is read by conv_model's body; the first three are
# accepted but unused there.
train, infenc, infdec = conv_model(13, 13, embed_size, embed_size)
train.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [7]:
# Print the layer-by-layer structure of the training model.
train.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
root_word_input (InputLayer)    (None, 13, 309, 1)   0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 13, 309, 16)  160         root_word_input[0][0]            
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 6, 154, 16)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 6, 154, 8)    1160        max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
max_poolin

In [8]:
# One epoch = as many full batches as fit in the vocabulary.
n_batches = len(vocab) // batch_size
history = train.fit_generator(gen, steps_per_epoch=n_batches, epochs = 2)

Epoch 1/2
Epoch 2/2


In [9]:
# Encode every vocabulary word with the inference encoder, one word at a time.
# word2int is rebuilt here so that its indices follow the order in which the
# encoder outputs are appended to `embeddings`.
total_words = len(vocab)
print(total_words)

embeddings, word2int = [], {}
i = 0
for word in vocab:
    word2int[word] = len(word2int)

    # Reshape to a single-sample batch matching the encoder input (13, 309, 1).
    vec = word2vec(char2int, word, 13).reshape((1, 13, 309, 1))
    emb = infenc.predict(vec)
    embeddings.append(emb)

    # Progress report (percentage done) every 10000 words.
    if i % 10000 == 0:
        print(i * 100.0 / total_words)
    i += 1

297836
0.0
3.3575524785452395
6.715104957090479
10.072657435635719
13.430209914180958
16.787762392726197
20.145314871271438
23.50286734981668
26.860419828361916
30.217972306907157
33.575524785452394
36.93307726399764
40.290629742542876
43.64818222108811
47.00573469963336
50.363287178178595
53.72083965672383
57.07839213526908
60.435944613814314
63.79349709235955
67.15104957090479
70.50860204945003
73.86615452799528
77.22370700654051
80.58125948508575
83.938811963631
87.29636444217623
90.65391692072147
94.01146939926672
97.36902187781195


In [12]:
# Collapse the list of per-word encoder outputs into one (n_words, embed_size)
# matrix. The width is embed_size because conv_model was built with
# n_dec_units=embed_size, so it is used here instead of a literal 128.
embeds = np.stack(embeddings).reshape((-1, embed_size))
gensim = GensimWrapper(embed_size, 0, log=True)

# L2-normalise each row. The encoder output comes from a ReLU layer, so a row
# can be all zeros; dividing such a row by 1 instead of 0 keeps it at zero
# rather than turning it into NaN. `norms` itself is left untouched.
norms = np.linalg.norm(embeds, axis=1, keepdims=True)
embeds_norm = embeds / np.where(norms == 0, 1.0, norms)

2018-10-24 12:05:07,865 : INFO : collecting all words and their counts
2018-10-24 12:05:07,867 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-24 12:05:07,903 : INFO : PROGRESS: at sentence #10000, processed 150787 words, keeping 38040 word types
2018-10-24 12:05:07,942 : INFO : PROGRESS: at sentence #20000, processed 309265 words, keeping 61588 word types
2018-10-24 12:05:07,987 : INFO : PROGRESS: at sentence #30000, processed 477795 words, keeping 81867 word types
2018-10-24 12:05:08,032 : INFO : PROGRESS: at sentence #40000, processed 647737 words, keeping 100652 word types
2018-10-24 12:05:08,073 : INFO : PROGRESS: at sentence #50000, processed 805941 words, keeping 116075 word types
2018-10-24 12:05:08,115 : INFO : PROGRESS: at sentence #60000, processed 967897 words, keeping 129675 word types
2018-10-24 12:05:08,163 : INFO : PROGRESS: at sentence #70000, processed 1144759 words, keeping 143321 word types
2018-10-24 12:05:08,212 : INFO : PROGRESS

In [14]:
# Hand the normalised encoder embeddings to the wrapper and run its evaluation.
gensim.set_embeddings(word2int, embeds_norm)
gensim.evaluate()

  if np.issubdtype(vec.dtype, np.int):
2018-10-24 12:05:49,655 : INFO : semantic word embedding: 0.0% (0/185)
2018-10-24 12:05:49,655 : INFO : total: 0.0% (0/185)
2018-10-24 12:05:50,074 : INFO : syntactic evaluation: 6.8% (9/132)
2018-10-24 12:05:50,074 : INFO : total: 6.8% (9/132)


72.4770642201835


In [17]:
# NOTE(review): late star import; consider moving it to the import cell at the
# top of the notebook.
from utils import *

# NOTE(review): the variable is named `utils`, the same as the module imported
# from above.
utils = Utils(word2int, embeds_norm)

In [18]:
# Query the helper for one example word; the cell output lists (word, score) pairs.
utils.sorted_sim("ኢትዮጵያን")

[('ኢትዮጵያን', 0.9999999),
 ('አትዮጵያን', 0.9998386),
 ('ኢትዮጵያያን', 0.9997226),
 ('ኢትዮጵያና', 0.9994155),
 ('ኢትዮጵያንን', 0.99931407),
 ('ኢትዮጵያንና', 0.99915123),
 ('ኢትዮዮጵያና', 0.9989984),
 ('ኤትዮጵያና', 0.9989674),
 ('ኢትዮጵያኖች', 0.99885225),
 ('ኢትዮጵያዊን', 0.9987637)]

In [19]:
def generate(data, embeds, word2int, batch_size, skip_window):
    """Yield endless (context vector, target vector) training batches.

    A window of ``2 * skip_window + 1`` consecutive words slides over
    ``data``. For each window the centre word is the target, and every other
    word in the window contributes one row: its embedding as the input and
    the target's embedding as the label.

    Args:
        data: list of words (sliced and ``pop``-ed, so it must be a list).
        embeds: 2-D array of embeddings, indexed by ``word2int`` values.
        word2int: mapping from word to row index in ``embeds``.
        batch_size: rows per batch; must be a multiple of ``2 * skip_window``
            because each target fills exactly that many rows.
        skip_window: number of context words taken on each side of the target.

    Yields:
        (batch_inputs, batch_labels): two float32 arrays of shape
        ``(batch_size, embeds.shape[1])``.

    Raises:
        AssertionError: if ``batch_size`` is not a multiple of
            ``2 * skip_window`` (raised on the first ``next()``).
    """
    # Take the width from the embeddings that were passed in; the original
    # misspelled this local and silently relied on a global of the same name.
    embed_size = embeds.shape[1]

    rows_per_target = 2 * skip_window
    # Rows are written in strides of rows_per_target, so the batch must be a
    # whole number of strides or the last stride runs past the batch.
    assert batch_size % rows_per_target == 0, \
        "batch_size must be a multiple of 2 * skip_window"

    ci = skip_window  # index of the current target word
    while True:
        batch_inputs = np.empty((batch_size, embed_size), dtype=np.float32)
        batch_labels = np.empty((batch_size, embed_size), dtype=np.float32)
        for batch_index in range(0, batch_size, rows_per_target):
            context = data[ci - skip_window:ci + skip_window + 1]
            # remove the target from the context words
            target = context.pop(skip_window)
            target_vec = embeds[word2int[target]]
            for offset in range(rows_per_target):
                # Indexing (rather than iterating) keeps a short window loud:
                # it raises instead of leaving rows unfilled.
                batch_inputs[batch_index + offset] = embeds[word2int[context[offset]]]
                batch_labels[batch_index + offset] = target_vec
            ci += 1
        # Start over when fewer than batch_size positions remain.
        if len(data) - ci - skip_window < batch_size:
            ci = skip_window
        yield batch_inputs, batch_labels


In [56]:
def generate2(data, embeds, word2int, batch_size, skip_window):
    """Yield endless (concatenated context, target vector) training batches.

    A window of ``2 * skip_window + 1`` consecutive words slides over
    ``data`` one position per row. The centre word's embedding is the label;
    the embeddings of the remaining words, concatenated in order, form the
    input row.

    Args:
        data: list of words (sliced and ``pop``-ed, so it must be a list).
        embeds: 2-D array of embeddings, indexed by ``word2int`` values.
        word2int: mapping from word to row index in ``embeds``.
        batch_size: number of rows (targets) per batch.
        skip_window: number of context words taken on each side of the target.

    Yields:
        (batch_inputs, batch_labels): float32 arrays of shape
        ``(batch_size, 2 * skip_window * embed_size)`` and
        ``(batch_size, embed_size)``.
    """
    embed_size = embeds.shape[1]
    assert batch_size % skip_window == 0
    centre = skip_window  # position of the current target word
    input_width = embed_size * 2 * skip_window

    while True:
        batch_inputs = np.empty((batch_size, input_width), dtype=np.float32)
        batch_labels = np.empty((batch_size, embed_size), dtype=np.float32)

        for row in range(batch_size):
            neighbours = data[centre - skip_window:centre + skip_window + 1]
            # Pull the centre word out; what is left is the context.
            target = neighbours.pop(skip_window)
            target_vec = embeds[word2int[target]]
            context_vec = np.hstack([embeds[word2int[word]] for word in neighbours])

            batch_inputs[row] = context_vec
            batch_labels[row] = target_vec
            centre += 1

        # Rewind once there is not enough data left for another full batch.
        if len(data) - centre - skip_window < batch_size:
            centre = skip_window
        yield batch_inputs, batch_labels


In [70]:
# Context window (words on each side of the target) and batch size for the
# second-stage model; gg yields (concatenated context, target) batches.
window = 3
semantic_batch_size = 120
gg =  generate2(words, embeds_norm, word2int, batch_size=semantic_batch_size, skip_window=window) 

In [71]:
# Pairwise (context word -> target word) generator.
# NOTE(review): `g` is not consumed by any cell visible here - training below uses `gg`.
g = generate(words, embeds_norm, word2int, batch_size=120, skip_window=5)

In [72]:
# Each input row is the concatenation of 2 * window context embeddings.
input_size = window * 2 * embed_size

# The labels produced by generate2 are single embeddings, so the output width
# is embed_size (previously a literal 128 that merely happened to match).
em_train, em_out = embedding_model(input_size, embed_size, embed_size)

# NOTE: despite its name, `adam` holds a Nadam optimizer; the name is kept so
# any other cell referring to it keeps working.
adam = keras.optimizers.Nadam(lr=0.002)
em_train.compile(optimizer=adam, loss='mean_squared_error')
em_train.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
context_word (InputLayer)    (None, 768)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 256)               196864    
_________________________________________________________________
dense_25 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_26 (Dense)             (None, 128)               16512     
Total params: 246,272
Trainable params: 246,272
Non-trainable params: 0
_________________________________________________________________


In [73]:
# Steps per epoch must follow the batch size the generator `gg` was built
# with, so derive it from semantic_batch_size instead of repeating 120.
n_batches = len(words) // semantic_batch_size
history = em_train.fit_generator(gg, steps_per_epoch=n_batches, epochs=2)

Epoch 1/2
Epoch 2/2


In [89]:
# Build one context vector per vocabulary word, taken from the word's first
# occurrence as the centre of a (2 * window + 1)-word window in the corpus.
#
# Rows are written at word2int[target] so that row k of `semantic` belongs to
# the word whose index is k, the same layout as embeds_norm. Stacking the
# vectors in discovery order (as before) left the rows misaligned with
# word2int, which is what the matrix is paired with downstream.
#
# NOTE(review): `semantic` holds the raw concatenated context embeddings
# (width input_size). Presumably these are meant to be passed through
# em_out.predict and L2-normalised before evaluation - confirm.
semantic = np.zeros((len(word2int), input_size), dtype=embeds_norm.dtype)
discovered = {}
for i in range(window, len(words) - window):
    target = words[i]
    if target in discovered:
        continue
    discovered[target] = len(discovered)

    # Words on both sides of the target, in corpus order.
    context = words[i - window:i] + words[i + 1:i + window + 1]
    semantic[word2int[target]] = np.hstack(
        [embeds_norm[word2int[cword]] for cword in context]
    )

    if len(discovered) == len(vocab):
        print("discovered")
        break

print(len(discovered), embeds_norm.shape)
# Every vocabulary word must have been seen as a window centre; otherwise its
# row would still be all zeros.
assert len(discovered) == embeds_norm.shape[0], \
    "some vocabulary words never occur as the centre of a full window"


discovered
297836 (297836, 128)


In [45]:
# Evaluate the `semantic` matrix with the same wrapper used for embeds_norm.
# NOTE(review): this cell's execution count (45) precedes that of the cell that
# builds `semantic` above (89), so the recorded output may come from a different
# `semantic`. As built in the visible cell, `semantic` has width input_size,
# while `gensim` was constructed with embed_size - confirm this is intended.
gensim.set_embeddings(word2int, semantic)
gensim.evaluate()

  if np.issubdtype(vec.dtype, np.int):
2018-10-24 12:24:49,931 : INFO : semantic word embedding: 0.0% (0/185)
2018-10-24 12:24:49,932 : INFO : total: 0.0% (0/185)
2018-10-24 12:24:50,134 : INFO : syntactic evaluation: 0.0% (0/132)
2018-10-24 12:24:50,135 : INFO : total: 0.0% (0/132)


22.01834862385321
