In [None]:
import random
seed_val = 1000
random.seed(seed_val)
import numpy as np
np.random.seed(seed_val)
import tensorflow as tf
# Seed TensorFlow too: Keras weight initialisers draw from TensorFlow's own
# generator, so seeding only `random` and NumPy leaves training runs
# irreproducible. The old top-level `tf.set_random_seed` no longer exists in
# TF 2.x, hence the version-tolerant lookup below.
if hasattr(tf.random, "set_seed"):
    tf.random.set_seed(seed_val)
else:
    tf.compat.v1.set_random_seed(seed_val)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM, TimeDistributed, SimpleRNN
from tensorflow.keras.layers import Concatenate, Flatten, Embedding
from tensorflow.keras.layers import GRU, Conv2D, MaxPooling2D, AveragePooling2D, AvgPool2D, MaxPool1D
from tensorflow.keras.layers import Input, Reshape, Dot, Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import regularizers
import tensorflow.keras as keras
import tensorflow.keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *
import gensim
from IPython.display import SVG

In [None]:
def conv_model_multi(n_chars, n_consonant, n_vowels, n_units, char_embed_size=32):
    """Build the convolutional encoder / dual-GRU decoder models.

    The encoder embeds a consonant index sequence and a vowel index sequence,
    concatenates the two embeddings per character, runs a 2-D convolution and
    max-pooling over the resulting (n_chars, 2 * char_embed_size) grid, and
    projects the flattened features through a highway-style gate into a single
    state vector. That vector is the initial state of two independent GRU
    decoders, one emitting consonant distributions and one emitting vowel
    distributions.

    Args:
        n_chars: Length of each input index sequence.
        n_consonant: Number of consonant classes (embedding vocabulary size
            and width of the consonant decoder input/output).
        n_vowels: Number of vowel classes (embedding vocabulary size and
            width of the vowel decoder input/output).
        n_units: Number of GRU units. The encoder state must have the same
            width to be usable as the GRUs' initial state, so every dense
            layer of the encoder head uses this size as well.
        char_embed_size: Width of each character embedding.

    Returns:
        A tuple ``(main_model, encoder_model, decoder_model)``:
        * main_model maps [cons_inputs, vow_inputs, target_consonant,
          vowel_input] to [consonant_output, vowel_output];
        * encoder_model maps [cons_inputs, vow_inputs] to the state vector;
        * decoder_model maps [target_consonant, vowel_input, state] to
          [consonant_output, vowel_output, state].
    """
    cons_inputs = Input(shape=(n_chars, ), dtype="int32", name="cons_inputs")
    vow_inputs = Input(shape=(n_chars, ), dtype="int32", name="vow_inputs")

    # The sequence length is already fixed by the Input shapes, so the
    # Embedding layers do not need `input_length` (deprecated in newer Keras).
    cons_embed = Embedding(n_consonant, char_embed_size)(cons_inputs)
    vow_embed = Embedding(n_vowels, char_embed_size)(vow_inputs)

    # Join the two embeddings along the feature axis: (n_chars, 2 * char_embed_size).
    x = Concatenate(axis=2)([cons_embed, vow_embed])

    # Add a trailing channel axis so the sequence can be convolved like an image.
    x = Reshape([n_chars, char_embed_size*2, 1])(x)
    x = Conv2D(32, (3, 3), padding='same', activation='relu')(x)
    x = MaxPooling2D(3)(x)

    x = Flatten()(x)
    # Use n_units (not a module-level global) for the encoder head so the
    # function is self-contained and the state always fits the GRUs.
    x = Dense(n_units, activation="tanh", name="x")(x)

    # Highway-style gate: T mixes the transformed features H with the input x.
    H = Dense(n_units, activation="tanh", name="H")(x)
    T = Dense(n_units, activation="sigmoid", name="T")(x)

    C = 1 - T
    x = H * T + C * x
    state_h = Dense(n_units, activation="tanh", name="state")(x)

    # Training-time decoders: both GRUs are initialised with the encoder state.
    consonant_decoder_inputs = Input(shape=(None, n_consonant), name="target_consonant")
    consonant_decoder_gru = GRU(n_units, return_sequences=True, return_state=True,  name="consonant_decoder_gru")
    consonant_decoder_outputs, _ = consonant_decoder_gru(consonant_decoder_inputs, initial_state=state_h)

    vowel_decoder_inputs = Input(shape=(None, n_vowels), name="vowel_input")
    vowel_decoder_gru = GRU(n_units, return_sequences=True, return_state=True, name="vowl_decoder_gru")
    vowel_decoder_outputs, _ = vowel_decoder_gru(vowel_decoder_inputs, initial_state=state_h)

    consonant_decoder_dense = Dense(n_consonant, activation='softmax', name="consonant_output")
    consonant_decoder_outputs = consonant_decoder_dense(consonant_decoder_outputs)

    vowel_decoder_dense = Dense(n_vowels, activation='softmax', name="vowel_output")
    vowel_decoder_outputs = vowel_decoder_dense(vowel_decoder_outputs)

    main_model = Model([cons_inputs, vow_inputs, consonant_decoder_inputs, vowel_decoder_inputs], [consonant_decoder_outputs, vowel_decoder_outputs])
    encoder_model = Model([cons_inputs, vow_inputs], state_h)

    # Inference-time decoder: the same GRU and Dense layers (shared weights),
    # driven by an externally supplied state instead of the encoder output.
    decoder_state_input_h = Input(shape=(n_units,))

    consonant_decoder_outputs, _ = consonant_decoder_gru(consonant_decoder_inputs, initial_state=decoder_state_input_h)
    consonant_decoder_outputs = consonant_decoder_dense(consonant_decoder_outputs)

    # NOTE(review): both GRUs start from the same state input and only the
    # vowel GRU's final state is returned; the consonant GRU's final state is
    # discarded. Confirm this is intended for step-wise decoding.
    vowel_decoder_outputs, state_h = vowel_decoder_gru(vowel_decoder_inputs, initial_state=decoder_state_input_h)
    vowel_decoder_outputs = vowel_decoder_dense(vowel_decoder_outputs)

    decoder_model = Model([consonant_decoder_inputs, vowel_decoder_inputs, decoder_state_input_h], [consonant_decoder_outputs, vowel_decoder_outputs, state_h])

    return main_model, encoder_model, decoder_model

def generator(word2int, int2word, char2tup,  batch_size, n_consonant, n_vowels, n_chars=13):
    """Yield training batches for the autoencoder forever.

    Every word in ``word2int`` is used as both input and reconstruction
    target. The target string is the word followed by ``'|'``; the decoder
    input is that target prefixed with ``'&'``.

    Args:
        word2int: Mapping whose keys are the vocabulary words; iteration
            order of the keys defines the order words are served in.
        int2word: Unused; kept so existing callers keep working.
        char2tup: Character lookup forwarded to the encoding helpers.
        batch_size: Number of words per yielded batch.
        n_consonant: Number of consonant classes, forwarded to the helpers.
        n_vowels: Number of vowel classes, forwarded to the helpers.
        n_chars: Sequence length forwarded to the helpers. Defaults to 13,
            the value that used to be hard-coded here.

    Yields:
        ``([cons_input, vow_input, decoder_cons_input, decoder_vow_input],
        [output_cons, output_vow])`` where every element is a NumPy array
        stacking ``batch_size`` encoded words.
        NOTE(review): presumably the first two arrays hold integer indices and
        the rest one-hot rows, judging by the model's Input layers; confirm
        against word2vec_indexed / word2vec_seperated.
    """
    vocab = list(word2int.keys())
    targets = {word: word + '|' for word in vocab}
    target_inputs = {word: '&' + target for word, target in targets.items()}

    current_word = 0
    while True:
        batch_cons_input = []
        batch_vow_input = []
        batch_decoder_cons_input = []
        batch_decoder_vow_input = []
        batch_output_cons = []
        batch_output_vow = []
        for _ in range(batch_size):
            input_word = vocab[current_word]

            input_con, input_vow = word2vec_indexed(char2tup, input_word, n_chars, n_consonant, n_vowels)
            target_con, target_vow = word2vec_seperated(char2tup, targets[input_word], n_chars, n_consonant, n_vowels)
            decoder_con, decoder_vow = word2vec_seperated(char2tup, target_inputs[input_word], n_chars, n_consonant, n_vowels)

            batch_cons_input.append(input_con)
            batch_vow_input.append(input_vow)
            batch_decoder_cons_input.append(decoder_con)
            batch_decoder_vow_input.append(decoder_vow)
            batch_output_cons.append(target_con)
            batch_output_vow.append(target_vow)

            # Wrap around the whole vocabulary. The previous check
            # (`current_word > batch_size * n_batchs`) let the index reach
            # len(vocab) whenever the vocabulary size was a multiple of
            # batch_size, raising IndexError on the next lookup.
            current_word = (current_word + 1) % len(vocab)

        batch_cons_input = np.array(batch_cons_input)
        batch_vow_input = np.array(batch_vow_input)
        batch_decoder_cons_input = np.array(batch_decoder_cons_input)
        batch_decoder_vow_input = np.array(batch_decoder_vow_input)
        batch_output_cons = np.array(batch_output_cons)
        batch_output_vow = np.array(batch_output_vow)

        yield [batch_cons_input, batch_vow_input, batch_decoder_cons_input, batch_decoder_vow_input], [batch_output_cons, batch_output_vow]

def pred_embeddings(vocab, encoder, char2tup):
    """Encode every word of ``vocab`` with ``encoder`` and return the matrix.

    Words are encoded in chunks of up to 10000 to bound memory use. Row ``k``
    of the result is the encoder output for the ``k``-th word of ``vocab``.

    Besides its arguments this function reads the module-level names
    ``embed_size``, ``int2word``, ``word2int``, ``n_chars``, ``n_consonant``
    and ``n_vowel``, so the configuration cell must have been run first.

    Args:
        vocab: Sized iterable of words to embed.
        encoder: Model taking ``[consonant_indices, vowel_indices]`` and
            returning one vector of width ``embed_size`` per word.
        char2tup: Character lookup forwarded to ``word2vec_indexed``.

    Returns:
        NumPy array of shape ``(len(vocab), embed_size)``.
    """
    embeddings = np.empty((len(vocab), embed_size))
    i = 0
    cons_buffer = []
    vow_buffer = []
    buffer_size = 10000
    last_index = len(vocab) - 1
    for wi, word in enumerate(vocab):
        # NOTE(review): round trip through the lookups; presumably maps the
        # word to its canonical vocabulary spelling - confirm it is needed.
        word = int2word[word2int[word]]
        convec, vowvec = word2vec_indexed(char2tup, word, n_chars, n_consonant, n_vowel)
        cons_buffer.append(convec)
        vow_buffer.append(vowvec)
        # Flush on a full buffer or on the final word. The previous condition
        # (`len(vocab) - wi < buffer_size`) was true for every word in the
        # last window, triggering one predict call per word.
        if len(cons_buffer) == buffer_size or wi == last_index:
            cons_buffer_np = np.stack(cons_buffer)
            vow_buffer_np = np.stack(vow_buffer)
            # The encoder was built with inputs [cons_inputs, vow_inputs];
            # consonants must come first (they were previously swapped).
            result = encoder.predict([cons_buffer_np, vow_buffer_np])
            embeddings[i:i+len(cons_buffer)] = result
            i += len(cons_buffer)
            cons_buffer = []
            vow_buffer = []
            if i % (4 * buffer_size) == 0:
                print("Predicting: {0:.2f}%".format((i * 100.0 / len(vocab))))

    print("finished")
    return embeddings

In [None]:
# Load the corpus and derive the vocabulary lookups (helpers come from the
# star imports at the top of the notebook).
words = read_file()
vocab, word2int, int2word = build_vocab(words)
# The raw word list is not referenced again; release it.
del words
# Character tables plus the consonant / vowel class counts used to size the model.
char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()
# NOTE(review): presumably 11 is the maximum word length and the +2 leaves room
# for the '&' / '|' markers added in generator() - confirm.
n_chars = 11 + 2 
n_features = len(char2int)

batch_size = 100
# Width of the learned word embedding; also passed to conv_model_multi as n_units.
embed_size = 100

# Number of full batches in the vocabulary (any remainder is not counted).
n_batches = len(vocab)  // batch_size


In [None]:
# Build the training model and the encoder / decoder models;
# embed_size is passed as the n_units argument.
model, encoder, decoder = conv_model_multi(n_chars, n_consonant, n_vowel, embed_size)
adam = keras.optimizers.Adam(.001)
# One loss and one metric are specified for a model with two outputs
# (consonant_output and vowel_output).
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
# Endless batch generator consumed by the training cell below.
gen = generator(word2int, int2word, char2tup, batch_size, n_consonant, n_vowel)
encoder.summary()

In [None]:
# `Model.fit_generator` is deprecated; `Model.fit` accepts Python generators
# directly. The generator never ends, so steps_per_epoch bounds each epoch.
history = model.fit(gen, steps_per_epoch=n_batches, epochs=15)

In [None]:
# Encode every vocabulary word with the trained encoder; one row per word of `vocab`.
embeddings = pred_embeddings(vocab, encoder, char2tup)

In [None]:
# Score the embeddings. The un-normalised variant is kept for comparison:
# print(evaluate(word2int, embeddings) )
# NOTE(review): `evaluate` and `normalize` come from the star imports; what
# they compute is not visible in this notebook - confirm before relying on it.
print(evaluate(word2int, normalize(embeddings)))

In [None]:
# file = open("results/embed_embed10.txt", encoding='utf8', mode='w')
# file.write("{0} {1}\n".format(len(vocab), embed_size))
# for word, index in word2int.items():
#     e = embeddings[index]
#     e = ' '.join(map(lambda x: str(x), e))
#     file.write("{0} {1}\n".format(word, e))
# file.close()