In [1]:
import random
seed_val = 1000
random.seed(seed_val)
import numpy as np
np.random.seed(seed_val)
import tensorflow as tf
tf.set_random_seed(seed_val)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten
from keras.layers import GRU, Conv2D, MaxPooling2D, AveragePooling2D, AvgPool2D, MaxPool1D
from keras.layers import Input, Reshape, Dot, Add
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras import regularizers
from keras.utils.vis_utils import plot_model
import keras
import keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *
import gensim
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def conv_model_multi(n_chars, n_consonant, n_vowels, n_units):
    root_word_input = Input(shape=(n_chars, n_consonant + n_vowels, 1), name="root_word_input")
    
    x = Conv2D(10, (5, 5), padding='same', activation='relu')(root_word_input)
    x = MaxPool1D(3, 3)(x)
#     x = Conv2D(8, (3, 3), padding='same', activation='relu')(x)
#     x = AveragePooling2D(3, 3)(x)
    
    x = Flatten()(x)
#     x = MaxPool1D(3)(x)
    state_h = Dense(n_units, activation='linear')(x)
    
    consonant_decoder_inputs = Input(shape=(None, n_consonant), name="target_consonant")
    consonant_decoder_gru = GRU(n_units, return_sequences=True, return_state=True,  name="consonant_decoder_gru")
    consonant_decoder_outputs, _= consonant_decoder_gru(consonant_decoder_inputs, initial_state=state_h)
    
    vowel_decoder_inputs = Input(shape=(None, n_vowels), name="vowel_input")
    vowel_decoder_gru = GRU(n_units, return_sequences=True, return_state=True, name="vowl_decoder_gru")
    vowel_decoder_outputs, _= vowel_decoder_gru(vowel_decoder_inputs, initial_state=state_h)

    consonant_decoder_dense = Dense(n_consonant, activation='softmax', name="consonant_output")
    consonant_decoder_outputs = consonant_decoder_dense(consonant_decoder_outputs)
    
    vowel_decoder_dense = Dense(n_vowels, activation='softmax', name="vowel_output")
    vowel_decoder_outputs = vowel_decoder_dense(vowel_decoder_outputs)
    
    main_model = Model([root_word_input, consonant_decoder_inputs, vowel_decoder_inputs], [consonant_decoder_outputs, vowel_decoder_outputs])
    
    encoder_model = Model(root_word_input, state_h)
    
    decoder_state_input_h = Input(shape=(n_units,))
    
    consonant_decoder_outputs, state_h= consonant_decoder_gru(consonant_decoder_inputs, initial_state=decoder_state_input_h)
    consonant_decoder_outputs = consonant_decoder_dense(consonant_decoder_outputs)
    
    vowel_decoder_outputs, state_h= vowel_decoder_gru(vowel_decoder_inputs, initial_state=decoder_state_input_h)
    vowel_decoder_outputs = vowel_decoder_dense(vowel_decoder_outputs)
    
    decoder_model = Model([consonant_decoder_inputs, vowel_decoder_inputs, decoder_state_input_h], [consonant_decoder_outputs, vowel_decoder_outputs, state_h])

    return main_model, encoder_model, decoder_model

def one_hot_sep(con, con_max, vow, vow_max):
    con_vec = np.zeros((con_max, ))
    con_vec[con] = 1
    vow_vec = np.zeros((vow_max, ))
    vow_vec[vow] = 1
    return con_vec, vow_vec
    
def decode_multi_sequence(model, char2tup, tup2char, state, n_consonant, n_vowels):
    con, vow = char2tup['&']
    con_vec, vow_vec = one_hot_sep(con, n_consonant, vow, n_vowels) 
    con_vec = con_vec.reshape((1, 1, -1))
    vow_vec = vow_vec.reshape((1, 1, -1))
#     target_seq = np.concatenate([con_vec, vow_vec])
    decoded_chars = []
    for i in range(13):
        con_vec, vow_vec, state = model.predict([con_vec, vow_vec, state])
#         target_seq = np.concatenate([con_vec, vow_vec])
        new_con_vec = np.zeros_like(con_vec)
        new_con_vec[0, 0, np.argmax(con_vec[0, 0, :])] = 1
        new_vow_vec = np.zeros_like(vow_vec)
        new_vow_vec[0, 0, np.argmax(vow_vec[0, 0, :])] = 1
        con_vec, vow_vec = new_con_vec, new_vow_vec
        name = "{0}-{1}".format(np.argmax(con_vec[0, 0, :]), np.argmax(vow_vec[0, 0, :]))
        try:
            char = tup2char[name]
        except:
            char = ' '
        decoded_chars += [char]
    
    return decoded_chars

def pred_embeddings_multi(vocab, encoder, char2tup):
    embeddings = np.ndarray((len(vocab), embed_size))
    i = 0
    buffer = []
    buffer_size = 10000
    for wi, word in enumerate(vocab):
        word = int2word[word2int[word]]
        convec, vowvec = word2vec_seperated(char2tup, word, n_chars, n_consonant, n_vowel)
        convec = convec.reshape((-1, n_chars, n_consonant, 1))
        vowvec = vowvec.reshape((-1, n_chars, n_vowel, 1))
        mat = np.concatenate([convec, vowvec], axis=2)
        buffer.append(mat)
        if len(buffer) == buffer_size or len(vocab) - wi < buffer_size:
            buffer_np = np.stack(buffer).reshape((-1, 13, 50, 1))
            result = encoder.predict(buffer_np)
            embeddings[i:i+len(buffer)] = result
            i += len(buffer)
            buffer = []
            if i % (4 * buffer_size) == 0:
                print("Predicting: {0:.2f}%".format((i * 100.0 / len(vocab))))
                
    print("finished")
    return embeddings

def generateSG(data, word2int,int2word, char2tup, skip_window, batch_size, n_consonant, n_vowels):
    win_size = skip_window  
    i = win_size
    assert batch_size % (win_size) == 0
    targets, target_inputs = {}, {}
    for word in word2int.keys():
        target = word + '|'
        target_input = '&' + target 
        targets[word] = target
        target_inputs[word] = target_input
        
    batch = 0
    n_batchs = len(words) // batch_size
    n_chars = 13
    while True:
        batch_input = []
        batch_decoder_cons_input = []
        batch_decoder_vow_input = []
        batch_output_cons = []
        batch_output_vow = []
        for bi in range(0, batch_size, skip_window * 2 + 1):
            context = data[i - win_size: i + win_size + 1]
            target = [context.pop(win_size)] * (win_size * 2)
#             print(context, target)
            for input_word, output_word in zip(context, target):
#                 print(target_inputs[output_word])
                input_con, input_vow = word2vec_seperated(char2tup, input_word, n_chars, n_consonant, n_vowels)
                target_con, target_vow = word2vec_seperated(char2tup,targets[output_word], n_chars, n_consonant, n_vowels)
                decoder_con, decoder_vow = word2vec_seperated(char2tup,target_inputs[output_word], n_chars, n_consonant, n_vowels)
                input_vec = np.concatenate([input_con, input_vow], axis=1).reshape((n_chars, (n_consonant + n_vowels), 1))
                
                batch_input.append(input_vec)
                batch_decoder_cons_input.append(decoder_con)
                batch_decoder_vow_input.append(decoder_vow)
                batch_output_cons.append(target_con)
                batch_output_vow.append(target_vow)

            i += 1
            if i + win_size + 1 > len(data):
                i = win_size
        batch_input = np.array(batch_input)
        batch_decoder_cons_input = np.array(batch_decoder_cons_input)
        batch_decoder_vow_input = np.array(batch_decoder_vow_input)
        batch_output_cons = np.array(batch_output_cons)
        batch_output_vow = np.array(batch_output_vow)
#         print(batch_input.shape)
        yield [batch_input, batch_decoder_cons_input, batch_decoder_vow_input], [batch_output_cons, batch_output_vow]


In [3]:
words = read_file()
vocab, word2int, int2word = build_vocab(words)
int_words = words_to_ints(word2int, words)
word2freq = get_frequency(words, word2int, int2word)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()
ns_unigrams = ns_sample(word2freq, word2int, int2word, .75)
n_chars = 11 + 2 
n_features = len(char2int)
batch_size = 300
embed_size = 100
skip_window = 1

# words = read_file()
# unkown_word = "<unk>"
# # words = [unkown_word] + words
# xvocab, xword2int, xint2word = build_vocab(words)

# words, word2freq = min_count_threshold(words)

# vocab, word2int, int2word = build_vocab(words)
# print(len(vocab))
# # word2freq = get_frequency(words, word2int, int2word)
# unigrams = [word2freq[int2word[i]] for i in range(len(word2int))]

# char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()
# ns_unigrams = ns_sample(word2freq, word2int, int2word, .75)
# del words[0]
# del word2int['<unk>']
# n_chars = 11 + 2 
# n_features = len(char2int)
# batch_size = 500
# embed_size = 100
# skip_window = 1

n_batches = len(vocab)  // batch_size


In [4]:
# gensg = generateSG(words, word2int, int2word, char2tup, skip_window, batch_size, n_consonant, n_vowel)
# [x1, x2, x3], [y1, y2] = next(gensg)

In [5]:
try:
    del multi_train
    del multi_enc
    del multi_dec
    keras.backend.clear_session()
    gc.collect()
except:
    pass
multi_train, multi_enc, multi_dec = conv_model_multi(n_chars, n_consonant, n_vowel, embed_size)
adam = keras.optimizers.Nadam(.001)
multi_train.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
multi_gen = generate_word_images_multi(vocab, char2tup, batch_size, n_consonant, n_vowel)
# words, char2int, char2tup, batch_size, n_consonant, n_vowels
# plot_model(multi_train)
# multi_train.summary()

ValueError: Input 0 is incompatible with layer max_pooling1d_1: expected ndim=3, found ndim=4

In [None]:
# SVG(model_to_dot(multi_train, show_shapes=True).create(prog='dot', format='svg'))

In [None]:
history = multi_train.fit_generator(multi_gen, steps_per_epoch=n_batches, epochs = 3)

In [None]:
# del int2word[0]
# i = vocab.index('<unk>')
# del vocab[i]

embeddings = pred_embeddings_multi(vocab, multi_enc, char2tup)

In [None]:
# evaluate(embed_size=cew.shape[1],final_embedding=normalize(cew), word2int=word2int) 
# evaluate(embed_size=cew.shape[1],final_embedding=cew, word2int=word2int)

In [None]:
np.save("results/seq_k", embeddings)
# embeddings = np.vstack([np.zeros((1, embed_size)), embeddings])
# print(embeddings.shape, len(vocab))
# file = open("results/c_v.txt", encoding='utf8', mode='w')
# file.write("{0} {1}\n".format(len(vocab), embed_size))
# for word, index in word2int.items():
#     e = embeddings[index]
#     e = ' '.join(map(lambda x: str(x), e))
#     file.write("{0} {1}\n".format(word, e))
# file.close()
# print(evaluate(word2int, normalize(embeddings)))
