In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten
from keras.layers import GRU, Conv2D, MaxPooling2D, Embedding
from keras.layers import Input, Reshape, Dot, Add
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
# from keras.utils.vis_utils import plot_model
import keras
import keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *
import gensim
import random
import numpy as np
import tensorflow as tf
# Seed Python's, NumPy's and TensorFlow's random number generators with the
# same value.
seed_val = 1000
random.seed(seed_val)
np.random.seed(seed_val)
# NOTE(review): `tf.set_random_seed` is the TensorFlow 1.x spelling; confirm
# the installed TensorFlow version still exposes it.
tf.set_random_seed(seed_val)

In [3]:
def conv_model_multi(vocab_size, n_seq, embed_size, n_consonant, n_vowels, n_units):
    """Build a GRU encoder with two GRU decoders (consonants and vowels).

    A sequence of ``n_seq`` integer ids is embedded and fed to an encoder GRU.
    The encoder's final hidden state is used as the initial state of two
    independent decoder GRUs, one followed by a softmax over ``n_consonant``
    classes and one followed by a softmax over ``n_vowels`` classes.
    (Despite the function name, no convolutional layers are used.)

    Args:
        vocab_size: Number of rows in the embedding table.
        n_seq: Length of the integer input sequence.
        embed_size: Dimensionality of the embedding vectors.
        n_consonant: Feature width of the consonant decoder input and the
            size of the consonant softmax output.
        n_vowels: Feature width of the vowel decoder input and the size of
            the vowel softmax output.
        n_units: Number of hidden units in every GRU.

    Returns:
        A tuple ``(main_model, encoder_model, decoder_model)``:

        * ``main_model`` maps ``[root_word_input, consonant_decoder_inputs,
          vowel_decoder_inputs]`` to ``[consonant_softmax, vowel_softmax]``.
        * ``encoder_model`` maps ``root_word_input`` to the encoder state.
        * ``decoder_model`` maps ``[consonant_decoder_inputs,
          vowel_decoder_inputs, state]`` to ``[consonant_softmax,
          vowel_softmax, vowel_gru_state]`` and shares its layers with
          ``main_model``.
    """
    # --- Encoder -----------------------------------------------------------
    # The input length comes from the `n_seq` argument (previously this read a
    # notebook-level `seq_len` global, which made the function depend on cell
    # execution order).
    root_word_input = Input(shape=(n_seq,), dtype='int32', name="root_word_input")
    embedded = Embedding(vocab_size, embed_size, input_length=n_seq)(root_word_input)
    _, encoder_state = GRU(n_units, return_sequences=True, return_state=True, activation='relu')(embedded)

    # --- Decoders used for training -----------------------------------------
    consonant_decoder_inputs = Input(shape=(None, n_consonant), name="target_consonant")
    consonant_decoder_gru = GRU(n_units, return_sequences=True, return_state=True, name="consonant_decoder_gru")
    consonant_seq, _ = consonant_decoder_gru(consonant_decoder_inputs, initial_state=encoder_state)

    vowel_decoder_inputs = Input(shape=(None, n_vowels), name="vowel_input")
    vowel_decoder_gru = GRU(n_units, return_sequences=True, return_state=True, name="vowl_decoder_gru")
    vowel_seq, _ = vowel_decoder_gru(vowel_decoder_inputs, initial_state=encoder_state)

    consonant_decoder_dense = Dense(n_consonant, activation='softmax', name="consonant_output")
    consonant_probs = consonant_decoder_dense(consonant_seq)

    vowel_decoder_dense = Dense(n_vowels, activation='softmax', name="vowel_output")
    vowel_probs = vowel_decoder_dense(vowel_seq)

    main_model = Model(
        [root_word_input, consonant_decoder_inputs, vowel_decoder_inputs],
        [consonant_probs, vowel_probs],
    )

    # --- Stand-alone encoder / decoder models (reuse the trained layers) -----
    encoder_model = Model(root_word_input, encoder_state)

    decoder_state_input_h = Input(shape=(n_units,))

    step_consonant_seq, _ = consonant_decoder_gru(consonant_decoder_inputs, initial_state=decoder_state_input_h)
    step_consonant_probs = consonant_decoder_dense(step_consonant_seq)

    # NOTE(review): only the vowel GRU's state is exposed by `decoder_model`;
    # the consonant GRU's state is discarded. This matches the original
    # behaviour and keeps the model's output list unchanged for callers.
    step_vowel_seq, vowel_state = vowel_decoder_gru(vowel_decoder_inputs, initial_state=decoder_state_input_h)
    step_vowel_probs = vowel_decoder_dense(step_vowel_seq)

    decoder_model = Model(
        [consonant_decoder_inputs, vowel_decoder_inputs, decoder_state_input_h],
        [step_consonant_probs, step_vowel_probs, vowel_state],
    )

    return main_model, encoder_model, decoder_model

In [4]:
def generateSG(data, word2int, char2tup, skip_window, batch_size, n_consonant, n_vowels, n_chars=13):
    """Yield skip-gram style training batches forever.

    For every centre position in ``data`` the window of ``2 * skip_window + 1``
    word ids around it (centre included) is taken; each word in the window is
    paired with the centre word. For each pair the window word is encoded as
    the encoder input and the centre word as the decoder input/target, using
    ``word2vec_seperated``.

    Args:
        data: Sliceable sequence of integer word ids.
        word2int: Mapping from word to integer id; its inverse is used to
            turn ids in ``data`` back into words.
        char2tup: Character table forwarded to ``word2vec_seperated``.
        skip_window: Number of words taken on each side of the centre word.
        batch_size: Number of samples per batch; must be a multiple of
            ``2 * skip_window + 1``.
        n_consonant: Forwarded to ``word2vec_seperated``; also the consonant
            part of the input feature width.
        n_vowels: Forwarded to ``word2vec_seperated``; also the vowel part of
            the input feature width.
        n_chars: Fixed per-word character length passed to
            ``word2vec_seperated`` (defaults to 13, the value that used to be
            hard-coded).

    Yields:
        ``[batch_input, batch_decoder_cons_input, batch_decoder_vow_input],
        [batch_output_cons, batch_output_vow]`` where every element is a
        Python list with ``batch_size`` entries.
        NOTE(review): the batches are plain lists, as before; confirm whether
        the consumer needs them stacked into numpy arrays.
    """
    win_size = skip_window
    pairs_per_center = 2 * win_size + 1
    # Every centre word contributes `pairs_per_center` samples and the batch
    # loop below advances in steps of the same size, so this is the
    # divisibility that makes a batch come out exactly `batch_size` long.
    assert batch_size % pairs_per_center == 0

    # Invert the vocabulary locally rather than depending on a notebook
    # global named `int2word`.
    int2word = {index: word for word, index in word2int.items()}

    # Pre-compute the decoder target / decoder input string for every word.
    # Presumably '&' is a start marker and '|' an end marker — TODO confirm.
    # NOTE(review): the decoder input ends with two '|' characters because
    # `target` already carries one; kept as in the original.
    targets, target_inputs = {}, {}
    for word in word2int:
        target = word + '|'
        targets[word] = target
        target_inputs[word] = '&' + target + '|'

    i = win_size  # index of the current centre word; persists across batches
    while True:
        batch_input = []
        batch_decoder_cons_input = []
        batch_decoder_vow_input = []
        batch_output_cons = []
        batch_output_vow = []
        for _ in range(0, batch_size, pairs_per_center):
            context = data[i - win_size: i + win_size + 1]
            output_word = int2word[context[win_size]]

            for word_id in context:
                input_word = int2word[word_id]
                input_con, input_vow = word2vec_seperated(char2tup, input_word, n_chars, n_consonant, n_vowels)
                target_con, target_vow = word2vec_seperated(char2tup, targets[output_word], n_chars, n_consonant, n_vowels)
                decoder_con, decoder_vow = word2vec_seperated(char2tup, target_inputs[output_word], n_chars, n_consonant, n_vowels)
                # Consonant and vowel features side by side, plus a trailing
                # axis of size 1.
                input_vec = np.concatenate([input_con, input_vow], axis=1).reshape((n_chars, (n_consonant + n_vowels), 1))

                batch_input.append(input_vec)
                batch_decoder_cons_input.append(decoder_con)
                batch_decoder_vow_input.append(decoder_vow)
                batch_output_cons.append(target_con)
                batch_output_vow.append(target_vow)

            i += 1
            # Wrap around once a full window no longer fits before the end.
            if i + win_size + 1 > len(data):
                i = win_size
        yield [batch_input, batch_decoder_cons_input, batch_decoder_vow_input], [batch_output_cons, batch_output_vow]


In [None]:
# Read the corpus and build the word-level vocabulary tables.
words = read_file(filename='data/news.txt')
vocab, word2int, int2word = build_vocab(words)
# Computed once; the original cell repeated this identical call a second time.
# NOTE(review): assumes `get_frequency` has no side effects — confirm.
word2freq = get_frequency(words, word2int, int2word)

int_words = words_to_ints(word2int, words)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()
ns_unigrams = ns_sample(word2freq, word2int, int2word, .75)

# Settings shared by the model-building and training cells.
n_chars = 11 + 2
n_features = len(char2int)
batch_size = 500
embed_size = 100
skip_window = 2
seq_len = 5
rnn_n_units = 64
vocab_size = len(vocab)
n_batches = len(words) // batch_size

In [None]:
# Build the training model together with the stand-alone encoder and decoder
# models.
main_model, encoder_model, decoder_model = conv_model_multi(
    vocab_size,
    seq_len,
    embed_size,
    n_consonant,
    n_vowel,
    rnn_n_units,
)
# Despite the variable name, the optimizer created here is Nadam.
adam = keras.optimizers.Nadam(0.001)
main_model.compile(
    optimizer=adam,
    loss="categorical_crossentropy",
    metrics=['acc'],
)
# main_model.summary()

In [None]:
# Batch generator for training. The character length and vowel width are taken
# from the notebook's config variables (`n_chars`, `n_vowel`) instead of the
# literals 13 and 10, so the generator stays consistent with the model, which
# is built with `n_vowel`.
gen = generate_for_char_langauge(
    words,
    int_words,
    int2word,
    char2tup,
    batch_size=batch_size,
    n_chars=n_chars,
    n_consonant=n_consonant,
    n_vowels=n_vowel,
    seq_length=seq_len,
)

In [None]:

# Train for two epochs, drawing `n_batches` batches from `gen` in each epoch.
history = main_model.fit_generator(
    gen,
    steps_per_epoch=n_batches,
    epochs=2,
)