In [1]:
import random
seed_val = 1000
random.seed(seed_val)
import numpy as np
np.random.seed(seed_val)
import tensorflow as tf
# tf.set_random_seed(seed_val)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM, TimeDistributed, SimpleRNN, ReLU, BatchNormalization
from tensorflow.keras.layers import Concatenate, Flatten, Embedding
from tensorflow.keras.layers import GRU, Conv2D, MaxPooling2D, AveragePooling2D, AvgPool2D, MaxPool1D, TimeDistributed
from tensorflow.keras.layers import Input, Reshape, Dot, Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import regularizers
# from tensorflow.keras.utils.vis_utils import plot_model
import tensorflow.keras as keras
import tensorflow.keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *
import gensim
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

Using TensorFlow backend.


In [2]:
def conv_model_multi(n_chars, n_features, n_units, char_emb_size=32):
    root_word_input = Input(shape=(n_chars,), name="word_input")
    embedding = Embedding(input_dim=n_features, input_length=n_chars, output_dim=char_emb_size)(root_word_input)
    x = Reshape([n_chars, char_emb_size, 1])(embedding)
    
    x = Conv2D(32, (3, 3), padding='same', activation='linear')(x)
    x = MaxPooling2D(2, 2)(x)
    x = ReLU()(x)
    x = BatchNormalization()(x)
    
#     x = Conv2D(16, (3, 3), padding='same', activation='linear')(x)
#     x = MaxPooling2D(2, 2)(x)
#     x = ReLU()(x)
#     x = BatchNormalization()(x)

    x = Flatten()(x)
    print(x)
    state_h = Dense(n_units, activation='linear', name="state_h")(x)

    
    consonant_decoder_inputs = Input(shape=(None, n_features), name="target_consonant")
    consonant_decoder_gru = GRU(n_units, return_sequences=True, return_state=True,  name="consonant_decoder_gru")
    consonant_decoder_outputs, _= consonant_decoder_gru(consonant_decoder_inputs, initial_state=state_h)

#     vowel_decoder_inputs = Input(shape=(None, n_vowels), name="vowel_input")
#     vowel_decoder_gru = GRU(n_units, return_sequences=True, return_state=True, name="vowl_decoder_gru")
#     vowel_decoder_outputs, _= vowel_decoder_gru(vowel_decoder_inputs, initial_state=state_h)

    consonant_decoder_dense = Dense(n_features, activation='softmax', name="consonant_output")
    consonant_decoder_outputs = consonant_decoder_dense(consonant_decoder_outputs)
    
#     vowel_decoder_dense = Dense(n_vowels, activation='softmax', name="vowel_output")
#     vowel_decoder_outputs = vowel_decoder_dense(vowel_decoder_outputs)
    
    main_model = Model([root_word_input, consonant_decoder_inputs], consonant_decoder_outputs)
    
    encoder_model = Model(root_word_input, state_h)
    
    decoder_state_input_h = Input(shape=(n_units,))
    
    consonant_decoder_outputs, state_h= consonant_decoder_gru(consonant_decoder_inputs, initial_state=decoder_state_input_h)
    consonant_decoder_outputs = consonant_decoder_dense(consonant_decoder_outputs)
    
#     vowel_decoder_outputs, state_h= vowel_decoder_gru(vowel_decoder_inputs, initial_state=decoder_state_input_h)
#     vowel_decoder_outputs = vowel_decoder_dense(vowel_decoder_outputs)
    
    decoder_model = Model([consonant_decoder_inputs, decoder_state_input_h], [consonant_decoder_outputs, state_h])

    return main_model, encoder_model, decoder_model


In [3]:
def generate(word2int, batch_size):
    words = list(word2int.keys())
    targets, target_inputs = {}, {}
    for word in words:
        target = word + '|'
        target_input = '&' + target 
        targets[word] = target
        target_inputs[word] = target_input
        
    current = 0
    indexes = np.arange(len(targets))
    n_batchs = len(words) // batch_size
    n_chars = 13
    
    while True:
        batch_input = []
        batch_decoder_cons_input = []
        batch_output_cons = []
        
        for bi in range(0, batch_size):
            input_word = words[indexes[current]]
            input_map = word2vec_sparse(char2int, input_word, n_chars)
#             print(input_map)
            target_map = word2vec(char2tup, targets[input_word], n_chars)
            decoder_map = word2vec(char2tup, target_inputs[input_word], n_chars)

            batch_input.append(input_map)
            batch_decoder_cons_input.append(decoder_map)
            batch_output_cons.append(target_map)
            
            current += 1
        if current + batch_size > len(vocab):
            current = 0
            random.shuffle(indexes)
            
        batch_input = np.array(batch_input)
        batch_decoder_cons_input = np.array(batch_decoder_cons_input)
        batch_output_cons = np.array(batch_output_cons)

        yield [batch_input, batch_decoder_cons_input], batch_output_cons


In [4]:
words = read_file()
vocab, word2int, int2word = build_vocab(words)
word2freq = get_frequency(words, word2int, int2word)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()

In [5]:
n_chars = 11 + 2 
n_features = len(char2int)
batch_size = 300
embed_size = 100
n_batches = len(vocab)  // batch_size

In [6]:
try:
    del multi_train
    del multi_enc
    del multi_dec
    keras.backend.clear_session()
    gc.collect()
except:
    pass
multi_train, multi_enc, multi_dec = conv_model_multi(n_chars, len(char2int), n_units=embed_size)
adam = keras.optimizers.Adam(.001)
multi_train.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
multi_gen = generate(word2int,  batch_size)
# words, char2int, char2tup, batch_size, n_consonant, n_vowels
# plot_model(multi_train, show_shapes=True, show_layer_names=True)
# multi_train.summary()

Tensor("flatten/Reshape:0", shape=(None, 3072), dtype=float32)


In [7]:
# gensg = generate( word2int,  batch_size)
# [x1, x2], y = next(gensg)

In [8]:
# print(x1[0])

In [9]:
# SVG(model_to_dot(multi_train, show_shapes=True).create(prog='dot', format='svg'))

In [10]:
history = multi_train.fit_generator(multi_gen, steps_per_epoch=n_batches, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
def pred_embeddings(vocab, encoder, char2int):
    embeddings = np.ndarray((len(vocab), embed_size))
    i = 0
    buffer = []
    buffer_size = 10000
    for wi, word in enumerate(vocab):
        word = int2word[word2int[word]]
        mat = word2vec_sparse(char2int, word, n_chars)
#         mat = convec.reshape((-1, n_chars, len(char2int), 1))
#         mat = np.concatenate([convec, vowvec], axis=2)
        buffer.append(mat)
        if len(buffer) == buffer_size or len(vocab) - wi < buffer_size:
            buffer_np = np.stack(buffer)#.reshape((-1, 13, len(char2int)))
            result = encoder.predict(buffer_np)
            embeddings[i:i+len(buffer)] = result
            i += len(buffer)
            buffer = []
            if i % (4 * buffer_size) == 0:
                print("Predicting: {0:.2f}%".format((i * 100.0 / len(vocab))))
                
    print("finished")
    return embeddings

In [12]:
# del int2word[0]
# i = vocab.index('<unk>')
# del vocab[i]

embeddings = pred_embeddings(vocab, multi_enc, char2int)

Predicting: 13.43%
Predicting: 26.86%
Predicting: 40.29%
Predicting: 53.72%
Predicting: 67.15%
Predicting: 80.58%
Predicting: 94.01%
finished


In [13]:
evaluate(word2int, embeddings) 
evaluate(word2int, normalize(embeddings))

W0630 23:55:42.284978 11376 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
W0630 23:55:42.564231 11376 smart_open_lib.py:379] this function is deprecated, use smart_open.open instead
W0630 23:55:55.093498 11376 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
W0630 23:55:55.388223 11376 smart_open_lib.py:379] this function is deprecated, use smart_open.open instead


{'syntactic': 2.5700934579439254, 'semantic': 0.0, 'total': 1.5736766809728182}

In [14]:
# file = open("results/text.txt", encoding='utf8', mode='w')
# file.write("{0} {1}\n".format(len(vocab), embed_size))
# for word, index in word2int.items():
#     e = embeddings[index]
#     e = ' '.join(map(lambda x: str(x), e))
#     file.write("{0} {1}\n".format(word, e))
# file.close()