In [1]:
import random
seed_val = 1000
random.seed(seed_val)
import numpy as np
np.random.seed(seed_val)
import tensorflow as tf
tf.set_random_seed(seed_val)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, TimeDistributed
from keras.layers import Concatenate, Flatten, Lambda
from keras.layers import GRU, Conv2D, MaxPooling2D
from keras.layers import Input, Reshape, Dot, Add
from keras.models import Model
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras import regularizers
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint
import keras
import keras.backend as K
from data_handle import *
from gensim_wrapper import *
from utils import *
import gensim
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def my_loss(y_true, y_pred):
    return y_pred

def neg_sample_loss(x):
    posZ, negZ = x
    posZ = K.log(K.sigmoid(posZ))
    negZ = K.log(K.sigmoid(-negZ))
    
    loss = -posZ - negZ
    return loss

def conv_model_multi(n_chars, n_consonant, n_vowels, n_units):
    target_word = Input(shape=(n_chars, (n_consonant + n_vowels), 1), name="target_word")
    context_word = Input(shape=(n_chars, (n_consonant + n_vowels), 1), name="context_word")
    negative_word = Input(shape=(n_chars, (n_consonant + n_vowels), 1), name="negative_word")
    
    input_word = Input(shape=(n_chars, (n_consonant + n_vowels), 1), name="input_word")
    t = Conv2D(10, (5, 5), padding='same', activation='relu')(input_word)
    t = MaxPooling2D(3, 3)(t)
    t = Flatten()(t)
    t = Dense(n_units, activation='linear')(t)
    target_model = Model(input_word, t, name="target_model")
    
    out_word = Input(shape=(n_chars, (n_consonant + n_vowels), 1), name="out_word")
    c = Conv2D(10, (5, 5), padding='same', activation='relu')(out_word)
    c = MaxPooling2D(3, 3)(c)
    c = Flatten()(c)
    c = Dense(n_units, activation='linear')(c)
    context_model = Model(out_word, c, name="context_model")
    
    target = target_model(target_word)
    context = context_model(context_word)
    negative = context_model(negative_word)
    posZ = Dot(axes=1, name="tar_dot_con")([target, context])
    negZ = Dot(axes=1, name="tar_dot_neg")([target, negative])
    
    loss_layer = Lambda(neg_sample_loss, name="neg_loss")([posZ, negZ])
    
    model = Model([target_word, context_word, negative_word], loss_layer)
    return model, target_model

def embedding_pred(vocab, encoder, char2tup):
    embeddings = np.ndarray((len(vocab), embed_size))
    i = 0
    buffer = []
    buffer_size = 10000
    for wi, word in enumerate(vocab):
        word = int2word[word2int[word]]
        convec, vowvec = word2vec_seperated(char2tup, word, n_chars, n_consonant, n_vowel)
        convec = convec.reshape((-1, n_chars, n_consonant, 1))
        vowvec = vowvec.reshape((-1, n_chars, n_vowel, 1))
        mat = np.concatenate([convec, vowvec], axis=2)
        buffer.append(mat)
        if len(buffer) == buffer_size or len(vocab) - wi < buffer_size:
            buffer_np = np.stack(buffer).reshape((-1, 13, 50, 1))
            result = encoder.predict(buffer_np)
            embeddings[i:i+len(buffer)] = result
            i += len(buffer)
            buffer = []
            if i % (4 * buffer_size) == 0:
                print("Predicting: {0:.2f}%".format((i * 100.0 / len(vocab))))
                
    print("finished")
    return embeddings

In [3]:

words = read_file()
words, word2freq = min_count_threshold(words)
# words = subsampling(words, 1e-3)
vocab, word2int, int2word = build_vocab(words)
char2int, int2char, char2tup, tup2char, n_consonant, n_vowel = build_charset()
print("Words to train: ", len(words))
print("Vocabs to train: ", len(vocab))
# print("Unk count: ", word2freq['<unk>'])
int_words = words_to_ints(word2int, words)
int_words = np.array(int_words, dtype=np.int32)
n_chars = 11 + 2
batch_size = 100
embed_size = 75
skip_window = 2
n_features = len(char2int)
ns_unigrams = list(np.array(
    ns_sample(word2freq, word2int, int2word, .75), dtype=np.int32))

n_batches = len(int_words) * skip_window * 2  // batch_size


Words to train:  3730679
Vocabs to train:  58383


In [4]:
try:
    del train_model
    keras.backend.clear_session()
    gc.collect()
except:
    pass
train_model, target_model = conv_model_multi(n_chars, n_consonant, n_vowel, embed_size)
adam = keras.optimizers.Nadam(.0005)
train_model.compile(optimizer=adam, loss=my_loss)
gen = gen_imag_neg(list(int_words), skip_window, batch_size,
               int2word, char2tup,ns_unigrams, n_chars, n_consonant, n_vowel)
# train_model.summary()

In [5]:
filepath = "em/weight-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1)
callbacks = [checkpoint]
history = train_model.fit_generator(gen, steps_per_epoch=n_batches, epochs = 10, callbacks=callbacks)

Epoch 1/10

Epoch 00001: saving model to em/weight-01-1.3053.hdf5
Epoch 2/10

Epoch 00002: saving model to em/weight-02-1.2948.hdf5
Epoch 3/10

Epoch 00003: saving model to em/weight-03-1.2921.hdf5
Epoch 4/10

Epoch 00004: saving model to em/weight-04-1.2906.hdf5
Epoch 5/10

Epoch 00005: saving model to em/weight-05-1.2899.hdf5
Epoch 6/10

KeyboardInterrupt: 

In [9]:
embeddings = embedding_pred(vocab, target_model, char2tup)

Predicting: 68.51%
finished


In [8]:
evaluate(word2int, embeddings, embed_size=75) 
evaluate(word2int, normalize(embeddings), embed_size=75)

  if np.issubdtype(vec.dtype, np.int):


{'syntactic': 0.7009345794392523,
 'semantic': 0.7380073800738007,
 'total': 0.7153075822603719,
 'pick-one-out': 59.63302752293578}