In [1]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import time

import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim import fully_connected as fc
#from sklearn.manifold import TSNE

from gensim.models import FastText

  from ._conv import register_converters as _register_converters


## Pickle Functions

In [3]:
def save_obj(obj, name):
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

def exist(name):
    return os.path.exists('obj/' + name + '.pkl')

## Embedding

In [None]:
corpus = load_obj('corpus')

In [None]:
bigram_mod = load_obj('bigram')
trigram_mod = load_obj('trigram')

In [2]:
embedding = FastText.load('embedding_reddit/embedding')

In [14]:
embedding.wv.similar_by_vector(embedding['murder'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('murder', 1.0),
 ('convict', 0.9244711399078369),
 ('death', 0.912652850151062),
 ('arrest', 0.9031184315681458),
 ('charge', 0.9015302062034607),
 ('killing', 0.8916978240013123),
 ('murderer', 0.8909902572631836),
 ('guilty', 0.888261079788208),
 ('kidnapping', 0.8872666358947754),
 ('tortured', 0.8868398070335388)]

## Data

In [5]:
train_data = embedding.wv.vectors

In [6]:
print(len(train_data))

29757


## Model

In [4]:
class mixmodel(object):
    def __init__(self, n_z=4, sigma=1e-3, lr=1e-3, beta=1):
        self.em_dim = 64
        self.n_z = n_z
        self.beta = beta
        self.sigma = sigma
    
        self.x = tf.placeholder(tf.float32, [None, self.em_dim])
        
        self.z_mu, self.z_log_sigma_sq = self.vae_encoder(self.x)
        eps = tf.random_normal(shape=tf.shape(self.z_log_sigma_sq),
                               mean=0, stddev=self.sigma, dtype=tf.float32)
        self.z = self.z_mu + tf.sqrt(tf.exp(self.z_log_sigma_sq)) * eps

        self.x_recon = self.vae_decoder(self.z)
        
        ## VAE Loss ##
        # Reconstruction loss
        self.l2_loss = tf.reduce_mean(tf.squared_difference(self.x, self.x_recon))
                
        # Latent loss : Kullback Leibler divergence
        latent_loss = -0.5 * tf.reduce_sum(
            1 + self.z_log_sigma_sq - tf.square(self.z_mu) - tf.exp(self.z_log_sigma_sq), axis=1)
        self.latent_loss = self.beta * tf.reduce_mean(latent_loss)
        
        self.loss = self.l2_loss + self.latent_loss
        #self.vae_train = tf.train.MomentumOptimizer(lr_2, 0.9, use_nesterov=True).minimize(self.vae_loss, var_list = vae_vars)
        self.train = tf.train.AdamOptimizer(learning_rate=lr).minimize(self.loss)
    
    def vae_encoder(self, x):
        f1 = fc(x, 128, scope='vae_enc_fc1', activation_fn=tf.nn.elu)
        #f1 = slim.batch_norm(f1, scope='vae_enc_fc1')
        f2 = fc(f1, 128, scope='vae_enc_fc2', activation_fn=tf.nn.elu)
        #f2 = slim.batch_norm(f2, scope='vae_enc_fc2')
        f3 = fc(f2, 128, scope='vae_enc_fc3', activation_fn=tf.nn.elu)
        #f3 = slim.batch_norm(f3, scope='vae_enc_fc3')
        f4 = fc(f3, 64, scope='vae_enc_fc4', activation_fn=tf.nn.elu)
        #f4 = slim.batch_norm(f4, scope='vae_enc_fc4')
        f5 = fc(f4, 64, scope='vae_enc_fc5', activation_fn=tf.nn.elu)
        #f5 = slim.batch_norm(f5, scope='vae_enc_fc5')
        
        z_mu = fc(f4, self.n_z, scope='vae_enc_fc11_mu', activation_fn=None)
        z_log_sigma_sq = fc(f4, self.n_z, scope='vae_enc_fc11_sigma', activation_fn=None)
        
        return z_mu, z_log_sigma_sq
    
    def vae_decoder(self, z):
        g6 = fc(z, 64, scope='vae_dec_fc6', activation_fn=tf.nn.elu)
        #g6 = slim.batch_norm(g6, scope='vae_dec_fc6')
        g7 = fc(g6, 64, scope='vae_dec_fc7', activation_fn=tf.nn.elu)
        #g7 = slim.batch_norm(g7, scope='vae_dec_fc7')
        g8 = fc(g7, 128, scope='vae_dec_fc8', activation_fn=tf.nn.elu)
        #g8 = slim.batch_norm(g8, scope='vae_dec_fc8')
        g9 = fc(g8, 128, scope='vae_dec_fc9', activation_fn=tf.nn.elu)
        #g9 = slim.batch_norm(g9, scope='vae_dec_fc9')
        g10 = fc(g9, 128, scope='vae_dec_fc10', activation_fn=tf.nn.elu)
        #g10 = slim.batch_norm(g10, scope='vae_dec_fc10')
        x_recon = fc(g10, self.em_dim, scope='vae_dec_fc11', activation_fn=None)
        
        return x_recon    

In [None]:
class simplemodel(object):
    def __init__(self, n_z=4, sigma=1e-3, lr=1e-3, beta=1):
        self.em_dim = 100
        self.n_z = n_z
        self.beta = beta
        self.sigma = sigma
    
        self.x = tf.placeholder(tf.float32, [None, self.em_dim])
        
        self.z_mu, self.z_log_sigma_sq = self.vae_encoder(self.x)
        eps = tf.random_normal(shape=tf.shape(self.z_log_sigma_sq),
                               mean=0, stddev=self.sigma, dtype=tf.float32)
        self.z = self.z_mu + tf.sqrt(tf.exp(self.z_log_sigma_sq)) * eps

        self.x_recon = self.vae_decoder(self.z)
        
        ## VAE Loss ##
        # Reconstruction loss
        self.l2_loss = tf.reduce_mean(tf.squared_difference(self.x, self.x_recon))
                
        # Latent loss : Kullback Leibler divergence
        latent_loss = -0.5 * tf.reduce_sum(
            1 + self.z_log_sigma_sq - tf.square(self.z_mu) - tf.exp(self.z_log_sigma_sq), axis=1)
        self.latent_loss = self.beta * tf.reduce_mean(latent_loss)
        
        self.loss = self.l2_loss + self.latent_loss
        #self.vae_train = tf.train.MomentumOptimizer(lr_2, 0.9, use_nesterov=True).minimize(self.vae_loss, var_list = vae_vars)
        self.train = tf.train.AdamOptimizer(learning_rate=lr).minimize(self.loss)
    
    def vae_encoder(self, x):
        f1 = fc(x, 256, scope='vae_enc_fc1', activation_fn=tf.nn.elu)
        #f1 = slim.batch_norm(f1, scope='vae_enc_fc1')
        f2 = fc(f1, 256, scope='vae_enc_fc2', activation_fn=tf.nn.elu)
        #f2 = slim.batch_norm(f2, scope='vae_enc_fc2')
        
        z_mu = fc(f2, self.n_z, scope='vae_enc_fc11_mu', activation_fn=None)
        z_log_sigma_sq = fc(f2, self.n_z, scope='vae_enc_fc11_sigma', activation_fn=None)
        
        return z_mu, z_log_sigma_sq
    
    def vae_decoder(self, z):
        g9 = fc(z, 256, scope='vae_dec_fc9', activation_fn=tf.nn.elu)
        #g9 = slim.batch_norm(g9, scope='vae_dec_fc9')
        g10 = fc(g9, 256, scope='vae_dec_fc10', activation_fn=tf.nn.elu)
        #g10 = slim.batch_norm(g10, scope='vae_dec_fc10')
        x_recon = fc(g10, 100, scope='vae_dec_fc11', activation_fn=None)
        
        return x_recon    

## Define Graph

In [12]:
tf.reset_default_graph()
#model = simplemodel(n_z=64, sigma=0, lr=1e-3, beta=1)
model = mixmodel(n_z=16, sigma=0, lr=1e-3, beta=1)

tfconfig = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess=tf.Session(config=tfconfig)
sess.run(tf.global_variables_initializer())

## Load Model

In [13]:
model_name = 'vae16_100'
saver = tf.train.Saver()
saver.restore(sess, './ckpt_w2v/%s.ckpt' %model_name)

INFO:tensorflow:Restoring parameters from ./ckpt_w2v/vae16_100.ckpt


## Training

In [19]:
bs = 100
num_steps = len(train_data)//bs
ne = 100
print_step = 400
save_step = 5

saver = tf.train.Saver(max_to_keep = 10)

for epoch in range(ne):
    print('Epoch: [%d/%d]' %(epoch+1, ne))
    count = 0 
    avg_loss = 0
    avg_l2_loss = 0
    avg_latent_loss = 0
    
    np.random.shuffle(train_data)
    for idx in range(num_steps):
        batch_x = train_data[idx*bs:(idx+1)*bs]
        loss, l2_loss, latent_loss, _ = sess.run([model.loss, \
                model.l2_loss, model.latent_loss, model.train], \
                feed_dict = {model.x: batch_x})

        count += 1        
        avg_loss += loss
        avg_l2_loss += l2_loss
        avg_latent_loss += latent_loss

    print('total loss: <%.4f>, l2: %.4f, latent: %.4f' \
         %(avg_loss/count, avg_l2_loss/count, avg_latent_loss/count))
    
    if (epoch+1)%save_step==0:
        saver.save(sess, './ckpt_w2v/vae32_%d.ckpt' %(epoch+1))

Epoch: [1/100]
total loss: <0.1500>, l2: 0.0919, latent: 0.0580
Epoch: [2/100]
total loss: <0.0714>, l2: 0.0637, latent: 0.0077
Epoch: [3/100]
total loss: <0.0612>, l2: 0.0558, latent: 0.0054
Epoch: [4/100]
total loss: <0.0549>, l2: 0.0507, latent: 0.0042
Epoch: [5/100]
total loss: <0.0510>, l2: 0.0475, latent: 0.0035
Epoch: [6/100]
total loss: <0.0478>, l2: 0.0449, latent: 0.0029
Epoch: [7/100]
total loss: <0.0462>, l2: 0.0439, latent: 0.0023
Epoch: [8/100]
total loss: <0.0440>, l2: 0.0420, latent: 0.0020
Epoch: [9/100]
total loss: <0.0425>, l2: 0.0408, latent: 0.0017
Epoch: [10/100]
total loss: <0.0411>, l2: 0.0397, latent: 0.0014
Epoch: [11/100]
total loss: <0.0406>, l2: 0.0394, latent: 0.0012
Epoch: [12/100]
total loss: <0.0400>, l2: 0.0390, latent: 0.0010
Epoch: [13/100]
total loss: <0.0396>, l2: 0.0388, latent: 0.0009
Epoch: [14/100]
total loss: <0.0394>, l2: 0.0386, latent: 0.0008
Epoch: [15/100]
total loss: <0.0392>, l2: 0.0384, latent: 0.0007
Epoch: [16/100]
total loss: <0.039

## Evaluation

In [15]:
word = 'president'
word_vec = embedding[word]
code, recon_vec = sess.run([model.z, model.x_recon], feed_dict={model.x: [word_vec]})
print(code[0])

similar = embedding.wv.similar_by_vector(recon_vec[0])
similar_words, similarities = zip(*similar)
print(list(similar_words))

[-0.00559954 -0.00951704  0.00137021  0.00206775  0.00935208 -0.00566384
  0.00708273  0.00113553 -0.01599122 -0.01370033  0.00104946  0.00606643
 -0.0087571  -0.00569791 -0.00328011 -0.00063683]
['president', 'leader', 'preside', 'biden', 'opponent', 'barack_obama', 'ouster', 'oust', 'dictatorship', 'successor']


  
  if np.issubdtype(vec.dtype, np.int):


In [9]:
def similar_by_distance(target_vec, num_out=10):
    distance = np.sum(np.power(embedding.wv.vectors - target_vec, 2), axis=1)
    idx = np.argpartition(distance, num_out)
    result = []
    for i in range(num_out):
        result.append(embedding.wv.index2word[idx[i]])
    return result

In [16]:
similar_by_distance(embedding['injure'])

  """Entry point for launching an IPython kernel.


['wound',
 'injured',
 'dozen',
 'dead',
 'least',
 'injure',
 'critically_injur',
 'wounded',
 'kill',
 'seriously_injur']

In [17]:
embedding.wv.similar_by_vector(embedding.wv['injure'])

  if np.issubdtype(vec.dtype, np.int):


[('injure', 1.0),
 ('injured', 0.9605617523193359),
 ('wound', 0.9437973499298096),
 ('wounded', 0.9277883172035217),
 ('least', 0.9168642163276672),
 ('dozen', 0.904802143573761),
 ('kill', 0.8982696533203125),
 ('dead', 0.8954891562461853),
 ('critically_injur', 0.8931788206100464),
 ('trains_collide', 0.8898150324821472)]

In [18]:
word = 'stock'
word_vec = embedding.wv[word]
code, recon_vec = sess.run([model.z, model.x_recon], feed_dict={model.x: [word_vec]})
#print(recon_vec[0])

similar = embedding.wv.similar_by_vector(word_vec)
similar_words, similarities = zip(*similar)
#print(list(similar_words))
print(similar)

similar = embedding.wv.similar_by_vector(recon_vec[0])
similar_words, similarities = zip(*similar)
print()
print(list(similar_words))
print()
print(similar_by_distance(recon_vec[0]))

[('stock', 1.0), ('investor', 0.7777920961380005), ('stock_market', 0.7727078199386597), ('appetite', 0.7531195878982544), ('trading', 0.74650639295578), ('pile', 0.7432770729064941), ('emerging_market', 0.7402875423431396), ('bond', 0.7398805618286133), ('commodity', 0.7372817397117615), ('profit', 0.7363636493682861)]

['stock', 'bond', 'dollar', 'price', 'buying', 'appetite', 'dip', 'profit', 'boon', 'cash']

['instance', 'profit', 'stock', 'appetite', 'trouble', 'dollar', 'dip', 'value', 'bond', 'buying']


  if np.issubdtype(vec.dtype, np.int):


## Distribution