In [16]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import time

import tensorflow as tf
from tensorflow.contrib.slim import fully_connected as fc
#from sklearn.manifold import TSNE

from ThemeSeacher import Updator, clean, PhraserModel, tokenizer
from ThemeSeacher import KeywordDict, Model, Extractor
from ThemeSeacher import EmbeddingModel, Clustering

In [2]:
phraser = PhraserModel().get_phraser
dic = KeywordDict(phraser=phraser).get_dict

phraser_default.bin  loaded
keyword dictionary loaded


# Pickle Functions

In [3]:
def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl``.

    Args:
        obj: Any picklable Python object.
        name: File stem (may contain sub-directories); the object is written
            to ``'obj/' + name + '.pkl'``.
    """
    path = 'obj/' + name + '.pkl'
    # On a fresh checkout the target directory does not exist and open()
    # would raise FileNotFoundError, so create it on demand.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in ``obj/<name>.pkl``."""
    pkl_path = ''.join(('obj/', name, '.pkl'))
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(pkl_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

def exist(name):
    """Tell whether ``obj/<name>.pkl`` is present on disk."""
    pkl_path = ''.join(('obj/', name, '.pkl'))
    return os.path.exists(pkl_path)


# Embedding

In [4]:
embedding = EmbeddingModel().get_embedding

Embedding default loaded


In [5]:
total_data = load_obj('upper1')

In [6]:
total_data.lemmatized[0:5]

1     [president, obama, want, give, young, leader, ...
3             [japan, ask, condemn, korea, rights_abus]
11    [ukraine, blame, russia, fatal, ambush, husban...
13    [kim_jong_nam, murder, china, break, north, ko...
16    [finland, take, phone, call, find, much, neigh...
Name: lemmatized, dtype: object

In [105]:
total_data = total_data.reset_index(drop=True)

In [107]:
# Look up the embedding vectors of every lemmatized title; each entry of
# train_data is whatever `embedding[list_of_tokens]` returns for that title
# (presumably a (num_tokens, embedding_dim) array -- verify against the
# embedding model).
train_data = []
# Iterate the column directly instead of DataFrame.iterrows(): iterrows builds
# a full Series per row, which is needlessly slow for several hundred thousand
# titles, and enumerate() gives a positional counter that does not depend on
# the frame's index having been reset beforehand.
for idx, lemmas in enumerate(total_data.lemmatized):
    # Single-character tokens are dropped before the lookup.
    title = [x for x in lemmas if len(x) != 1]

    train_data.append(embedding[title])
    if (idx + 1) % 100000 == 0:
        print(idx + 1)

  


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000


In [113]:
save_obj(train_data, 'seq2seq_embed')

In [7]:
train_data = load_obj('seq2seq_embed')

In [8]:
len(train_data)

557440

# Generator

In [9]:
def generator(data, bs, max_len=40):
    """Yield shuffled, left-padded mini-batches of embedded titles.

    The visiting order is drawn from a random permutation; ``data`` itself is
    left untouched so it stays aligned with the frame it was built from.

    Args:
        data: Sequence of 2-D arrays, one per title, each with at most
            ``max_len`` rows (one row per token).
        bs: Batch size. Only full batches are produced; the trailing
            ``len(data) % bs`` titles of a pass are skipped.
        max_len: Number of time steps every title is padded to.

    Yields:
        Tuple ``(x, mask)``: ``x`` stacks the titles zero-padded at the front
        to ``max_len`` rows, ``mask`` has shape ``(bs, max_len)`` with 0 on
        padded positions and 1 on real tokens.

    Raises:
        ValueError: If a title has more than ``max_len`` rows.
    """
    order = np.random.permutation(len(data))
    for i in range(len(data) // bs):
        batch = [data[j] for j in order[bs * i: bs * (i + 1)]]

        x_gen, mask_gen = [], []
        for seq in batch:
            n_pad = max_len - len(seq)
            if n_pad < 0:
                raise ValueError(
                    'title with %d tokens exceeds max_len=%d' % (len(seq), max_len))
            # Pad at the front of the time axis so the tokens are right-aligned.
            x_gen.append(np.pad(seq, ((n_pad, 0), (0, 0)), 'constant'))
            mask_gen.append(np.array([0] * n_pad + [1] * len(seq)))

        yield np.array(x_gen), np.array(mask_gen)

# Seq2seq Model

In [10]:
class rnnmodel(object):
    """LSTM sequence autoencoder over padded title embeddings (TF1 graph mode).

    Building an instance adds all ops to the current default graph and exposes:
        x:      float32 placeholder, shape [None, 40, 100] (batch, time, embedding).
        mask:   float32 placeholder, shape [None, 40]; multiplied into the
                per-position loss.
        code:   element 1 of the encoder's final state tuple (the `h` field of
                the LSTMStateTuple, as the debug print below shows).
        output: decoder outputs reversed along the time axis.
        loss:   masked mean squared reconstruction error.
        train:  Adam update op minimising `loss`.
    """

    def __init__(self, lr: float = 1e-4):
        """Build the encoder, decoder, loss and optimizer.

        Args:
            lr: Learning rate passed to the Adam optimizer.
        """
        title_len = 40
        em_dim = 100
        hidden_dim = 100
        # Not referenced anywhere else in this constructor.
        beta = 1e-5
    
        self.x = tf.placeholder(tf.float32, [None, title_len, em_dim])
        self.mask = tf.placeholder(tf.float32, [None, title_len])
        
        # Encoder: run an LSTM over the whole padded sequence.
        encoder_cell = tf.contrib.rnn.LSTMCell(hidden_dim)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            encoder_cell, self.x,
            dtype=tf.float32, time_major=False,
        )
        
        # The fixed-size title code is index 1 of the final state tuple.
        self.code = encoder_final_state[1]
        code = tf.reshape(self.code, [-1, 1, hidden_dim])
        # Debug output of the tensors built so far. The tf.tile call inside
        # print() adds an extra op to the graph in addition to the one fed to
        # the decoder below.
        print('encoder final state:', encoder_final_state[1])
        print('code:', code)
        print(self.x)
        print(tf.tile(code, [1, title_len, 1]))
        print(encoder_final_state)
        
        # Decoder: a second LSTM that receives the same code at every one of
        # the title_len steps and starts from the encoder's final state.
        # NOTE(review): the decoder cell has em_dim units but is initialised
        # with a state produced by a hidden_dim-unit cell; this presumably only
        # lines up because em_dim == hidden_dim -- confirm before changing either.
        decoder_cell = tf.contrib.rnn.LSTMCell(em_dim)
        decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
            decoder_cell, tf.tile(code, [1, title_len, 1]),
            initial_state = encoder_final_state,
            dtype=tf.float32, time_major=False, scope="plain_decoder",
        )
        print('decoder')
        
        # Reverse along axis 1 (time), so decoder step 0 is compared against
        # the last input position.
        x_ = tf.reverse(decoder_outputs, [1])
        # Mean squared error over the embedding axis -> one value per position.
        l2_loss = tf.reduce_mean(tf.squared_difference(self.x, x_), axis=2)
        
        # NOTE(review): the mean is taken over every (batch, time) position,
        # masked ones included (they contribute 0), so the value shrinks with
        # the amount of padding in the batch.
        self.loss = tf.reduce_mean(l2_loss * self.mask)
        self.train = tf.train.AdamOptimizer(learning_rate=lr).minimize(self.loss)
        self.output = x_
        

In [12]:
# Build the seq2seq autoencoder in its own graph (g1) and open a session on it.
tf.reset_default_graph()
g1 = tf.Graph()
with g1.as_default() as g:
    # Every op created by rnnmodel is prefixed with 'g1/' (see the printed
    # tensor names in the cell output).
    with g.name_scope('g1') as scope:
        model = rnnmodel(lr=1e-2)

        # allow_growth: let TensorFlow grow its GPU allocation on demand.
        tfconfig = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
        # Created while g1 is the default graph, so the session is bound to g1.
        # It is never closed in this notebook; later cells reuse the global `sess`.
        sess=tf.Session(config=tfconfig)
        sess.run(tf.global_variables_initializer())

encoder final state: Tensor("g1/rnn/while/Exit_4:0", shape=(?, 100), dtype=float32)
code: Tensor("g1/Reshape:0", shape=(?, 1, 100), dtype=float32)
Tensor("g1/Placeholder:0", shape=(?, 40, 100), dtype=float32)
Tensor("g1/Tile:0", shape=(?, 40, 100), dtype=float32)
LSTMStateTuple(c=<tf.Tensor 'g1/rnn/while/Exit_3:0' shape=(?, 100) dtype=float32>, h=<tf.Tensor 'g1/rnn/while/Exit_4:0' shape=(?, 100) dtype=float32>)
decoder


In [13]:
# Train the seq2seq autoencoder for `ne` passes over train_data.
bs = 1000                          # batch size
num_steps = len(train_data) // bs  # full batches per epoch
ne = 5                             # number of epochs
print_step = 100                   # report the loss every this many steps
for epoch in range(ne):
    print('Epoch: [%d]' % (epoch + 1))
    window_loss = 0.0

    for count, (x_batch, mask_batch) in enumerate(generator(train_data, bs), start=1):
        loss, _ = sess.run([model.loss, model.train],
                           feed_dict={model.x: x_batch, model.mask: mask_batch})

        window_loss += loss
        if count % print_step == 0:
            # Mean loss over the last `print_step` steps. The accumulator was
            # previously divided by the batch size and never reset, so the
            # printed number was not an average of anything.
            print('steps:', count, 'loss-', window_loss / print_step)
            window_loss = 0.0

1
100 0.012637195
200 0.01183142
300 0.011044705
400 0.010863708
500 0.010469265
2
100 0.01078619
200 0.009567589
300 0.009628285
400 0.010404469
500 0.009667555
3
100 0.010611333
200 0.009885514
300 0.009893315
400 0.009990467
500 0.009637642
4
100 0.00978215
200 0.009817317
300 0.009984107
400 0.009653988
500 0.009857389
5
100 0.009786381
200 0.0094190985
300 0.009666117
400 0.0094984425
500 0.009815341


# VAE Model

In [14]:
class VAE_2d(object):
    """Fully-connected variational autoencoder over 100-dimensional vectors.

    The constructor builds the graph in the current default graph, opens a
    ``tf.InteractiveSession`` and initialises all variables.

    Args:
        n_z: Size of the latent vector.
        learning_rate: Learning rate of the Adam optimizer.
        beta: Weight of the KL term in the total loss.
    """

    # Layer widths of the ELU stacks. Layer i (1-based) lives in the variable
    # scope 'enc_fc<i>' / 'dec_fc<i>', so existing checkpoints keep loading.
    _ENC_UNITS = (100, 64, 64, 32, 32, 16, 16, 8, 8, 4)
    _DEC_UNITS = (4, 8, 8, 16, 16, 32, 32, 64, 64, 100)

    def __init__(self, n_z=2, learning_rate=1e-3, beta=100):
        self.learning_rate = learning_rate
        self.n_z = n_z
        self.beta = beta

        self.build()

        # NOTE: the InteractiveSession is never closed by this class.
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())

    def build(self):
        """Build the network, the loss terms and the training op."""
        self.x = tf.placeholder(name='x', dtype=tf.float32, shape=[None, 100])

        # Encode: x -> (z_mu, z_log_sigma_sq) -> z
        hidden = self.x
        for idx, units in enumerate(self._ENC_UNITS, start=1):
            hidden = fc(hidden, units, scope='enc_fc%d' % idx,
                        activation_fn=tf.nn.elu)
        self.z_mu = fc(hidden, self.n_z, scope='enc_fc11_mu', activation_fn=None)
        self.z_log_sigma_sq = fc(hidden, self.n_z, scope='enc_fc11_sigma',
                                 activation_fn=None)
        # NOTE(review): the sampling noise uses stddev=0.001 rather than 1, so
        # z stays very close to z_mu; presumably intentional -- confirm.
        eps = tf.random_normal(shape=tf.shape(self.z_log_sigma_sq),
                               mean=0, stddev=0.001, dtype=tf.float32)
        self.z = self.z_mu + tf.sqrt(tf.exp(self.z_log_sigma_sq)) * eps

        # Decode: z -> x_hat
        hidden = self.z
        for idx, units in enumerate(self._DEC_UNITS, start=1):
            hidden = fc(hidden, units, scope='dec_fc%d' % idx,
                        activation_fn=tf.nn.elu)
        self.x_hat = fc(hidden, 100, scope='dec_fc11', activation_fn=None)

        # Reconstruction loss: mean squared error between x_hat and x, averaged
        # over the whole batch (already a scalar).
        recon_loss = tf.reduce_mean(tf.squared_difference(self.x_hat, self.x))
        self.recon_loss = tf.reduce_mean(recon_loss)

        # Latent loss: KL divergence between the latent distribution and
        # N(0, 1), one value per example.
        latent_loss = -0.5 * tf.reduce_sum(
            1 + self.z_log_sigma_sq - tf.square(self.z_mu)
            - tf.exp(self.z_log_sigma_sq), axis=1)
        self.latent_loss = tf.reduce_mean(latent_loss)

        # The scalar reconstruction term broadcasts over the per-example KL.
        self.total_loss = tf.reduce_mean(recon_loss + self.beta * latent_loss)
        self.train_op = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate).minimize(self.total_loss)
        return

    def run_single_step(self, x):
        """Run one optimisation step on batch ``x``.

        Returns:
            Tuple ``(total_loss, recon_loss, latent_loss)`` for this batch.
        """
        _, loss, recon_loss, latent_loss = self.sess.run(
            [self.train_op, self.total_loss, self.recon_loss, self.latent_loss],
            feed_dict={self.x: x}
        )
        return loss, recon_loss, latent_loss

    def reconstructor(self, x):
        """x -> x_hat: encode and decode ``x``."""
        x_hat = self.sess.run(self.x_hat, feed_dict={self.x: x})
        return x_hat

    def generator(self, z):
        """z -> x_hat: decode latent vectors fed directly in place of ``self.z``."""
        x_hat = self.sess.run(self.x_hat, feed_dict={self.z: z})
        return x_hat

    def transformer(self, x):
        """x -> z: return the (sampled) latent vectors for ``x``."""
        z = self.sess.run(self.z, feed_dict={self.x: x})
        return z

    def restore(self, saver, ckpt):
        """Load the variables stored in checkpoint ``ckpt`` into this model's session."""
        saver.restore(self.sess, ckpt)

In [18]:
# Checkpoint stem under ./ckpt/ holding the trained VAE weights.
model_name = 'final/mg2_200'

# NOTE(review): the seq2seq session `sess` is still open at this point; the
# TensorFlow docs warn against resetting the default graph while a session is
# active. The VAE is built in its own explicit graph (g2) right below, so this
# call looks redundant -- confirm before removing.
tf.reset_default_graph()
g2 = tf.Graph()
with g2.as_default() as g:
    #with g.name_scope('g2') as scope:
    # The hyper-parameters only shape the graph here; weights come from the
    # checkpoint. Note the latent size is 4 despite the class name.
    vae_model = VAE_2d(n_z=4, learning_rate=1e-4, beta=0.01)
    # Saver over the variables of g2 (the current default graph).
    saver = tf.train.Saver()
    vae_model.restore(saver, './ckpt/%s.ckpt' %model_name)

INFO:tensorflow:Restoring parameters from ./ckpt/final/mg2_200.ckpt




# Evaluation

In [42]:
TITLE_LEN = 40  # fixed sequence length; matches title_len inside rnnmodel


def _embed_and_pad(word_list, max_len=TITLE_LEN):
    """Embed a tokenized title and left-pad it to ``max_len`` steps.

    Single-character tokens are dropped before the embedding lookup (uses the
    notebook-level ``embedding`` object).

    Returns:
        Tuple ``(x, mask, n)``: ``x`` is the padded embedding with a leading
        batch axis of size 1, ``mask`` the matching 0/1 mask (0 on padding),
        and ``n`` the number of tokens that were kept.

    Raises:
        ValueError: If more than ``max_len`` tokens remain.
    """
    title = [w for w in word_list if len(w) != 1]
    embed = embedding[title]
    n = len(embed)
    n_pad = max_len - n
    if n_pad < 0:
        raise ValueError('title with %d tokens exceeds max_len=%d' % (n, max_len))
    x = np.pad(embed, ((n_pad, 0), (0, 0)), 'constant')
    mask = np.array([0] * n_pad + [1] * n)
    return x[np.newaxis], mask[np.newaxis], n


def generate_vae(word_list, model):
    """Reconstruct a title after passing its seq2seq code through the VAE.

    The title is encoded with ``model`` (via the notebook-level ``sess``), the
    code is reconstructed by the notebook-level ``vae_model``, and the decoder
    is then run with that reconstructed code fed in place of ``model.code``.

    Args:
        word_list: Tokens of one title.
        model: The seq2seq model exposing ``x``, ``mask``, ``code``, ``output``.

    Returns:
        The decoder outputs for the real (non-padded) positions, one row per
        kept token.
    """
    # No tf.reset_default_graph() here: both sessions are bound to their own
    # explicit graphs, and resetting the default graph while sessions are
    # active is documented as undefined behaviour.
    x, mask, n = _embed_and_pad(word_list)
    code = sess.run(model.code, feed_dict={model.x: x, model.mask: mask})

    code_recon = vae_model.reconstructor(code)

    output = sess.run(model.output, feed_dict={model.x: x, model.mask: mask,
                                               model.code: code_recon})
    # Slice from an explicit start index: `-n:` would return every row when n == 0.
    return output[0][TITLE_LEN - n:]


def generate_seq2seq(word_list, model):
    """Reconstruct a title with the seq2seq autoencoder alone.

    Args:
        word_list: Tokens of one title.
        model: The seq2seq model exposing ``x``, ``mask`` and ``output``.

    Returns:
        The decoder outputs for the real (non-padded) positions, one row per
        kept token.
    """
    x, mask, n = _embed_and_pad(word_list)
    output = sess.run(model.output, feed_dict={model.x: x, model.mask: mask})

    return output[0][TITLE_LEN - n:]


def code_extraction(word_list, model):
    """Return the seq2seq code of a title and its VAE reconstruction.

    Args:
        word_list: Tokens of one title.
        model: The seq2seq model exposing ``x``, ``mask`` and ``code``.

    Returns:
        Tuple ``(code, code_recon)``.
    """
    x, mask, _ = _embed_and_pad(word_list)
    code = sess.run(model.code, feed_dict={model.x: x, model.mask: mask})

    code_recon = vae_model.reconstructor(code)

    return code, code_recon

In [38]:
test_vae = generate_vae(total_data.lemmatized[1], model)
test_s2s = generate_seq2seq(total_data.lemmatized[1], model)

  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
code_extraction(total_data.lemmatized[1], model)



(array([[ 0.20027529, -0.13018477,  0.03980954,  0.21736607,  0.02524574,
         -0.02402929, -0.28230315,  0.09049254, -0.13874799, -0.09714895,
          0.02653462,  0.0269086 , -0.33080766,  0.5275354 ,  0.21938805,
          0.30270538,  0.14878947,  0.15191437, -0.2992179 , -0.00331125,
         -0.2815965 ,  0.11543564,  0.01201137,  0.03026124,  0.11828549,
         -0.32321078, -0.03908052, -0.08149406, -0.3764791 ,  0.22746891,
          0.03820552, -0.10167433,  0.05854119, -0.05453959,  0.12319708,
         -0.25092563, -0.06562562, -0.00290043,  0.11651237,  0.15291126,
         -0.02640409, -0.09764643,  0.14780325,  0.09023331,  0.06406042,
          0.08276749,  0.16384453, -0.00955445, -0.01579162, -0.23494753,
          0.0509443 ,  0.09332494,  0.15396006, -0.11410368, -0.25492015,
         -0.03342448, -0.10541394, -0.18522108,  0.0407915 ,  0.24781626,
          0.01616971,  0.06525198,  0.14882937, -0.27299055, -0.16267157,
          0.11947787,  0.3307075 , -0.

In [39]:
print(total_data.lemmatized[1])

['president', 'obama', 'want', 'give', 'young', 'leader', 'world', 'tool', 'organize']


In [40]:
# Nearest vocabulary entries for each token vector reconstructed through the
# VAE. The loop bound used to be len(test_gen), a name that is never defined in
# this notebook and raises NameError on a fresh kernel.
for i in range(len(test_vae)):
    print(embedding.similar_by_vector(test_vae[i]))
    print()

[('abc_cancele', 0.5616289377212524), ('film_coco', 0.5578174591064453), ('sep_downcast', 0.5497655868530273), ('flu', 0.5437222719192505), ('fatigue_pain', 0.5226050615310669), ('barr_debacle', 0.52010178565979), ('include_pyelonephritis', 0.515978217124939), ('kidney_liver', 0.5144096612930298), ('nonsteroidal_inflammatory', 0.5140340328216553), ('severe_anemia', 0.5129023790359497)]

[('sep_downcast', 0.5242416262626648), ('hurricane_nate', 0.5188075304031372), ('abc_cancele', 0.5171298980712891), ('hurricane_impactlayoff', 0.5166088342666626), ('severe_anemia', 0.5149905681610107), ('effect_concomitant', 0.5131708979606628), ('dbrs_fake', 0.5089902877807617), ('stamo_tweet', 0.5051372051239014), ('reveal_resentation', 0.5012757778167725), ('allegedly_exorbitant', 0.5009552240371704)]

[('fallout', 0.5692722797393799), ('wo_insufficient', 0.5444861650466919), ('affected_barrage', 0.5437876582145691), ('unexpected_complication', 0.5355677604675293), ('spike', 0.5228429436683655), ('d

  
  if np.issubdtype(vec.dtype, np.int):


In [41]:
# Nearest vocabulary entries for each token vector reconstructed by the seq2seq
# model alone. The loop bound used to be len(test_gen), a name that is never
# defined in this notebook and raises NameError on a fresh kernel.
for i in range(len(test_s2s)):
    print(embedding.similar_by_vector(test_s2s[i]))
    print()

[('insist', 0.7841520309448242), ('president_obama', 0.7695669531822205), ('press', 0.7526883482933044), ('supporter', 0.7513612508773804), ('congressionally', 0.7492458820343018), ('invite', 0.7473600506782532), ('openly', 0.7466476559638977), ('democratically_elect', 0.7462288737297058), ('white_house', 0.7457312345504761), ('obama', 0.7449266314506531)]

[('obama', 0.8969504237174988), ('president_obama', 0.8721156120300293), ('barack_obama', 0.868120551109314), ('leftist', 0.8426805734634399), ('politician', 0.8405747413635254), ('supporter', 0.8384084701538086), ('democratic_party', 0.836190938949585), ('communist_dictator', 0.8334490656852722), ('democratic_congresswoman', 0.8332608342170715), ('democracy', 0.8316212892532349)]

[('convince', 0.7757793664932251), ('matter', 0.7441308498382568), ('explain', 0.740172266960144), ('question', 0.7266321182250977), ('neither', 0.7199560403823853), ('decide', 0.7144235372543335), ('ask', 0.7121338844299316), ('indeed', 0.707891941070556

  
  if np.issubdtype(vec.dtype, np.int):
