In [1]:
from text_loader_utils import TextLoader
import pickle as cPickle
import numpy as np
import tensorflow.compat.v1 as tf
from variational_model import NVDM
from vector_utils import find_norm
from tf_common_utils import load_model , save_model

# Fix both the NumPy and the TensorFlow seeds so that random draws made through
# either library are repeatable from one run of the notebook to the next.
np.random.seed(0)
tf.set_random_seed(0)

def xavier_init(fan_in , fan_out, constant=1):
    """Sample a (fan_in, fan_out) weight tensor with Xavier uniform initialization.

    Values are drawn uniformly between -limit and +limit, where
    limit = constant * sqrt(6 / (fan_in + fan_out)).

    Args:
        fan_in: size of the first dimension of the returned tensor.
        fan_out: size of the second dimension of the returned tensor.
        constant: multiplier applied to the sampling bound (default 1).

    Returns:
        The float32 result of tf.random_uniform with shape (fan_in, fan_out).
    """
    # Recipe taken from:
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    limit = constant * np.sqrt(6.0 / (fan_in + fan_out))
    weight_shape = (fan_in, fan_out)
    return tf.random_uniform(weight_shape, minval=-limit, maxval=limit, dtype=tf.float32)

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the training split of the 20-newsgroups corpus and keep the raw documents.
twenty_train = fetch_20newsgroups(subset='train')   
data_ = twenty_train.data
print("Download 20 news group data completed")
# Wrap the raw documents in the project's TextLoader.
# NOTE(review): min_count is presumably the minimum frequency a token needs to be
# kept in the vocabulary - confirm in text_loader_utils.
A = TextLoader(data_ , min_count = 25)
batch_size = 100
# Restrict memory usage: TensorFlow is greedy and will use all memory otherwise.
gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
# The session that consumes gpu_opts is created in the next cell.

Download 20 news group data completed


In [3]:
# Open the interactive TensorFlow session, passing the GPU memory options built in the previous cell.
sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_opts))

In [4]:
# Build the NVDM model on top of the open session.
# NOTE(review): the positional arguments are presumably (input size, latent size,
# hidden layer sizes) - the generator weight matrix fetched later has 50 rows,
# which matches the second of them; confirm against variational_model.NVDM.
vae = NVDM(
    sess,
    len(A.vocab),
    50,
    [500, 500],
    transfer_fct=tf.nn.tanh,
    output_activation=tf.nn.softmax,
    batch_size=100,
    initializer=xavier_init,
)

In [5]:
# Start the model. The recorded output is a TensorFlow deprecation notice pointing at
# tf.global_variables_initializer, so this presumably initialises the graph variables.
vae.start_the_model()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [6]:
# Restore previously trained weights into the model; the recorded run restored the
# checkpoint NVDM-1000 from the save_my_model directory and returned True.
load_model(vae)


 [*] Loading checkpoints...
Model dir is save_my_model
 ckpt name  NVDM-1000
INFO:tensorflow:Restoring parameters from C:\Users\liana\Documents\GitHub\textsim\vae\Deep-Learning-Projects\variational_text_inference\save_my_model\NVDM-1000
 [*] Load SUCCESS


True

In [7]:
# Evaluate the generator's 'out_mean' weight variable in the session to get its current values.
embedding_matrix = vae.SESS.run(vae.Weights_generator['out_mean'])

In [8]:
# Check the layout of the fetched weights (the recorded run shows (50, 8466)).
# NOTE(review): a later recorded error mentions a placeholder of width 7162, so the
# outputs stored in this notebook come from different kernel states - re-run from a
# fresh kernel before trusting any of them.
embedding_matrix.shape

(50, 8466)

In [9]:
# Rebuild the word-embedding matrix from the model weights every time this cell runs.
# The previous version re-bound embedding_matrix from itself, so running the cell a
# second time transposed the matrix back and normalised it along the wrong axis.
# After the transpose there is one row per vocabulary index.
word_vectors_raw = vae.SESS.run(vae.Weights_generator['out_mean']).transpose()
# Normalising the matrix up front is what makes the later cosine-similarity lookups fast
# (find_norm presumably L2-normalises each row - confirm in vector_utils).
embedding_matrix = find_norm(word_vectors_raw)

In [44]:
from vector_utils import find_similar
# Show the normalised embedding matrix (NumPy abbreviates the printout).
print(embedding_matrix)

# Compare every row of the matrix with the row belonging to the word "thing";
# find_similar returns the similarity scores and the matching row indexes.
sims, idx = find_similar(embedding_matrix, embedding_matrix[A.vocab["thing"]])
# Show the first nine row indexes, then translate them back into words.
# In the recorded run the first hit is the query word itself.
print(idx[:9])
words = [A.vocab_inverse[x] for x in idx[:9]]
print(words)

[[ 0.18270917 -0.20933783  0.15996052 ... -0.10757259 -0.07899398
   0.17410365]
 [ 0.31164202  0.0486914   0.03740434 ...  0.02041274 -0.18702523
   0.15251045]
 [-0.11786717  0.1953356  -0.10596441 ...  0.17624044 -0.1335527
  -0.24091434]
 ...
 [-0.24149787 -0.05941224 -0.05474815 ...  0.03185274 -0.18754104
  -0.12645514]
 [ 0.12785831 -0.22319151  0.1011028  ... -0.09852819 -0.09301905
   0.12620029]
 [-0.09953024  0.15434173 -0.05730319 ... -0.1285125   0.11386734
  -0.0882315 ]]
[   1 2324 2943 6328 4488 1506  693 2446 4761]
['thing', 'ariane', 'publisher', 'bissell', 'marlin', 'refer', '4', 'cds', '3000']


In [27]:
from vector_utils import find_similar
def word_match(norm_mat , word_ , vocab, vocab_inverse , topN = 10):
    """Return the words whose embedding rows are most similar to the row of ``word_``.

    Args:
        norm_mat: embedding matrix indexed by vocabulary index; it is handed to
            find_similar unchanged (assumed to be row-normalised, as the name
            suggests - confirm with the caller).
        word_: query word; must be a key of ``vocab``.
        vocab: mapping from word to row index of ``norm_mat``.
        vocab_inverse: mapping from row index back to word.
        topN: the first ``topN - 1`` results returned by find_similar are kept.
            The query word itself is normally the first of them.

    Returns:
        A list of ``(word, similarity)`` tuples. A list (rather than the lazy
        ``zip`` iterator of Python 3) so the result is displayed by the notebook
        and can be iterated more than once.

    Raises:
        KeyError: if ``word_`` is not in ``vocab``, or if find_similar returns a
            row index that ``vocab_inverse`` does not contain (which happens when
            the matrix and the vocabulary were built from different data).
    """
    idx = vocab[word_]
    similarity_meas, indexes = find_similar(norm_mat, norm_mat[idx])
    top_indexes = indexes[:topN - 1]
    try:
        words = [vocab_inverse[i_x_] for i_x_ in top_indexes]
    except KeyError as err:
        # Same exception type as before, but say what actually went wrong instead
        # of surfacing a bare integer.
        raise KeyError(
            "row index %r returned by find_similar has no entry in vocab_inverse "
            "(norm_mat has %d rows, vocab_inverse has %d entries); the embedding "
            "matrix and the vocabulary were probably built from different data"
            % (err.args[0], len(norm_mat), len(vocab_inverse))
        ) from err
    return list(zip(words, similarity_meas[:topN - 1]))

In [28]:
# Look up the words closest to 'football'.
# NOTE(review): the recorded run failed with KeyError: 7898, i.e. a row index that
# A.vocab_inverse does not contain. The embedding matrix and the vocabulary in the
# kernel were probably built from different data - re-run the notebook from the top.
word_match(embedding_matrix , 'football' , A.vocab , A.vocab_inverse  )

KeyError: 7898

In [52]:
################# Pickup the hidden dimensions for all 20 news groups
batch_size = 100
H_20_grp_nws = []
batch_data = A.get_batch(batch_size)
batch_id = 0
#for i in A.get_batch(100):
#    for x in i:
        #print(x)
        
for batch_ in batch_data:

            batch_id += 1
            collected_data = [chunks for chunks in batch_]
            #print(collected_data)
            #print(A._bag_of_words(collected_data))
            batch_xs , mask_xs , mask_negative  = A._bag_of_words(collected_data)
            
            print(batch_xs)
            print(mask_xs)
            feed_dict = {vae.X: batch_xs , vae.dynamic_batch_size:batch_xs.shape[0],
                                            vae.MASK:mask_xs}
            
            h_batch    = vae.SESS.run(fetches = vae.z, feed_dict= feed_dict)
            H_20_grp_nws.extend(h_batch)

H_20_grp_nws = np.array(H_20_grp_nws)
from sklearn import manifold
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
%time H_tsne = tsne.fit_transform(H_20_grp_nws) ########### Converting to tsne



[[ 1  1 10 ...  0  0  0]
 [ 0  0 14 ...  0  0  0]
 [ 1  0 24 ...  0  0  0]
 ...
 [ 0  0 11 ...  0  0  0]
 [ 0  0  6 ...  0  0  0]
 [ 0  1  9 ...  0  0  0]]
[[1 1 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]]


ValueError: Cannot feed value of shape () for Tensor 'Placeholder:0', which has shape '(None, 7162)'

In [None]:


import matplotlib.pyplot as plt

%matplotlib inline


############ Keep only the labels of the documents that survived pre-processing
############ (the loader exposes the indexes of those documents as A.data_index)
newsgroups_target = [twenty_train.target[doc_pos] for doc_pos in A.data_index]

########### One-hot encode the labels with tf.one_hot, one column per distinct label
num_labels = len(set(newsgroups_target))
news_target_one_hot = sess.run(tf.one_hot(newsgroups_target , depth=num_labels))

########### Scatter the 2-D t-SNE projection, coloured by the label recovered from each one-hot row
point_colours = np.argmax(news_target_one_hot, 1)
plt.figure(figsize=(8, 6))
plt.scatter(H_tsne[:, 0], H_tsne[:, 1], c=point_colours)
plt.colorbar()


In [21]:

def find_similiar_docs(index, topN = 10):
    """Return a document and the documents whose latent vectors are closest to it.

    Reads the notebook globals ``H_20_grp_nws`` (one latent vector per row) and
    ``data_`` (the raw documents).

    Args:
        index: row of ``H_20_grp_nws`` to use as the query; the same number is
            used to pick the source text from ``data_``.
        topN: results 1 .. topN-1 of find_similar are returned, i.e. the first
            hit (normally the query itself) is skipped.

    Returns:
        ``(source, matches)`` where ``source`` is ``data_[index]`` and ``matches``
        is a list of ``(document, similarity)`` tuples. A list (rather than the
        lazy ``zip`` iterator of Python 3) so the notebook displays it and it can
        be read more than once.

    NOTE(review): the plotting cell maps labels through ``A.data_index``, which
    suggests that some documents are dropped during pre-processing. If so, row i
    of ``H_20_grp_nws`` is not ``data_[i]`` and the lookups below should go
    through ``A.data_index`` as well - confirm against TextLoader.
    """
    # Normalise once and reuse: the original normalised the whole matrix twice per call.
    normed_latents = find_norm(H_20_grp_nws)
    a, b = find_similar(normed_latents, normed_latents[index])
    source = data_[index]
    similar_results = [data_[i] for i in b[1:topN]]

    return source, list(zip(similar_results, a[1:topN]))

In [22]:
# Retrieve document 1797 together with its nearest neighbours in the latent space.
# NOTE(review): the recorded NameError means the cell that builds H_20_grp_nws had
# not completed in that kernel; it must run successfully before this one.
query , res = find_similiar_docs(1797)

NameError: name 'H_20_grp_nws' is not defined

In [None]:
# Display the text of the query document.
query

In [None]:
# Display the (document, similarity) pairs found for the query.
# If this shows a bare zip object, wrap it in list(...) to see the pairs.
res