In [1]:
import numpy as np
from math import *
from scipy import linalg,dot
from numpy.linalg import norm

In [2]:
from scripts.text_preprocessing import *

In [3]:
from scripts.lsa.vector_space import VectorSpace
from scripts.lsa.tfidf import TFIDF
from scripts.lsa.lsa import LSA

In [4]:
'''
Original documents:
["Dogs eat the same things that cats eat", 
"No dog is a mouse", "Mice eat little things", 
"Cats often play with rats and mice", 
"Cats often play, but not with other cats"]
'''
tokenized_docs = readPickle('embeddings/examples_3/lsa/tokenized_docs')

In [5]:
tokenized_docs

[['dogs', 'eat', 'the', 'same', 'things', 'that', 'cats', 'eat'],
 ['no', 'dog', 'is', 'a', 'mouse'],
 ['mice', 'eat', 'little', 'things'],
 ['cats', 'often', 'play', 'with', 'rats', 'and', 'mice'],
 ['cats', 'often', 'play', ',', 'but', 'not', 'with', 'other', 'cats']]

In [6]:
vocab = readPickle('embeddings/examples_3/lsa/vocab')

In [7]:
vocab

{0: 'zerostart',
 1: 'and',
 2: 'rats',
 3: 'often',
 4: 'is',
 5: 'mouse',
 6: 'little',
 7: 'no',
 8: 'things',
 9: ',',
 10: 'same',
 11: 'cats',
 12: 'dogs',
 13: 'play',
 14: 'that',
 15: 'but',
 16: 'not',
 17: 'mice',
 18: 'with',
 19: 'eat',
 20: 'a',
 21: 'dog',
 22: 'the',
 23: 'other',
 24: 'EOF',
 25: 'UNK'}

In [8]:
td_bow = readPickle('embeddings/examples_3/lsa/td_bow')

In [9]:
print(td_bow)

[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]]


In [10]:
td_bow_sublin = readPickle('embeddings/examples_3/lsa/td_bow_sublin')

In [11]:
print(td_bow_sublin)

[[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 1.0, 1.0, 1.0, 0, 1.0, 0, 0, 0, 0, 1.6931471805599454, 0, 0, 1.0, 0, 0, 0], [0, 0, 0, 0, 1.0, 1.0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1.0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 1.0, 0, 0, 0, 0, 0, 0], [0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 1.0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1.0, 0, 0, 0, 0, 0, 1.0, 0, 1.6931471805599454, 0, 1.0, 0, 1.0, 1.0, 0, 1.0, 0, 0, 0, 0, 1.0, 0, 0]]


In [12]:
td_tfidf = readPickle('embeddings/examples_3/lsa/td_tfidf')

In [13]:
print(td_tfidf)

[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.69314718  0.          1.09861229  0.40546511  1.09861229
   0.          1.09861229  0.          0.          0.          0.
   1.38629436  0.          0.          1.09861229  0.          0.          0.        ]
 [ 0.          0.          0.          0.          1.09861229  1.09861229
   0.          1.09861229  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   1.09861229  1.09861229  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   1.09861229  0.          0.69314718  0.          0.          0.          0.
   0.          0.          0.          0.          0.69314718  0.
   0.69314718  0.          0.          0.          0.          0.          0.        ]
 [ 0.          1.09861229  1.09861229  0.69314718  0.          0.          0.
   0.          

In [14]:
lsa_bow_u= readPickle('embeddings/examples_3/lsa/lsa_bow_u')
print("lsa_bow_u.shape: %s \n" %str(lsa_bow_u.shape))
#print(lsa_bow_u)

lsa_bow_u.shape: (26, 26) 



In [15]:
lsa_bow_sigma= readPickle('embeddings/examples_3/lsa/lsa_bow_sigma')
print("lsa_bow_sigma.shape: %s \n" %str(lsa_bow_sigma.shape))
print(lsa_bow_sigma)

lsa_bow_sigma.shape: (5,) 

[ 3.9465473   3.17222503  2.23606798  2.03926436  1.4843024 ]


In [16]:
lsa_bow_vt= readPickle('embeddings/examples_3/lsa/lsa_bow_vt')
bow_v = np.transpose(lsa_bow_vt)
print("bow_v.shape: %s \n" %str(bow_v.shape))
print(bow_v)

bow_v.shape: (5, 5) 

[[ -4.36291561e-01   8.10487196e-01  -0.00000000e+00  -2.12629270e-01
    3.27946600e-01]
 [  1.11022302e-16   0.00000000e+00  -1.00000000e+00  -5.55111512e-16
   -3.33066907e-16]
 [ -1.56003695e-01   3.59555393e-01  -2.22044605e-16   4.82523826e-01
   -7.83296575e-01]
 [ -4.96904838e-01  -2.51473057e-01  -6.66133815e-16   7.14415671e-01
    4.23623810e-01]
 [ -7.33756161e-01  -3.88061498e-01   3.33066907e-16  -4.59967173e-01
   -3.15341673e-01]]


In [17]:
svd_bow= readPickle('embeddings/examples_3/lsa/svd_bow')
svd_bow_t = np.transpose(svd_bow)
print("svd_bow_t.shape: %s \n" %str(svd_bow_t.shape))
#print(svd_bow_t)

svd_bow_t.shape: (5, 26) 



In [18]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_bow_diag_sigma = readPickle('embeddings/examples_3/lsa/lsa_bow_diag_sigma')
wv_bow = dot(lsa_bow_u, lsa_bow_diag_sigma)
print("wv_bow.shape: %s \n" %str(wv_bow.shape))
#print(wv_bow)

wv_bow.shape: (26, 5) 



In [19]:
'''
words: 
2: 'rats'
5: 'mouse',
11: 'cats',
12: 'dogs',
13: 'play',
17: 'mice',
19: 'eat',
21: 'dog',
'''
  
w=[2,5,11,12,13,17,19,21]
wv_bow_ = []
for i in range(len(wv_bow)):
    if i in w:
        wv_bow_.append(wv_bow[i])
wv_bow_

[array([ -4.96904838e-01,  -2.51473057e-01,  -9.89653958e-16,
          7.14415671e-01,   4.23623810e-01]),
 array([ -1.48118066e-16,  -1.33023088e-16,  -1.00000000e+00,
         -7.52707810e-16,  -1.72757327e-16]),
 array([ -2.40070872e+00,  -2.17108857e-01,   2.01330109e-16,
         -4.18147945e-01,   1.20887065e-01]),
 array([ -4.36291561e-01,   8.10487196e-01,   5.44763182e-17,
         -2.12629270e-01,   3.27946600e-01]),
 array([ -1.23066100e+00,  -6.39534555e-01,  -4.21400084e-16,
          2.54448498e-01,   1.08282138e-01]),
 array([ -6.52908533e-01,   1.08082336e-01,  -1.22019295e-15,
          1.19693950e+00,  -3.59672764e-01]),
 array([ -1.02858682e+00,   1.98052978e+00,  -1.21586355e-16,
          5.72652855e-02,  -1.27403375e-01]),
 array([ -1.48118066e-16,  -1.33023088e-16,  -1.00000000e+00,
         -7.52707810e-16,  -1.72757327e-16])]

In [20]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_bow_sigma_vt = linalg.diagsvd(lsa_bow_sigma, len(lsa_bow_sigma), len(lsa_bow_vt))
dv_bow = dot(bow_v, lsa_bow_sigma_vt)
print(dv_bow)

[[ -1.72184528e+00   2.57104777e+00   0.00000000e+00  -4.33607293e-01
    4.86771927e-01]
 [  4.38154768e-16   0.00000000e+00  -2.23606798e+00  -1.13201912e-15
   -4.94372011e-16]
 [ -6.15675960e-01   1.14059062e+00  -4.96506831e-16   9.83993642e-01
   -1.16264899e+00]
 [ -1.96105845e+00  -7.97729126e-01  -1.48952049e-15   1.45688242e+00
    6.28785840e-01]
 [ -2.89580340e+00  -1.23101840e+00   7.44760246e-16  -9.37994662e-01
   -4.68062403e-01]]


In [21]:
# word similarity as cosine distance pair
bow_ut = np.transpose(lsa_bow_u) 
wsim_bow = dot(lsa_bow_u,bow_ut)
# cat and cats
print("cat and cats: %s" %(wsim_bow[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_bow[6,14]))


cat and cats: -1.38777878078e-17
pet and pets: 5.55111512313e-17


In [22]:
# document similarity as cosine distance pair
dsim_bow = dot(bow_v,lsa_bow_vt)
print(dsim_bow)

[[  1.00000000e+00  -3.96332977e-17   1.66533454e-16   1.66533454e-16
    1.11022302e-16]
 [ -3.96332977e-17   1.00000000e+00   1.97760352e-16   7.32908596e-17
   -5.41672572e-17]
 [  1.66533454e-16   1.97760352e-16   1.00000000e+00   0.00000000e+00
   -2.77555756e-17]
 [  1.66533454e-16   7.32908596e-17   0.00000000e+00   1.00000000e+00
    3.60822483e-16]
 [  1.11022302e-16  -5.41672572e-17  -2.77555756e-17   3.60822483e-16
    1.00000000e+00]]


In [23]:
lsa_bow_sublin_u= readPickle('embeddings/examples_2/lsa/lsa_bow_sublin_u')
print("lsa_bow_sublin_u.shape: %s \n" %str(lsa_bow_sublin_u.shape))
#print(lsa_bow_sublin_u)

lsa_bow_sublin_u.shape: (22, 22) 



In [24]:
lsa_bow_sublin_sigma= readPickle('embeddings/examples_2/lsa/lsa_bow_sublin_sigma')
print("lsa_bow_sublin_sigma.shape: %s \n" %str(lsa_bow_sublin_sigma.shape))
print(lsa_bow_sublin_sigma)

lsa_bow_sublin_sigma.shape: (7,) 

[ 3.71761946  2.42512682  2.16184114  1.93473228  1.71953552  1.50538111
  1.23494615]


In [25]:
lsa_bow_sublin_vt= readPickle('embeddings/examples_2/lsa/lsa_bow_sublin_vt')
print("lsa_bow_sublin_vt.shape: %s \n" %str(lsa_bow_sublin_vt.shape))
print(lsa_bow_sublin_vt)
lsa_bow_sublin_v = np.transpose(lsa_bow_sublin_vt)
print("\nlsa_bow_sublin_v.shape: %s \n" %str(lsa_bow_sublin_v.shape))
print(lsa_bow_sublin_v)

lsa_bow_sublin_vt.shape: (7, 7) 

[[-0.45693118 -0.5428162  -0.35212092 -0.26147774 -0.46719828 -0.02697564
  -0.29189519]
 [ 0.17760369 -0.54181256 -0.48093786  0.12372076  0.47444865  0.14779846
   0.42584284]
 [-0.4324054  -0.01675527  0.3500877  -0.09745159 -0.2173718   0.4082388
   0.68321095]
 [ 0.17962248 -0.23246256  0.10612664  0.83060045 -0.46040339  0.01908938
   0.01418701]
 [ 0.26799889  0.40830423 -0.52865997 -0.0668377  -0.31947279  0.61217109
  -0.02644431]
 [-0.09906438  0.32966008 -0.42172989  0.03703535 -0.24766232 -0.64544847
   0.47364797]
 [ 0.67734408 -0.28649961  0.2314042  -0.45944505 -0.36606417 -0.13801244
   0.20355565]]

lsa_bow_sublin_v.shape: (7, 7) 

[[-0.45693118  0.17760369 -0.4324054   0.17962248  0.26799889 -0.09906438
   0.67734408]
 [-0.5428162  -0.54181256 -0.01675527 -0.23246256  0.40830423  0.32966008
  -0.28649961]
 [-0.35212092 -0.48093786  0.3500877   0.10612664 -0.52865997 -0.42172989
   0.2314042 ]
 [-0.26147774  0.12372076 -0.09745159  0.8

In [26]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_bow_sublin_diag_sigma = readPickle('embeddings/examples_2/lsa/lsa_bow_sublin_diag_sigma')
wv_bow_sublin = dot(lsa_bow_sublin_u, lsa_bow_sublin_diag_sigma)
print("wv_bow_sublin.shape: %s \n" %str(wv_bow_sublin.shape))
#print(wv_bow_sublin)

wv_bow_sublin.shape: (22, 7) 



In [27]:
'''
words: 
2: 'rats'
5: 'mouse',
11: 'cats',
12: 'dogs',
13: 'play',
17: 'mice',
19: 'eat',
21: 'dog',
'''
  
w=[2,5,11,12,13,17,19,21]
wv_bow_sublin_ = []
for i in range(len(wv_bow_sublin)):
    if i in w:
        wv_bow_sublin_.append(wv_bow_sublin[i])
wv_bow_sublin_

[array([-0.35212092, -0.48093786,  0.3500877 ,  0.10612664, -0.52865997,
        -0.42172989,  0.2314042 ]),
 array([-0.29189519,  0.42584284,  0.68321095,  0.01418701, -0.02644431,
         0.47364797,  0.20355565]),
 array([-0.89493712, -1.02275042,  0.33333244, -0.12633592, -0.12035574,
        -0.09206981, -0.05509541]),
 array([-0.92412946,  0.65205234, -0.6497772 , -0.28078091, -0.05147391,
        -0.3467267 ,  0.31127991]),
 array([-0.02697564,  0.14779846,  0.4082388 ,  0.01908938,  0.61217109,
        -0.64544847, -0.13801244]),
 array([-0.71840892,  0.30132445, -0.52985699,  1.01022293,  0.20116118,
        -0.06202903,  0.21789903]),
 array([-0.89493712, -1.02275042,  0.33333244, -0.12633592, -0.12035574,
        -0.09206981, -0.05509541]),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.])]

In [28]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_bow_sublin_sigma_vt = linalg.diagsvd(lsa_bow_sublin_sigma, len(lsa_bow_sublin_sigma), len(lsa_bow_sublin_vt))
dv_bow_sublin = dot(lsa_bow_sublin_v, lsa_bow_sublin_sigma_vt)
print(dv_bow_sublin)

[[-1.69869626  0.43071147 -0.93479178  0.34752141  0.4608336  -0.14912965
   0.83648347]
 [-2.01798409 -1.31396417 -0.03622222 -0.44975282  0.70209363  0.49626406
  -0.35381159]
 [-1.30905157 -1.1663353   0.756834    0.20532664 -0.9090496  -0.63486422
   0.28577173]
 [-0.97207472  0.30003853 -0.21067486  1.6069895  -0.11492981  0.05575232
  -0.5673899 ]
 [-1.73686541  1.15059815 -0.4699233  -0.8907573  -0.54934481 -0.37282617
  -0.45206954]
 [-0.10028517  0.35843     0.88254743  0.03693283  1.05264994 -0.97164594
  -0.17043793]
 [-1.08515522  1.0327229   1.47699353  0.02744808 -0.04547193  0.71302071
   0.25138026]]


In [29]:
# word similarity as cosine distance pair
bow_sublin_ut = np.transpose(lsa_bow_sublin_u) 
wsim_bow_sublin = dot(lsa_bow_sublin_u,bow_sublin_ut)
# cat and cats
print("cat and cats: %s" %(wsim_bow_sublin[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_bow_sublin[6,14]))

cat and cats: 1.11022302463e-16
pet and pets: 8.32667268469e-17


In [30]:
# document similarity as cosine distance pair
dsim_bow_sublin = dot(lsa_bow_sublin_v,lsa_bow_sublin_vt)
print(dsim_bow_sublin)

[[  1.00000000e+00  -1.38777878e-16   8.32667268e-17   0.00000000e+00
   -8.32667268e-17  -6.93889390e-17   5.55111512e-17]
 [ -1.38777878e-16   1.00000000e+00   5.55111512e-17  -3.88578059e-16
   -5.55111512e-17   1.59594560e-16  -2.91433544e-16]
 [  8.32667268e-17   5.55111512e-17   1.00000000e+00   2.77555756e-16
   -4.44089210e-16  -1.24900090e-16  -2.08166817e-16]
 [  0.00000000e+00  -3.88578059e-16   2.77555756e-16   1.00000000e+00
   -6.66133815e-16   9.71445147e-17   2.49800181e-16]
 [ -8.32667268e-17  -5.55111512e-17  -4.44089210e-16  -6.66133815e-16
    1.00000000e+00  -1.31838984e-16   1.24900090e-16]
 [ -6.93889390e-17   1.59594560e-16  -1.24900090e-16   9.71445147e-17
   -1.31838984e-16   1.00000000e+00   3.08780779e-16]
 [  5.55111512e-17  -2.91433544e-16  -2.08166817e-16   2.49800181e-16
    1.24900090e-16   3.08780779e-16   1.00000000e+00]]


In [31]:
lsa_tfidf_u= readPickle('embeddings/examples_3/lsa/lsa_tfidf_u')
print("lsa_tfidf_u.shape: %s \n" %str(lsa_tfidf_u.shape))
#print(lsa_tfidf_u)

lsa_tfidf_u.shape: (26, 26) 



In [32]:
lsa_tfidf_sigma= readPickle('embeddings/examples_3/lsa/lsa_tfidf_sigma')
print("lsa_tfidf_sigma.shape: %s \n" %str(lsa_tfidf_sigma.shape))
print(lsa_tfidf_sigma)

lsa_tfidf_sigma.shape: (5,) 

[ 2.87170563  2.7232046   2.45657176  1.91891303  1.45769741]


In [33]:
lsa_tfidf_vt= readPickle('embeddings/examples_3/lsa/lsa_tfidf_vt')
print("lsa_tfidf_vt.shape: %s \n" %str(lsa_tfidf_vt.shape))
print(lsa_tfidf_vt)
lsa_tfidf_v = np.transpose(lsa_tfidf_vt)
print("\nlsa_tfidf_v.shape: %s \n" %str(lsa_tfidf_v.shape))
print(lsa_tfidf_v)

lsa_tfidf_vt.shape: (5, 5) 

[[ -6.50970531e-01   2.22044605e-16  -1.98190949e-01  -3.56469862e-01
   -6.40224143e-01]
 [  7.09117686e-01   2.35922393e-16   1.84107459e-01  -3.00454720e-01
   -6.10723760e-01]
 [ -0.00000000e+00  -1.00000000e+00   1.11022302e-16  -7.77156117e-16
    1.11022302e-16]
 [ -9.76078840e-02  -5.82867088e-16   2.57950719e-01   8.47925183e-01
   -4.52721782e-01]
 [  2.52709667e-01  -2.22044605e-16  -9.27516155e-01   2.52347147e-01
   -1.10329161e-01]]

lsa_tfidf_v.shape: (5, 5) 

[[ -6.50970531e-01   7.09117686e-01  -0.00000000e+00  -9.76078840e-02
    2.52709667e-01]
 [  2.22044605e-16   2.35922393e-16  -1.00000000e+00  -5.82867088e-16
   -2.22044605e-16]
 [ -1.98190949e-01   1.84107459e-01   1.11022302e-16   2.57950719e-01
   -9.27516155e-01]
 [ -3.56469862e-01  -3.00454720e-01  -7.77156117e-16   8.47925183e-01
    2.52347147e-01]
 [ -6.40224143e-01  -6.10723760e-01   1.11022302e-16  -4.52721782e-01
   -1.10329161e-01]]


In [34]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_tfidf_diag_sigma = readPickle('embeddings/examples_3/lsa/lsa_tfidf_diag_sigma')
wv_tfidf = dot(lsa_tfidf_u, lsa_tfidf_diag_sigma)
print("wv_tfidf.shape: %s \n" %str(wv_tfidf.shape))
print(wv_tfidf)

wv_tfidf.shape: (26, 5) 

[[  1.75970182e-17   1.42257986e-17   5.64785223e-32  -5.86966218e-17
   -1.72604324e-17]
 [ -3.91622171e-01  -3.30083248e-01  -9.51479423e-16   9.31541026e-01
    2.77231677e-01]
 [ -3.91622171e-01  -3.30083248e-01  -9.51479423e-16   9.31541026e-01
    2.77231677e-01]
 [ -6.90855640e-01  -6.31580795e-01  -5.62343992e-16   2.73934123e-01
    9.84393668e-02]
 [  3.87311840e-16   4.33155244e-16  -1.09861229e+00  -5.63546648e-16
   -2.90191907e-16]
 [  2.98845895e-16   3.48765652e-16  -1.09861229e+00  -6.26103583e-16
   -3.05437151e-16]
 [ -2.17735012e-01   2.02262717e-01   3.76590784e-16   2.83387830e-01
   -1.01898065e+00]
 [  2.98845895e-16   3.48765652e-16  -1.09861229e+00  -6.26103583e-16
   -3.05437151e-16]
 [ -5.88593885e-01   6.19136491e-01   2.02946318e-16   1.11141184e-01
   -4.67740214e-01]
 [ -7.03358111e-01  -6.70948628e-01   6.01852824e-17  -4.97365713e-01
   -1.21208972e-01]
 [ -7.15164225e-01   7.79045404e-01  -5.49284808e-17  -1.07233221e-01
    

In [35]:
'''
words: 
2: 'rats'
5: 'mouse',
11: 'cats',
12: 'dogs',
13: 'play',
17: 'mice',
19: 'eat',
21: 'dog',
'''
  
w=[2,5,11,12,13,17,19,21]
wv_tfidf_ = []
for i in range(len(wv_tfidf)):
    if i in w:
        wv_tfidf_.append(wv_tfidf[i])
wv_tfidf_

[array([ -3.91622171e-01,  -3.30083248e-01,  -9.51479423e-16,
          9.31541026e-01,   2.77231677e-01]),
 array([  2.98845895e-16,   3.48765652e-16,  -1.09861229e+00,
         -6.26103583e-16,  -3.05437151e-16]),
 array([ -9.27659031e-01,  -3.29555777e-01,  -3.27010019e-16,
         -6.28982879e-02,   1.15313666e-01]),
 array([ -7.15164225e-01,   7.79045404e-01,  -5.49284808e-17,
         -1.07233221e-01,   2.77629946e-01]),
 array([ -6.90855640e-01,  -6.31580795e-01,  -5.62343992e-16,
          2.73934123e-01,   9.84393668e-02]),
 array([ -3.84461577e-01,  -8.06457759e-02,  -3.62714347e-16,
          7.66534764e-01,  -4.67991494e-01]),
 array([ -1.03981227e+00,   1.11065942e+00,   1.68290305e-16,
          4.34845546e-02,  -2.92575221e-01]),
 array([  2.98845895e-16,   3.48765652e-16,  -1.09861229e+00,
         -6.26103583e-16,  -3.05437151e-16])]

In [36]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_tfidf_sigma_vt = linalg.diagsvd(lsa_tfidf_sigma, len(lsa_tfidf_sigma), len(lsa_tfidf_vt))
dv_tfidf = dot(lsa_tfidf_v, lsa_tfidf_sigma_vt)
print(dv_tfidf)

[[ -1.86939574e+00   1.93107255e+00   0.00000000e+00  -1.87301040e-01
    3.68374228e-01]
 [  6.37646743e-16   6.42464946e-16  -2.45657176e+00  -1.11847125e-15
   -3.23673846e-16]
 [ -5.69146063e-01   5.01362281e-01   2.72734253e-16   4.94984996e-01
   -1.35203790e+00]
 [ -1.02367651e+00  -8.18199678e-01  -1.90913977e-15   1.62709468e+00
    3.67845783e-01]
 [ -1.83853528e+00  -1.66312575e+00   2.72734253e-16  -8.68733726e-01
   -1.60826532e-01]]


In [37]:
# word similarity as cosine distance pair
lsa_tfidf_ut = np.transpose(lsa_tfidf_u) 
wsim_tfidf = dot(lsa_tfidf_u,lsa_tfidf_ut)
# cat and cats
print("cat and cats: %s" %(wsim_tfidf[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_tfidf[6,14]))

cat and cats: 4.16333634234e-17
pet and pets: 2.42861286637e-17


In [38]:
print("%s \n"%wv_tfidf[6])
print("%s \n"%wv_tfidf[14])
sim = float(dot(wv_tfidf[6],wv_tfidf[14]) / (norm(wv_tfidf[6]) * norm(wv_tfidf[14])))
sim

[ -2.17735012e-01   2.02262717e-01   3.76590784e-16   2.83387830e-01
  -1.01898065e+00] 

[ -7.15164225e-01   7.79045404e-01  -5.49284808e-17  -1.07233221e-01
   2.77629946e-01] 



4.599295664821215e-17

In [39]:
# document similarity as cosine distance pair
dsim_tfidf = dot(lsa_tfidf_v,lsa_tfidf_vt)
print(dsim_tfidf)

[[  1.00000000e+00   2.35318519e-17  -5.55111512e-17   1.38777878e-17
    1.17961196e-16]
 [  2.35318519e-17   1.00000000e+00  -5.59954874e-17   7.68599063e-17
   -1.08889408e-16]
 [ -5.55111512e-17  -5.59954874e-17   1.00000000e+00   2.77555756e-17
    1.38777878e-16]
 [  1.38777878e-17   7.68599063e-17   2.77555756e-17   1.00000000e+00
   -2.49800181e-16]
 [  1.17961196e-16  -1.08889408e-16   1.38777878e-16  -2.49800181e-16
    1.00000000e+00]]


In [40]:
w2v_1 = readPickle('embeddings/examples_3/w2v/w2v_embed1')
w2v_1.shape

(26, 5)

In [41]:
w2v_2 = readPickle('embeddings/examples_3/w2v/w2v_embed2')
w2v_2.shape

(26, 5)

In [42]:
avg_embed1 = readPickle('embeddings/examples_3/w2v/avg_embed1')
avg_embed1.shape

(5, 5)

In [43]:
avg_embed2 = readPickle('embeddings/examples_3/w2v/avg_embed2')
avg_embed2.shape

(5, 5)