In [1]:
import numpy as np
from math import *
from scipy import linalg,dot
from numpy.linalg import norm

In [2]:
from scripts.text_preprocessing import *

In [3]:
from scripts.lsa.vector_space import VectorSpace
from scripts.lsa.tfidf import TFIDF
from scripts.lsa.lsa import LSA

In [4]:
'''
Original documents:
["The cat in the hat disabled", 
"A cat is a fine pet ponies.", 
"Dogs and cats make good pets.",
"I haven't got a hat."]
'''
tokenized_docs = readPickle('embeddings/examples_1/lsa/tokenized_docs')

In [5]:
tokenized_docs

[['the', 'cat', 'in', 'the', 'hat', 'disabled'],
 ['a', 'cat', 'is', 'a', 'fine', 'pet', 'ponies', '.'],
 ['dogs', 'and', 'cats', 'make', 'good', 'pets', '.'],
 ['i', 'have', "n't", 'got', 'a', 'hat', '.']]

In [6]:
vocab = readPickle('embeddings/examples_1/lsa/vocab')

In [7]:
vocab

{0: 'zerostart',
 1: 'a',
 2: 'and',
 3: 'good',
 4: 'dogs',
 5: "n't",
 6: 'pet',
 7: 'fine',
 8: 'is',
 9: 'cat',
 10: 'disabled',
 11: 'i',
 12: 'have',
 13: 'cats',
 14: 'pets',
 15: 'in',
 16: 'got',
 17: 'the',
 18: '.',
 19: 'hat',
 20: 'ponies',
 21: 'make',
 22: 'EOF',
 23: 'UNK'}

In [8]:
td_bow = readPickle('embeddings/examples_1/lsa/td_bow')

In [9]:
td_bow

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0],
 [0, 2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
 [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0]]

In [10]:
td_bow_sublin = readPickle('embeddings/examples_1/lsa/td_bow_sublin')

In [11]:
print(td_bow_sublin)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0, 1.0, 0, 1.6931471805599454, 0, 1.0, 0, 0, 0, 0], [0, 1.6931471805599454, 0, 0, 0, 0, 1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 1.0, 0, 0, 0], [0, 0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 1.0, 0, 0, 1.0, 0, 0], [0, 1.0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 1.0, 0, 1.0, 1.0, 0, 0, 0, 0]]


In [12]:
td_tfidf = readPickle('embeddings/examples_1/lsa/td_tfidf')

In [13]:
print(td_tfidf)

[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.51082562  0.91629073  0.          0.          0.
   0.          0.91629073  0.          1.83258146  0.          0.51082562
   0.          0.          0.          0.        ]
 [ 0.          1.02165125  0.          0.          0.          0.
   0.91629073  0.91629073  0.91629073  0.51082562  0.          0.          0.
   0.          0.          0.          0.          0.          0.22314355
   0.          0.91629073  0.          0.          0.        ]
 [ 0.          0.          0.91629073  0.91629073  0.91629073  0.          0.
   0.          0.          0.          0.          0.          0.
   0.91629073  0.91629073  0.          0.          0.          0.22314355
   0.          0.          0.91629073  0.          0.        ]
 [ 0.          0.51082562  0.          0.          0.          0.91629073
   0.          0.          0.          0.          0.          0.91629073
   0.9162

In [14]:
lsa_bow_u= readPickle('embeddings/examples_1/lsa/lsa_bow_u')
print("lsa_bow_u.shape: %s \n" %str(lsa_bow_u.shape))
print(lsa_bow_u)

lsa_bow_u.shape: (24, 24) 

[[  2.12725937e-17  -4.67366214e-18   7.58567501e-18  -1.45453718e-17
   -2.05998413e-18  -1.73472348e-18  -4.33680869e-19  -7.58941521e-19
   -4.33680869e-19  -3.53553391e-01  -3.53553391e-01  -1.73472348e-18
   -1.73472348e-18  -1.19262239e-18  -1.19262239e-18  -3.53553391e-01
   -1.73472348e-18  -7.07106781e-01   4.33680869e-18  -3.53553391e-01
   -7.58941521e-19  -1.40946282e-18   0.00000000e+00   0.00000000e+00]
 [ -5.83178680e-01   1.61775963e-01  -2.76918407e-01  -1.71928139e-02
    2.59430267e-02  -6.28937407e-02  -2.96408612e-01  -2.96408612e-01
   -2.96408612e-01  -2.51495818e-01   4.49127941e-02  -6.28937407e-02
   -6.28937407e-02   2.59430267e-02   2.59430267e-02   4.49127941e-02
   -6.28937407e-02   8.98255881e-02  -3.33359326e-01  -1.79809466e-02
   -2.96408612e-01   2.59430267e-02   0.00000000e+00   0.00000000e+00]
 [ -6.47780377e-02   1.20159696e-01   3.45670049e-01  -9.27456508e-02
   -3.70714629e-01  -1.34559753e-01   7.30463184e-02   7.304

In [15]:
lsa_bow_sigma= readPickle('embeddings/examples_1/lsa/lsa_bow_sigma')
print("lsa_bow_sigma.shape: %s \n" %str(lsa_bow_sigma.shape))
print(lsa_bow_sigma)

lsa_bow_sigma.shape: (4,) 

[ 3.55068156  2.78089472  2.57483534  2.24265646]


In [16]:
lsa_bow_vt= readPickle('embeddings/examples_1/lsa/lsa_bow_vt')
bow_v = np.transpose(lsa_bow_vt)
print("bow_v.shape: %s \n" %str(bow_v.shape))
print(bow_v)

bow_v.shape: (4, 4) 

[[-0.27992788 -0.91911441  0.24048242 -0.1379757 ]
 [-0.78095902  0.20482344 -0.38350477 -0.44841328]
 [-0.23000618  0.33415146  0.89004346 -0.20799663]
 [-0.50876375  0.04023503  0.05399023  0.85826898]]


In [17]:
svd_bow= readPickle('embeddings/examples_1/lsa/svd_bow')
svd_bow_t = np.transpose(svd_bow)
print("svd_bow_t.shape: %s \n" %str(svd_bow_t.shape))
print(svd_bow_t)

svd_bow_t.shape: (4, 24) 

[[ -4.85334346e-32   1.58293517e-15  -7.84095011e-16  -8.04911693e-16
   -7.94503352e-16   1.11022302e-16   7.00828284e-16   7.00828284e-16
    7.00828284e-16   1.00000000e+00   1.00000000e+00   1.11022302e-16
    1.11022302e-16  -7.94503352e-16  -7.94503352e-16   1.00000000e+00
    1.11022302e-16   2.00000000e+00   6.24500451e-17   1.00000000e+00
    7.00828284e-16  -7.94503352e-16   0.00000000e+00   0.00000000e+00]
 [ -5.45128399e-17   2.00000000e+00  -2.63677968e-15  -2.67841305e-15
   -2.69229083e-15   1.66533454e-16   1.00000000e+00   1.00000000e+00
    1.00000000e+00   1.00000000e+00  -3.46944695e-16   1.66533454e-16
    1.66533454e-16  -2.69229083e-15  -2.69229083e-15  -3.46944695e-16
    1.66533454e-16  -6.93889390e-16   1.00000000e+00  -2.22044605e-16
    1.00000000e+00  -2.69229083e-15   0.00000000e+00   0.00000000e+00]
 [  2.45328606e-18   1.28543010e-15   1.00000000e+00   1.00000000e+00
    1.00000000e+00   5.55111512e-17   6.24500451e-16   6.2450

In [18]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_bow_diag_sigma = readPickle('embeddings/examples_1/lsa/lsa_bow_diag_sigma')
wv_bow = dot(lsa_bow_u, lsa_bow_diag_sigma)
print("wv_bow.shape: %s \n" %str(wv_bow.shape))
print(wv_bow)

wv_bow.shape: (24, 4) 

[[  7.55322063e-17  -1.29969624e-17   1.95318641e-17  -3.26202720e-17]
 [ -2.07068178e+00   4.49881920e-01  -7.13019300e-01  -3.85575751e-02]
 [ -2.30006184e-01   3.34151464e-01   8.90043457e-01  -2.07996633e-01]
 [ -2.30006184e-01   3.34151464e-01   8.90043457e-01  -2.07996633e-01]
 [ -2.30006184e-01   3.34151464e-01   8.90043457e-01  -2.07996633e-01]
 [ -5.08763753e-01   4.02350327e-02   5.39902329e-02   8.58268979e-01]
 [ -7.80959015e-01   2.04823444e-01  -3.83504767e-01  -4.48413277e-01]
 [ -7.80959015e-01   2.04823444e-01  -3.83504767e-01  -4.48413277e-01]
 [ -7.80959015e-01   2.04823444e-01  -3.83504767e-01  -4.48413277e-01]
 [ -1.06088689e+00  -7.14290966e-01  -1.43022348e-01  -5.86388973e-01]
 [ -2.79927875e-01  -9.19114410e-01   2.40482418e-01  -1.37975696e-01]
 [ -5.08763753e-01   4.02350327e-02   5.39902329e-02   8.58268979e-01]
 [ -5.08763753e-01   4.02350327e-02   5.39902329e-02   8.58268979e-01]
 [ -2.30006184e-01   3.34151464e-01   8.90043457e-01 

In [19]:
# words: dogs, pet, cat, I, cats, pets, hat, ponies 
w=[4,6,9,11,13,14,19,20]
wv_bow_ = []
for i in range(len(wv_bow)):
    if i in w:
        wv_bow_.append(wv_bow[i])
wv_bow_

[array([-0.23000618,  0.33415146,  0.89004346, -0.20799663]),
 array([-0.78095902,  0.20482344, -0.38350477, -0.44841328]),
 array([-1.06088689, -0.71429097, -0.14302235, -0.58638897]),
 array([-0.50876375,  0.04023503,  0.05399023,  0.85826898]),
 array([-0.23000618,  0.33415146,  0.89004346, -0.20799663]),
 array([-0.23000618,  0.33415146,  0.89004346, -0.20799663]),
 array([-0.78869163, -0.87887938,  0.29447265,  0.72029328]),
 array([-0.78095902,  0.20482344, -0.38350477, -0.44841328])]

In [20]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_bow_sigma_vt = linalg.diagsvd(lsa_bow_sigma, len(lsa_bow_sigma), len(lsa_bow_vt))
dv_bow = dot(bow_v, lsa_bow_sigma_vt)
print(dv_bow)

[[-0.99393475 -2.55596041  0.61920263 -0.30943209]
 [-2.77293677  0.56959243 -0.98746163 -1.00563693]
 [-0.81667872  0.92924004  2.29171535 -0.46646499]
 [-1.80645808  0.11188939  0.13901596  1.92480247]]


In [21]:
# word similarity as cosine distance pair
bow_ut = np.transpose(lsa_bow_u) 
wsim_bow = dot(lsa_bow_u,bow_ut)
# cat and cats
print("cat and cats: %s" %(wsim_bow[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_bow[6,14]))


cat and cats: 3.12250225676e-17
pet and pets: 2.42861286637e-17


In [22]:
# document similarity as cosine distance pair
dsim_bow = dot(bow_v,lsa_bow_vt)
print(dsim_bow)

[[  1.00000000e+00   4.85722573e-17  -3.12250226e-17   1.24900090e-16]
 [  4.85722573e-17   1.00000000e+00  -2.49800181e-16   1.11022302e-16]
 [ -3.12250226e-17  -2.49800181e-16   1.00000000e+00   1.38777878e-16]
 [  1.24900090e-16   1.11022302e-16   1.38777878e-16   1.00000000e+00]]


In [23]:
lsa_bow_sublin_u= readPickle('embeddings/examples_1/lsa/lsa_bow_sublin_u')
print("lsa_bow_sublin_u.shape: %s \n" %str(lsa_bow_sublin_u.shape))
print(lsa_bow_sublin_u)

lsa_bow_sublin_u.shape: (24, 24) 

[[  2.65551009e-18   5.71252009e-19  -3.23833997e-18  -2.15317806e-17
    0.00000000e+00   9.54097912e-18  -1.95156391e-18   2.16840434e-18
   -1.95156391e-18  -3.81614146e-01  -3.81614146e-01   9.54097912e-18
    9.54097912e-18   0.00000000e+00   4.33680869e-19  -3.81614146e-01
    9.54097912e-18  -6.46128915e-01  -2.60208521e-18  -3.81614146e-01
    2.16840434e-18   4.33680869e-19   0.00000000e+00   0.00000000e+00]
 [ -5.28402380e-01  -2.72737857e-02  -3.16132369e-01   3.60652807e-03
    2.61031689e-02  -7.63962847e-02  -3.09897114e-01  -3.09897114e-01
   -3.09897114e-01  -2.53641455e-01   5.62556591e-02  -7.63962847e-02
   -7.63962847e-02   2.61031689e-02   2.61031689e-02   5.62556591e-02
   -7.63962847e-02   9.52491105e-02  -3.60190230e-01  -2.01406256e-02
   -3.09897114e-01   2.61031689e-02   0.00000000e+00   0.00000000e+00]
 [ -8.19007561e-02   2.86251309e-01   2.28808024e-01  -7.73712728e-02
   -3.71122681e-01  -1.34010736e-01   8.07705110e-02 

In [24]:
lsa_bow_sublin_sigma= readPickle('embeddings/examples_1/lsa/lsa_bow_sublin_sigma')
print("lsa_bow_sublin_sigma.shape: %s \n" %str(lsa_bow_sublin_sigma.shape))
print(lsa_bow_sublin_sigma)

lsa_bow_sublin_sigma.shape: (4,) 

[ 3.40841488  2.63401099  2.48644543  2.2351237 ]


In [25]:
lsa_bow_sublin_vt= readPickle('embeddings/examples_1/lsa/lsa_bow_sublin_vt')
print("lsa_bow_sublin_vt.shape: %s \n" %str(lsa_bow_sublin_vt.shape))
print(lsa_bow_sublin_vt)
lsa_bow_sublin_v = np.transpose(lsa_bow_sublin_vt)
print("\nlsa_bow_sublin_v.shape: %s \n" %str(lsa_bow_sublin_v.shape))
print(lsa_bow_sublin_v)

lsa_bow_sublin_vt.shape: (4, 4) 

[[-0.27132156 -0.73878879 -0.27915176 -0.55013637]
 [-0.65580347 -0.03621542  0.75398909 -0.01052142]
 [ 0.67969736 -0.46296686  0.56891867 -0.00217485]
 [-0.18525095 -0.48840685 -0.17293437  0.83500571]]

lsa_bow_sublin_v.shape: (4, 4) 

[[-0.27132156 -0.65580347  0.67969736 -0.18525095]
 [-0.73878879 -0.03621542 -0.46296686 -0.48840685]
 [-0.27915176  0.75398909  0.56891867 -0.17293437]
 [-0.55013637 -0.01052142 -0.00217485  0.83500571]]


In [26]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_bow_sublin_diag_sigma = readPickle('embeddings/examples_1/lsa/lsa_bow_sublin_diag_sigma')
wv_bow_sublin = dot(lsa_bow_sublin_u, lsa_bow_sublin_diag_sigma)
print("wv_bow_sublin.shape: %s \n" %str(wv_bow_sublin.shape))
print(wv_bow_sublin)

wv_bow_sublin.shape: (24, 4) 

[[  9.05108010e-18   1.50468407e-18  -8.05195561e-18  -4.81261933e-17]
 [ -1.80101453e+00  -7.18394514e-02  -7.86045885e-01   8.06103637e-03]
 [ -2.79151756e-01   7.53989094e-01   5.68918666e-01  -1.72934366e-01]
 [ -2.79151756e-01   7.53989094e-01   5.68918666e-01  -1.72934366e-01]
 [ -2.79151756e-01   7.53989094e-01   5.68918666e-01  -1.72934366e-01]
 [ -5.50136373e-01  -1.05214234e-02  -2.17484936e-03   8.35005713e-01]
 [ -7.38788793e-01  -3.62154151e-02  -4.62966861e-01  -4.88406847e-01]
 [ -7.38788793e-01  -3.62154151e-02  -4.62966861e-01  -4.88406847e-01]
 [ -7.38788793e-01  -3.62154151e-02  -4.62966861e-01  -4.88406847e-01]
 [ -1.01011035e+00  -6.92018884e-01   2.16730503e-01  -6.73657801e-01]
 [ -2.71321558e-01  -6.55803469e-01   6.79697364e-01  -1.85250953e-01]
 [ -5.50136373e-01  -1.05214234e-02  -2.17484936e-03   8.35005713e-01]
 [ -5.50136373e-01  -1.05214234e-02  -2.17484936e-03   8.35005713e-01]
 [ -2.79151756e-01   7.53989094e-01   5.689186

In [27]:
# words: dogs, pet, cat, I, cats, pets, hat, ponies 
w=[4,6,9,11,13,14,19,20]
wv_bow_sublin_ = []
for i in range(len(wv_bow_sublin)):
    if i in w:
        wv_bow_sublin_.append(wv_bow_sublin[i])
wv_bow_sublin_

[array([-0.27915176,  0.75398909,  0.56891867, -0.17293437]),
 array([-0.73878879, -0.03621542, -0.46296686, -0.48840685]),
 array([-1.01011035, -0.69201888,  0.2167305 , -0.6736578 ]),
 array([-0.55013637, -0.01052142, -0.00217485,  0.83500571]),
 array([-0.27915176,  0.75398909,  0.56891867, -0.17293437]),
 array([-0.27915176,  0.75398909,  0.56891867, -0.17293437]),
 array([-0.82145793, -0.66632489,  0.67752251,  0.64975476]),
 array([-0.73878879, -0.03621542, -0.46296686, -0.48840685])]

In [28]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_bow_sublin_sigma_vt = linalg.diagsvd(lsa_bow_sublin_sigma, len(lsa_bow_sublin_sigma), len(lsa_bow_sublin_vt))
dv_bow_sublin = dot(lsa_bow_sublin_v, lsa_bow_sublin_sigma_vt)
print(dv_bow_sublin)

[[-0.92477644 -1.72739355  1.69003041 -0.4140588 ]
 [-2.51809871 -0.0953918  -1.15114184 -1.09164972]
 [-0.951465    1.98601556  1.41458522 -0.3865297 ]
 [-1.875093   -0.02771354 -0.00540764  1.86634106]]


In [29]:
# word similarity as cosine distance pair
bow_sublin_ut = np.transpose(lsa_bow_sublin_u) 
wsim_bow_sublin = dot(lsa_bow_sublin_u,bow_sublin_ut)
# cat and cats
print("cat and cats: %s" %(wsim_bow_sublin[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_bow_sublin[6,14]))

cat and cats: 5.55111512313e-17
pet and pets: 1.90819582357e-17


In [30]:
# document similarity as cosine distance pair
dsim_bow_sublin = dot(lsa_bow_sublin_v,lsa_bow_sublin_vt)
print(dsim_bow_sublin)

[[  1.00000000e+00   6.93889390e-17   2.91433544e-16  -1.11022302e-16]
 [  6.93889390e-17   1.00000000e+00   2.22044605e-16  -4.44089210e-16]
 [  2.91433544e-16   2.22044605e-16   1.00000000e+00  -3.05311332e-16]
 [ -1.11022302e-16  -4.44089210e-16  -3.05311332e-16   1.00000000e+00]]


In [31]:
lsa_tfidf_u= readPickle('embeddings/examples_1/lsa/lsa_tfidf_u')
print("lsa_tfidf_u.shape: %s \n" %str(lsa_tfidf_u.shape))
print(lsa_tfidf_u)

lsa_tfidf_u.shape: (24, 24) 

[[ -3.94220184e-18  -1.82588637e-18   7.88858329e-18   5.56797392e-18
   -6.77626358e-21  -2.16840434e-18   3.03576608e-18   3.03576608e-18
    3.03576608e-18  -2.16649661e-01  -3.88614172e-01  -2.16840434e-18
   -2.16840434e-18  -6.77626358e-21  -6.77626358e-21  -3.88614172e-01
   -2.16840434e-18  -7.77228344e-01  -1.08420217e-18  -2.16649661e-01
    2.16840434e-18  -6.77626358e-21   0.00000000e+00   0.00000000e+00]
 [ -2.09397372e-01  -9.24610811e-02   4.57194413e-01  -3.51410306e-03
    1.94875644e-03  -9.17297111e-02  -4.00529300e-01  -4.00529300e-01
   -4.00529300e-01  -2.10411276e-01   2.31051985e-02  -9.17297111e-02
   -9.17297111e-02   1.94875644e-03   1.94875644e-03   2.31051985e-02
   -9.17297111e-02   4.62103970e-02  -1.19404865e-01  -3.82576820e-02
   -4.00529300e-01   1.94875644e-03   0.00000000e+00   0.00000000e+00]
 [ -1.79700535e-02  -3.94757769e-01  -9.41309391e-02  -7.17375552e-03
   -4.05118053e-01  -1.87283498e-01   4.80596715e-02   4.8

In [32]:
lsa_tfidf_sigma= readPickle('embeddings/examples_1/lsa/lsa_tfidf_sigma')
print("lsa_tfidf_sigma.shape: %s \n" %str(lsa_tfidf_sigma.shape))
print(lsa_tfidf_sigma)

lsa_tfidf_sigma.shape: (4,) 

[ 2.39550578  2.25832747  2.19756773  1.90311423]


In [33]:
lsa_tfidf_vt= readPickle('embeddings/examples_1/lsa/lsa_tfidf_vt')
print("lsa_tfidf_vt.shape: %s \n" %str(lsa_tfidf_vt.shape))
print(lsa_tfidf_vt)
lsa_tfidf_v = np.transpose(lsa_tfidf_vt)
print("\nlsa_tfidf_v.shape: %s \n" %str(lsa_tfidf_v.shape))
print(lsa_tfidf_v)

lsa_tfidf_vt.shape: (4, 4) 

[[-0.89542005 -0.36762642 -0.04698003 -0.24671164]
 [ 0.14115671 -0.16026622 -0.97293608 -0.08823211]
 [-0.41810275  0.79700175 -0.22575707  0.3728432 ]
 [-0.05905774 -0.45161241 -0.01489972  0.8901328 ]]

lsa_tfidf_v.shape: (4, 4) 

[[-0.89542005  0.14115671 -0.41810275 -0.05905774]
 [-0.36762642 -0.16026622  0.79700175 -0.45161241]
 [-0.04698003 -0.97293608 -0.22575707 -0.01489972]
 [-0.24671164 -0.08823211  0.3728432   0.8901328 ]]


In [34]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_tfidf_diag_sigma = readPickle('embeddings/examples_1/lsa/lsa_tfidf_diag_sigma')
wv_tfidf = dot(lsa_tfidf_u, lsa_tfidf_diag_sigma)
print("wv_tfidf.shape: %s \n" %str(wv_tfidf.shape))
print(wv_tfidf)

wv_tfidf.shape: (24, 4) 

[[ -9.44356728e-18  -4.12344934e-18   1.73356961e-17   1.05964904e-17]
 [ -5.01612615e-01  -2.08807399e-01   1.00471569e+00  -6.68773954e-03]
 [ -4.30473670e-02  -8.91492314e-01  -2.06859115e-01  -1.36524762e-02]
 [ -4.30473670e-02  -8.91492314e-01  -2.06859115e-01  -1.36524762e-02]
 [ -4.30473670e-02  -8.91492314e-01  -2.06859115e-01  -1.36524762e-02]
 [ -2.26059592e-01  -8.08462608e-02   3.41632767e-01   8.15620439e-01]
 [ -3.36852678e-01  -1.46850448e-01   7.30285315e-01  -4.13808268e-01]
 [ -3.36852678e-01  -1.46850448e-01   7.30285315e-01  -4.13808268e-01]
 [ -3.36852678e-01  -1.46850448e-01   7.30285315e-01  -4.13808268e-01]
 [ -6.45196497e-01  -9.76162408e-03   1.93551320e-01  -2.60863399e-01]
 [ -8.20465090e-01   1.29340587e-01  -3.83103671e-01  -5.41140598e-02]
 [ -2.26059592e-01  -8.08462608e-02   3.41632767e-01   8.15620439e-01]
 [ -2.26059592e-01  -8.08462608e-02   3.41632767e-01   8.15620439e-01]
 [ -4.30473670e-02  -8.91492314e-01  -2.06859115e-0

In [35]:
# words: dogs, pet, cat, I, cats, pets, hat, ponies 
w=[4,6,9,11,13,14,19,20]
wv_tfidf_ = []
for i in range(len(wv_tfidf)):
    if i in w:
        wv_tfidf_.append(wv_tfidf[i])
wv_tfidf_

[array([-0.04304737, -0.89149231, -0.20685911, -0.01365248]),
 array([-0.33685268, -0.14685045,  0.73028532, -0.41380827]),
 array([-0.6451965 , -0.00976162,  0.19355132, -0.2608634 ]),
 array([-0.22605959, -0.08084626,  0.34163277,  0.81562044]),
 array([-0.04304737, -0.89149231, -0.20685911, -0.01365248]),
 array([-0.04304737, -0.89149231, -0.20685911, -0.01365248]),
 array([-0.58343013,  0.02703524, -0.02311974,  0.42453444]),
 array([-0.33685268, -0.14685045,  0.73028532, -0.41380827])]

In [36]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_tfidf_sigma_vt = linalg.diagsvd(lsa_tfidf_sigma, len(lsa_tfidf_sigma), len(lsa_tfidf_vt))
dv_tfidf = dot(lsa_tfidf_v, lsa_tfidf_sigma_vt)
print(dv_tfidf)

[[-2.14498389  0.31877808 -0.9188091  -0.11239363]
 [-0.8806512  -0.3619336   1.75146533 -0.85947001]
 [-0.11254094 -2.19720828 -0.49611646 -0.02835587]
 [-0.59099917 -0.19925699  0.81934818  1.69402441]]


In [37]:
# word similarity as cosine distance pair
lsa_tfidf_ut = np.transpose(lsa_tfidf_u) 
wsim_tfidf = dot(lsa_tfidf_u,lsa_tfidf_ut)
# cat and cats
print("cat and cats: %s" %(wsim_tfidf[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_tfidf[6,14]))

cat and cats: 1.04083408559e-17
pet and pets: 1.73472347598e-17


In [38]:
print("%s \n"%wv_tfidf[6])
print("%s \n"%wv_tfidf[14])
sim = float(dot(wv_tfidf[6],wv_tfidf[14]) / (norm(wv_tfidf[6]) * norm(wv_tfidf[14])))
sim

[-0.33685268 -0.14685045  0.73028532 -0.41380827] 

[-0.04304737 -0.89149231 -0.20685911 -0.01365248] 



-4.3833556013462656e-15

In [39]:
# document similarity as cosine distance pair
dsim_tfidf = dot(lsa_tfidf_v,lsa_tfidf_vt)
print(dsim_tfidf)

[[  1.00000000e+00  -5.89805982e-17   5.69206141e-17   6.24500451e-17]
 [ -5.89805982e-17   1.00000000e+00   1.73472348e-17  -5.55111512e-17]
 [  5.69206141e-17   1.73472348e-17   1.00000000e+00  -1.50920942e-16]
 [  6.24500451e-17  -5.55111512e-17  -1.50920942e-16   1.00000000e+00]]
