In [1]:
import numpy as np
from math import *
from scipy import linalg,dot
from numpy.linalg import norm

In [2]:
from scripts.text_preprocessing import *

In [3]:
from scripts.lsa.vector_space import VectorSpace
from scripts.lsa.tfidf import TFIDF
from scripts.lsa.lsa import LSA

In [4]:
'''
Original documents:
["Machine learning is super fun", 
"Python is super, super cool", 
"Statistics is cool, too", 
"Data science is fun", 
"Python is great for machine learning", 
"I like football", 
"Football is great to watch"]
'''
tokenized_docs = readPickle('embeddings/examples_2/lsa/tokenized_docs')

In [5]:
tokenized_docs

[['machine', 'learning', 'is', 'super', 'fun'],
 ['python', 'is', 'super', ',', 'super', 'cool'],
 ['statistics', 'is', 'cool', ',', 'too'],
 ['data', 'science', 'is', 'fun'],
 ['python', 'is', 'great', 'for', 'machine', 'learning'],
 ['i', 'like', 'football'],
 ['football', 'is', 'great', 'to', 'watch']]

In [6]:
vocab = readPickle('embeddings/examples_2/lsa/vocab')

In [7]:
vocab

{0: 'zerostart',
 1: 'great',
 2: 'statistics',
 3: 'like',
 4: 'for',
 5: 'to',
 6: 'python',
 7: 'data',
 8: 'is',
 9: 'football',
 10: 'watch',
 11: ',',
 12: 'machine',
 13: 'i',
 14: 'too',
 15: 'learning',
 16: 'science',
 17: 'fun',
 18: 'super',
 19: 'cool',
 20: 'EOF',
 21: 'UNK'}

In [8]:
td_bow = readPickle('embeddings/examples_2/lsa/td_bow')

In [9]:
td_bow

[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [10]:
td_bow_sublin = readPickle('embeddings/examples_2/lsa/td_bow_sublin')

In [11]:
print(td_bow_sublin)

[[0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 1.0, 0, 0, 1.0, 0, 1.0, 1.0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1.0, 0, 1.0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 1.6931471805599454, 1.0, 0, 0], [0, 0, 1.0, 0, 0, 0, 0, 0, 1.0, 0, 0, 1.0, 0, 0, 1.0, 0, 0, 0, 0, 1.0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0], [0, 1.0, 0, 0, 1.0, 0, 1.0, 0, 1.0, 0, 0, 0, 1.0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1.0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1.0, 0, 0, 0, 1.0, 0, 0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [12]:
td_tfidf = readPickle('embeddings/examples_2/lsa/td_tfidf')

In [13]:
print(td_tfidf)

[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.13353139  0.          0.          0.          0.98082925
   0.          0.          0.98082925  0.          0.98082925  0.98082925
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.98082925  0.          0.13353139  0.          0.          0.98082925
   0.          0.          0.          0.          0.          0.
   1.96165851  0.98082925  0.          0.        ]
 [ 0.          0.          1.38629436  0.          0.          0.          0.
   0.          0.13353139  0.          0.          0.98082925  0.          0.
   1.38629436  0.          0.          0.          0.          0.98082925
   0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   1.38629436  0.13353139  0.          0.          0.          0.          0.
   0.          0.          1.38629436  0.98082925  0.          0.

In [14]:
lsa_bow_u= readPickle('embeddings/examples_2/lsa/lsa_bow_u')
print("lsa_bow_u.shape: %s \n" %str(lsa_bow_u.shape))
#print(lsa_bow_u)

lsa_bow_u.shape: (22, 22) 



In [15]:
lsa_bow_sigma= readPickle('embeddings/examples_2/lsa/lsa_bow_sigma')
print("lsa_bow_sigma.shape: %s \n" %str(lsa_bow_sigma.shape))
print(lsa_bow_sigma)

lsa_bow_sigma.shape: (7,) 

[ 3.78755781  2.48535744  2.16269002  1.94623096  1.78535076  1.52662673
  1.22241345]


In [16]:
lsa_bow_vt= readPickle('embeddings/examples_2/lsa/lsa_bow_vt')
bow_v = np.transpose(lsa_bow_vt)
print("bow_v.shape: %s \n" %str(bow_v.shape))
print(bow_v)

bow_v.shape: (7, 7) 

[[-0.450051    0.17118042 -0.43089551  0.16338565  0.23415854  0.15645152
  -0.69027404]
 [-0.59418705 -0.57455888 -0.00951096 -0.26683761  0.35586836 -0.23088785
   0.25608511]
 [-0.3412854  -0.36314658  0.41558187  0.22499578 -0.60943066  0.33991104
  -0.20340157]
 [-0.24612877  0.16673819 -0.07776513  0.82420387  0.09386756 -0.07585185
   0.4601032 ]
 [-0.43955804  0.49824628 -0.24774658 -0.40556556 -0.38965146  0.19106058
   0.37992787]
 [-0.02390495  0.1448182   0.38802233 -0.06885837  0.52607515  0.72654771
   0.13611321]
 [-0.27121585  0.46008764  0.65080196 -0.05424766  0.09862716 -0.4863589
  -0.20494639]]


In [17]:
svd_bow= readPickle('embeddings/examples_2/lsa/svd_bow')
svd_bow_t = np.transpose(svd_bow)
print("svd_bow_t.shape: %s \n" %str(svd_bow_t.shape))
#print(svd_bow_t)

svd_bow_t.shape: (7, 22) 



In [18]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_bow_diag_sigma = readPickle('embeddings/examples_2/lsa/lsa_bow_diag_sigma')
wv_bow = dot(lsa_bow_u, lsa_bow_diag_sigma)
print("wv_bow.shape: %s \n" %str(wv_bow.shape))
#print(wv_bow)

wv_bow.shape: (22, 7) 



In [19]:
'''
words: 2: 'statistics', 6: 'python', 7: 'data', 9: 'football', 10: 'watch',
12: 'machine', 15: 'learning', 16: 'science', 17: 'fun', 19: 'cool'
'''
  
w=[2,6,7,9,10,12,15,16,17,19]
wv_bow_ = []
for i in range(len(wv_bow)):
    if i in w:
        wv_bow_.append(wv_bow[i])
wv_bow_

[array([-0.3412854 , -0.36314658,  0.41558187,  0.22499578, -0.60943066,
         0.33991104, -0.20340157]),
 array([-1.03374509, -0.0763126 , -0.25725754, -0.67240317, -0.03378309,
        -0.03982727,  0.63601298]),
 array([-0.24612877,  0.16673819, -0.07776513,  0.82420387,  0.09386756,
        -0.07585185,  0.4601032 ]),
 array([-0.2951208 ,  0.60490584,  1.03882429, -0.12310603,  0.62470231,
         0.2401888 , -0.06883318]),
 array([-0.27121585,  0.46008764,  0.65080196, -0.05424766,  0.09862716,
        -0.4863589 , -0.20494639]),
 array([-0.88960904,  0.6694267 , -0.6786421 , -0.24217991, -0.15549292,
         0.3475121 , -0.31034617]),
 array([-0.88960904,  0.6694267 , -0.6786421 , -0.24217991, -0.15549292,
         0.3475121 , -0.31034617]),
 array([-0.24612877,  0.16673819, -0.07776513,  0.82420387,  0.09386756,
        -0.07585185,  0.4601032 ]),
 array([-0.69617977,  0.33791861, -0.50866064,  0.98758952,  0.3280261 ,
         0.08059967, -0.23017084]),
 array([-0.93547245

In [20]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_bow_sigma_vt = linalg.diagsvd(lsa_bow_sigma, len(lsa_bow_sigma), len(lsa_bow_vt))
dv_bow = dot(bow_v, lsa_bow_sigma_vt)
print(dv_bow)

[[-1.7045942   0.42544453 -0.93189343  0.31798621  0.41805512  0.23884307
  -0.84380027]
 [-2.25051781 -1.42798418 -0.02056925 -0.51932761  0.63534985 -0.35247956
   0.31304188]
 [-1.29263817 -0.90254905  0.89877477  0.43789376 -1.08804749  0.51891728
  -0.24864082]
 [-0.93222694  0.41440399 -0.16818187  1.60409108  0.16758652 -0.11579747
   0.56243634]
 [-1.66485147  1.23832009 -0.53579906 -0.78932425 -0.69566452  0.29167819
   0.46442894]
 [-0.09054138  0.35992498  0.83917202 -0.1340143   0.93922866  1.10916715
   0.16638662]
 [-1.0272457   1.14348224  1.40748291 -0.10557847  0.17608408 -0.7424885
  -0.25052922]]


In [21]:
# word similarity as cosine distance pair
bow_ut = np.transpose(lsa_bow_u) 
wsim_bow = dot(lsa_bow_u,bow_ut)
# cat and cats
print("cat and cats: %s" %(wsim_bow[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_bow[6,14]))


cat and cats: -6.93889390391e-17
pet and pets: 1.38777878078e-16


In [22]:
# document similarity as cosine distance pair
dsim_bow = dot(bow_v,lsa_bow_vt)
print(dsim_bow)

[[  1.00000000e+00  -1.94289029e-16   2.77555756e-17  -1.11022302e-16
   -2.77555756e-16   2.77555756e-17  -8.32667268e-17]
 [ -1.94289029e-16   1.00000000e+00  -1.24900090e-16  -2.08166817e-16
    1.38777878e-16   2.42861287e-16   2.42861287e-16]
 [  2.77555756e-17  -1.24900090e-16   1.00000000e+00   2.63677968e-16
    2.77555756e-17  -2.42861287e-17  -4.16333634e-17]
 [ -1.11022302e-16  -2.08166817e-16   2.63677968e-16   1.00000000e+00
   -2.77555756e-16  -2.49800181e-16  -4.16333634e-17]
 [ -2.77555756e-16   1.38777878e-16   2.77555756e-17  -2.77555756e-16
    1.00000000e+00   1.31838984e-16  -6.93889390e-17]
 [  2.77555756e-17   2.42861287e-16  -2.42861287e-17  -2.49800181e-16
    1.31838984e-16   1.00000000e+00  -7.32053307e-16]
 [ -8.32667268e-17   2.42861287e-16  -4.16333634e-17  -4.16333634e-17
   -6.93889390e-17  -7.32053307e-16   1.00000000e+00]]


In [23]:
lsa_bow_sublin_u= readPickle('embeddings/examples_2/lsa/lsa_bow_sublin_u')
print("lsa_bow_sublin_u.shape: %s \n" %str(lsa_bow_sublin_u.shape))
#print(lsa_bow_sublin_u)

lsa_bow_sublin_u.shape: (22, 22) 



In [24]:
lsa_bow_sublin_sigma= readPickle('embeddings/examples_2/lsa/lsa_bow_sublin_sigma')
print("lsa_bow_sublin_sigma.shape: %s \n" %str(lsa_bow_sublin_sigma.shape))
print(lsa_bow_sublin_sigma)

lsa_bow_sublin_sigma.shape: (7,) 

[ 3.71761946  2.42512682  2.16184114  1.93473228  1.71953552  1.50538111
  1.23494615]


In [25]:
lsa_bow_sublin_vt= readPickle('embeddings/examples_2/lsa/lsa_bow_sublin_vt')
print("lsa_bow_sublin_vt.shape: %s \n" %str(lsa_bow_sublin_vt.shape))
print(lsa_bow_sublin_vt)
lsa_bow_sublin_v = np.transpose(lsa_bow_sublin_vt)
print("\nlsa_bow_sublin_v.shape: %s \n" %str(lsa_bow_sublin_v.shape))
print(lsa_bow_sublin_v)

lsa_bow_sublin_vt.shape: (7, 7) 

[[-0.45693118 -0.5428162  -0.35212092 -0.26147774 -0.46719828 -0.02697564
  -0.29189519]
 [ 0.17760369 -0.54181256 -0.48093786  0.12372076  0.47444865  0.14779846
   0.42584284]
 [-0.4324054  -0.01675527  0.3500877  -0.09745159 -0.2173718   0.4082388
   0.68321095]
 [ 0.17962248 -0.23246256  0.10612664  0.83060045 -0.46040339  0.01908938
   0.01418701]
 [ 0.26799889  0.40830423 -0.52865997 -0.0668377  -0.31947279  0.61217109
  -0.02644431]
 [-0.09906438  0.32966008 -0.42172989  0.03703535 -0.24766232 -0.64544847
   0.47364797]
 [ 0.67734408 -0.28649961  0.2314042  -0.45944505 -0.36606417 -0.13801244
   0.20355565]]

lsa_bow_sublin_v.shape: (7, 7) 

[[-0.45693118  0.17760369 -0.4324054   0.17962248  0.26799889 -0.09906438
   0.67734408]
 [-0.5428162  -0.54181256 -0.01675527 -0.23246256  0.40830423  0.32966008
  -0.28649961]
 [-0.35212092 -0.48093786  0.3500877   0.10612664 -0.52865997 -0.42172989
   0.2314042 ]
 [-0.26147774  0.12372076 -0.09745159  0.8

In [26]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_bow_sublin_diag_sigma = readPickle('embeddings/examples_2/lsa/lsa_bow_sublin_diag_sigma')
wv_bow_sublin = dot(lsa_bow_sublin_u, lsa_bow_sublin_diag_sigma)
print("wv_bow_sublin.shape: %s \n" %str(wv_bow_sublin.shape))
#print(wv_bow_sublin)

wv_bow_sublin.shape: (22, 7) 



In [27]:
'''
words: 2: 'statistics', 6: 'python', 7: 'data', 9: 'football', 10: 'watch',
12: 'machine', 15: 'learning', 16: 'science', 17: 'fun', 19: 'cool'
'''
  
w=[2,6,7,9,10,12,15,16,17,19]
wv_bow_sublin_ = []
for i in range(len(wv_bow_sublin)):
    if i in w:
        wv_bow_sublin_.append(wv_bow_sublin[i])
wv_bow_sublin_

[array([-0.35212092, -0.48093786,  0.3500877 ,  0.10612664, -0.52865997,
        -0.42172989,  0.2314042 ]),
 array([-1.01001448, -0.06736391, -0.23412707, -0.69286595,  0.08883144,
         0.08199776, -0.65256378]),
 array([-0.26147774,  0.12372076, -0.09745159,  0.83060045, -0.0668377 ,
         0.03703535, -0.45944505]),
 array([-0.31887083,  0.5736413 ,  1.09144975,  0.03327639,  0.58572678,
        -0.1718005 ,  0.06554321]),
 array([-0.29189519,  0.42584284,  0.68321095,  0.01418701, -0.02644431,
         0.47364797,  0.20355565]),
 array([-0.92412946,  0.65205234, -0.6497772 , -0.28078091, -0.05147391,
        -0.3467267 ,  0.31127991]),
 array([-0.92412946,  0.65205234, -0.6497772 , -0.28078091, -0.05147391,
        -0.3467267 ,  0.31127991]),
 array([-0.26147774,  0.12372076, -0.09745159,  0.83060045, -0.0668377 ,
         0.03703535, -0.45944505]),
 array([-0.71840892,  0.30132445, -0.52985699,  1.01022293,  0.20116118,
        -0.06202903,  0.21789903]),
 array([-0.89493712

In [28]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_bow_sublin_sigma_vt = linalg.diagsvd(lsa_bow_sublin_sigma, len(lsa_bow_sublin_sigma), len(lsa_bow_sublin_vt))
dv_bow_sublin = dot(lsa_bow_sublin_v, lsa_bow_sublin_sigma_vt)
print(dv_bow_sublin)

[[-1.69869626  0.43071147 -0.93479178  0.34752141  0.4608336  -0.14912965
   0.83648347]
 [-2.01798409 -1.31396417 -0.03622222 -0.44975282  0.70209363  0.49626406
  -0.35381159]
 [-1.30905157 -1.1663353   0.756834    0.20532664 -0.9090496  -0.63486422
   0.28577173]
 [-0.97207472  0.30003853 -0.21067486  1.6069895  -0.11492981  0.05575232
  -0.5673899 ]
 [-1.73686541  1.15059815 -0.4699233  -0.8907573  -0.54934481 -0.37282617
  -0.45206954]
 [-0.10028517  0.35843     0.88254743  0.03693283  1.05264994 -0.97164594
  -0.17043793]
 [-1.08515522  1.0327229   1.47699353  0.02744808 -0.04547193  0.71302071
   0.25138026]]


In [29]:
# word similarity as cosine distance pair
bow_sublin_ut = np.transpose(lsa_bow_sublin_u) 
wsim_bow_sublin = dot(lsa_bow_sublin_u,bow_sublin_ut)
# cat and cats
print("cat and cats: %s" %(wsim_bow_sublin[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_bow_sublin[6,14]))

cat and cats: 1.11022302463e-16
pet and pets: 8.32667268469e-17


In [30]:
# document similarity as cosine distance pair
dsim_bow_sublin = dot(lsa_bow_sublin_v,lsa_bow_sublin_vt)
print(dsim_bow_sublin)

[[  1.00000000e+00  -1.38777878e-16   8.32667268e-17   0.00000000e+00
   -8.32667268e-17  -6.93889390e-17   5.55111512e-17]
 [ -1.38777878e-16   1.00000000e+00   5.55111512e-17  -3.88578059e-16
   -5.55111512e-17   1.59594560e-16  -2.91433544e-16]
 [  8.32667268e-17   5.55111512e-17   1.00000000e+00   2.77555756e-16
   -4.44089210e-16  -1.24900090e-16  -2.08166817e-16]
 [  0.00000000e+00  -3.88578059e-16   2.77555756e-16   1.00000000e+00
   -6.66133815e-16   9.71445147e-17   2.49800181e-16]
 [ -8.32667268e-17  -5.55111512e-17  -4.44089210e-16  -6.66133815e-16
    1.00000000e+00  -1.31838984e-16   1.24900090e-16]
 [ -6.93889390e-17   1.59594560e-16  -1.24900090e-16   9.71445147e-17
   -1.31838984e-16   1.00000000e+00   3.08780779e-16]
 [  5.55111512e-17  -2.91433544e-16  -2.08166817e-16   2.49800181e-16
    1.24900090e-16   3.08780779e-16   1.00000000e+00]]


In [31]:
lsa_tfidf_u= readPickle('embeddings/examples_2/lsa/lsa_tfidf_u')
print("lsa_tfidf_u.shape: %s \n" %str(lsa_tfidf_u.shape))
#print(lsa_tfidf_u)

lsa_tfidf_u.shape: (22, 22) 



In [32]:
lsa_tfidf_sigma= readPickle('embeddings/examples_2/lsa/lsa_tfidf_sigma')
print("lsa_tfidf_sigma.shape: %s \n" %str(lsa_tfidf_sigma.shape))
print(lsa_tfidf_sigma)

lsa_tfidf_sigma.shape: (7,) 

[ 3.08454001  2.64481454  2.45079196  2.22750758  2.08499286  1.96085847
  1.39142926]


In [33]:
lsa_tfidf_vt= readPickle('embeddings/examples_2/lsa/lsa_tfidf_vt')
print("lsa_tfidf_vt.shape: %s \n" %str(lsa_tfidf_vt.shape))
print(lsa_tfidf_vt)
lsa_tfidf_v = np.transpose(lsa_tfidf_vt)
print("\nlsa_tfidf_v.shape: %s \n" %str(lsa_tfidf_v.shape))
print(lsa_tfidf_v)

lsa_tfidf_vt.shape: (7, 7) 

[[ -4.05730863e-01  -6.98854221e-01  -3.68985689e-01  -9.09319147e-02
   -4.30874157e-01  -2.60328182e-02  -1.27420350e-01]
 [  1.52268423e-01  -2.89938285e-01  -4.45886649e-01   7.17438916e-02
    4.97609709e-01   2.67198088e-01   6.08093569e-01]
 [  3.41139486e-01  -5.69665733e-02  -4.70933136e-01   2.70982900e-01
    3.05602477e-01  -4.38478710e-01  -5.47277313e-01]
 [ -1.29324075e-01   3.40553560e-02  -6.47208574e-02  -8.82412230e-01
    3.39242114e-01  -2.86407625e-01  -4.64848753e-02]
 [ -1.30003035e-01  -3.48164550e-01   4.62375706e-01   2.44245317e-01
    1.85515860e-01  -6.71926104e-01   3.20207378e-01]
 [ -3.33393022e-04   4.46430555e-01  -4.46750430e-01   3.17202261e-04
   -4.46596929e-01  -4.48455910e-01   4.47830415e-01]
 [ -8.13753666e-01   3.20335475e-01  -1.60282902e-01   2.73578599e-01
    3.52688249e-01   3.22051928e-02  -9.60631062e-02]]

lsa_tfidf_v.shape: (7, 7) 

[[ -4.05730863e-01   1.52268423e-01   3.41139486e-01  -1.29324075e-01
   

In [34]:
# words similarity w.r.t document context
# column is principal component or dimension (context) in which we can measure similarity of words in context (rows of matrix)
lsa_tfidf_diag_sigma = readPickle('embeddings/examples_2/lsa/lsa_tfidf_diag_sigma')
wv_tfidf = dot(lsa_tfidf_u, lsa_tfidf_diag_sigma)
print("wv_tfidf.shape: %s \n" %str(wv_tfidf.shape))
print(wv_tfidf)

wv_tfidf.shape: (22, 7) 

[[ -1.17621194e-16   5.39944796e-17   3.80128630e-17  -9.04971395e-17
    5.18059376e-17  -1.26249877e-17   9.07948352e-17]
 [ -5.47591585e-01   1.08450612e+00  -2.37041749e-01   2.87144863e-01
    4.96028145e-01   1.20983838e-03   2.51705448e-01]
 [ -5.11522781e-01  -6.18130147e-01  -6.52851951e-01  -8.97221596e-02
    6.40988834e-01  -6.19327602e-01  -2.22199283e-01]
 [ -3.60891491e-02   3.70415203e-01  -6.07860563e-01  -3.97045276e-01
   -9.31487370e-01  -6.21691899e-01   4.46458772e-02]
 [ -5.97318414e-01   6.89833533e-01   4.23654991e-01   4.70289429e-01
    2.57179590e-01  -6.19114805e-01   4.88929731e-01]
 [ -1.76642113e-01   8.42996685e-01  -7.58687453e-01  -6.44417205e-02
    4.43901682e-01   6.20824778e-01  -1.33171742e-01]
 [ -1.10807064e+00   2.03690208e-01   2.43869368e-01   3.66141078e-01
   -1.59530593e-01  -1.63184439e-04   6.60121357e-01]
 [ -1.26058401e-01   9.94581524e-02   3.75662066e-01  -1.22328310e+00
    3.38595905e-01   4.39735706e-04 

In [35]:
'''
words: 2: 'statistics', 6: 'python', 7: 'data', 9: 'football', 10: 'watch',
12: 'machine', 15: 'learning', 16: 'science', 17: 'fun', 19: 'cool'
'''
  
w=[2,6,7,9,10,12,15,16,17,19]
wv_tfidf_ = []
for i in range(len(wv_tfidf)):
    if i in w:
        wv_tfidf_.append(wv_tfidf[i])
wv_tfidf_

[array([-0.51152278, -0.61813015, -0.65285195, -0.08972216,  0.64098883,
        -0.6193276 , -0.22219928]),
 array([ -1.10807064e+00,   2.03690208e-01,   2.43869368e-01,
          3.66141078e-01,  -1.59530593e-01,  -1.63184439e-04,
          6.60121357e-01]),
 array([ -1.26058401e-01,   9.94581524e-02,   3.75662066e-01,
         -1.22328310e+00,   3.38595905e-01,   4.39735706e-04,
          3.79260469e-01]),
 array([ -1.50511357e-01,   8.58511662e-01,  -9.66858343e-01,
         -3.26510703e-01,  -3.44976016e-01,  -6.13504200e-04,
         -6.26337095e-02]),
 array([-0.17664211,  0.84299669, -0.75868745, -0.06444172,  0.44390168,
         0.62082478, -0.13317174]),
 array([-0.82056668,  0.63741948,  0.63434344,  0.20589375,  0.0544486 ,
        -0.43836233, -0.45222645]),
 array([-0.82056668,  0.63741948,  0.63434344,  0.20589375,  0.0544486 ,
        -0.43836233, -0.45222645]),
 array([ -1.26058401e-01,   9.94581524e-02,   3.75662066e-01,
         -1.22328310e+00,   3.38595905e-01,   

In [36]:
# document similarity w.r.t. grouped context (principal components)
# column is principal component or dimension (context) in which we can measure similarity of documents in context (rows of matrix)
lsa_tfidf_sigma_vt = linalg.diagsvd(lsa_tfidf_sigma, len(lsa_tfidf_sigma), len(lsa_tfidf_vt))
dv_tfidf = dot(lsa_tfidf_v, lsa_tfidf_sigma_vt)
print(dv_tfidf)

[[ -1.25149308e+00   4.02721738e-01   8.36061909e-01  -2.88070358e-01
   -2.71055399e-01  -6.53736529e-04  -1.13228066e+00]
 [ -2.15564381e+00  -7.66832990e-01  -1.39613220e-01   7.58585639e-02
   -7.25920600e-01   8.75387134e-01   4.45724153e-01]
 [ -1.13815112e+00  -1.17928749e+00  -1.15415914e+00  -1.44166201e-01
    9.64050044e-01  -8.76014363e-01  -2.23022320e-01]
 [ -2.80483129e-01   1.89749288e-01   6.64122711e-01  -1.96557994e+00
    5.09249741e-01   6.21988739e-04   3.80665268e-01]
 [ -1.32904858e+00   1.31608539e+00   7.48968093e-01   7.55664381e-01
    3.86799242e-01  -8.75713369e-01   4.90740750e-01]
 [ -8.02992695e-02   7.06689389e-01  -1.07462010e+00  -6.37975158e-01
   -1.40096113e+00  -8.79358567e-01   4.48112476e-02]
 [ -3.93033169e-01   1.60829471e+00  -1.34126284e+00  -1.03545412e-01
    6.67630095e-01   8.78132059e-01  -1.33665017e-01]]


In [37]:
# word similarity as cosine distance pair
lsa_tfidf_ut = np.transpose(lsa_tfidf_u) 
wsim_tfidf = dot(lsa_tfidf_u,lsa_tfidf_ut)
# cat and cats
print("cat and cats: %s" %(wsim_tfidf[4,9]))
# pet and pets
print("pet and pets: %s" %(wsim_tfidf[6,14]))

cat and cats: -1.38777878078e-17
pet and pets: 9.71445146547e-17


In [38]:
print("%s \n"%wv_tfidf[6])
print("%s \n"%wv_tfidf[14])
sim = float(dot(wv_tfidf[6],wv_tfidf[14]) / (norm(wv_tfidf[6]) * norm(wv_tfidf[14])))
sim

[ -1.10807064e+00   2.03690208e-01   2.43869368e-01   3.66141078e-01
  -1.59530593e-01  -1.63184439e-04   6.60121357e-01] 

[-0.51152278 -0.61813015 -0.65285195 -0.08972216  0.64098883 -0.6193276
 -0.22219928] 



-1.5877388040638392e-16

In [39]:
# document similarity as cosine distance pair
dsim_tfidf = dot(lsa_tfidf_v,lsa_tfidf_vt)
print(dsim_tfidf)

[[  1.00000000e+00  -1.11022302e-16  -2.49800181e-16   5.55111512e-17
    2.22044605e-16  -1.38777878e-17   2.77555756e-17]
 [ -1.11022302e-16   1.00000000e+00  -3.46944695e-17  -3.74700271e-16
   -3.46944695e-16  -1.09287579e-16   4.51028104e-17]
 [ -2.49800181e-16  -3.46944695e-17   1.00000000e+00   2.15105711e-16
    3.33066907e-16  -8.93382590e-17  -5.20417043e-17]
 [  5.55111512e-17  -3.74700271e-16   2.15105711e-16   1.00000000e+00
    2.63677968e-16   1.04083409e-17  -1.94289029e-16]
 [  2.22044605e-16  -3.46944695e-16   3.33066907e-16   2.63677968e-16
    1.00000000e+00   1.30104261e-16  -4.85722573e-17]
 [ -1.38777878e-17  -1.09287579e-16  -8.93382590e-17   1.04083409e-17
    1.30104261e-16   1.00000000e+00  -2.94902991e-17]
 [  2.77555756e-17   4.51028104e-17  -5.20417043e-17  -1.94289029e-16
   -4.85722573e-17  -2.94902991e-17   1.00000000e+00]]


In [42]:
w2v_1 = readPickle('embeddings/examples_2/w2v/w2v_embed1')
w2v_1.shape

(22, 7)

In [43]:
w2v_2 = readPickle('embeddings/examples_2/w2v/w2v_embed2')
w2v_2.shape

(22, 7)

In [46]:
avg_embed1 = readPickle('embeddings/examples_2/w2v/avg_embed1')
avg_embed1.shape

(7, 7)

In [47]:
avg_embed2 = readPickle('embeddings/examples_2/w2v/avg_embed2')
avg_embed2.shape

(7, 7)