In [None]:
# Models from Introduction to Algorithmic Marketing
# https://algorithmicweb.wordpress.com/
#
# Latent Semantic Analysis (LSA) is a method of text analysis
# that helps to identify concepts represented in the text as groups of related words.
# In this example, we both calculate the document representations in the
# concept space and score the documents against a query using cosine similarity in
# this space.

In [66]:
%matplotlib inline
import sympy as sy
import numpy as np
import matplotlib.pyplot as plt
from itertools import chain 
from collections import Counter

In [98]:
# Toy corpus: one lowercase, whitespace-separated string per document
docs = [
    "chicago chocolate retro candies made with love",
    "chocolate sweets and candies collection with mini love hearts",
    "retro sweets from chicago for chocolate lovers",
]

In [99]:
# Basic analyzer, driven by a single lookup table:
# - documents are split into words on whitespace
# - a word mapped to None is a stop word and is dropped
# - a word mapped to a string is replaced by that string (a minimal stemmer)
# - a word missing from the table is kept unchanged
analyzer = dict.fromkeys(["with", "for", "and", "from"])
analyzer.update({"lovers": "love", "hearts": "heart"})

# One list of analyzed tokens per document, in the same order as `docs`
bag_of_words_docs = [
    [token for token in (analyzer.get(raw, raw) for raw in text.split()) if token]
    for text in docs
]

In [153]:
# Create term frequency matrix
# Rows are the distinct terms of the corpus, columns are the documents;
# A[i][j] is the number of times term i occurs in document j.
# NOTE: the vocabulary must be built from `bag_of_words_docs` (the list produced
# by the analyzer cell); the name `bag_of_words` is not defined anywhere in this
# notebook and raised a NameError on a fresh kernel.
# The row order follows the iteration order of a set, so it is arbitrary.
unique_words = list(set(chain.from_iterable(bag_of_words_docs)))
word_freq = [Counter(d) for d in bag_of_words_docs]
A = np.array([[freq.get(word, 0) for freq in word_freq] for word in unique_words])
for i, word in enumerate(unique_words):
    print("%10s %s" % (word, str(A[i])))

     heart [0 1 0]
   candies [1 1 0]
collection [0 1 0]
      mini [0 1 0]
      love [1 1 1]
      made [1 0 0]
    sweets [0 1 1]
     retro [1 0 1]
   chicago [1 0 1]
 chocolate [1 1 1]


In [154]:
# Perform truncated SVD decomposition
# np.linalg.svd hands back the right factor already transposed, so the
# truncation keeps the leading columns of U and the leading rows of V.
U, s, V = np.linalg.svd(A, full_matrices=False)
truncate_rank = 2  # number of concepts (singular values) to keep
St = np.diag(s[:truncate_rank])
Ut = U[:, :truncate_rank]
Vt = V[:truncate_rank, :]
# Low-rank approximation of A rebuilt from the truncated factors
reconstruction = Ut.dot(St.dot(Vt))
print(Ut)
print(St)
print(Vt)
print(np.round(reconstruction))

[[-0.16776201 -0.40589961]
 [-0.33388781 -0.14894709]
 [-0.16776201 -0.40589961]
 [-0.16776201 -0.40589961]
 [-0.4857994   0.0183087 ]
 [-0.1661258   0.25695253]
 [-0.31967359 -0.23864383]
 [-0.31803739  0.42420831]
 [-0.31803739  0.42420831]
 [-0.4857994   0.0183087 ]]
[[ 3.56192303  0.        ]
 [ 0.          1.96587909]]
[[-0.59172732 -0.59755537 -0.54109736]
 [ 0.5051376  -0.79794957  0.32880465]]
[[-0.  1.  0.]
 [ 1.  1.  1.]
 [-0.  1.  0.]
 [-0.  1.  0.]
 [ 1.  1.  1.]
 [ 1. -0.  0.]
 [ 0.  1.  0.]
 [ 1.  0.  1.]
 [ 1.  0.  1.]
 [ 1.  1.  1.]]


In [158]:
# Project a query to the concept space and score documents
query = "chicago"
# Indicator vector over the vocabulary: 1 at the position of the query term
q = [1 if term == query else 0 for term in unique_words]
# Fold the query into the concept space: q * Ut * St^-1
qs = np.dot(q, Ut.dot(np.linalg.inv(St)))

def score(query_vec, doc_vec):
    """Return the cosine similarity between a query and a document vector.

    The result is the dot product of the two vectors divided by the product
    of their Euclidean norms.

    If either vector has zero norm (for instance a query containing no
    in-vocabulary term, which projects to the zero vector), the cosine is
    undefined; 0.0 is returned in that case instead of NaN.
    """
    norm_product = np.linalg.norm(query_vec) * np.linalg.norm(doc_vec)
    if norm_product == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning)
        return 0.0
    return np.dot(query_vec, doc_vec) / norm_product

# Column d of Vt holds document d's coordinates in the concept space
for d in range(len(docs)):
    doc_concepts = Vt[:, d]
    print("Document %s score: %s" % (d, score(qs, doc_concepts)))

Document 0 score: 0.890730150933
Document 1 score: -0.51043666768
Document 2 score: 0.806592806364
