In [1]:
import logging

# Send INFO-and-above log records to the root logger, each prefixed with a
# timestamp and its severity level.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

### Similarity Interface
Determine similarity between a specific document and a set of other documents.

In [2]:
import os

from gensim import corpora, models, similarities

# The dictionary and corpus files are expected in ./tmp; run the
# 'DemoGensim_CorporaAndVectorSpaces' notebook first to create them.
# Only change directory when the files are not already in the working
# directory, so that re-running this cell does not try to enter tmp/tmp
# and fail.
if not os.path.exists('deerwester.dict'):
    os.chdir("tmp")
dictionary = corpora.Dictionary.load('deerwester.dict')
corpus = corpora.MmCorpus('deerwester.mm')
print(corpus)

MmCorpus(9 documents, 12 features, 28 non-zero entries)


First, use this tiny corpus to define a 2-D LSI space:

In [3]:
# Fit an LSI model with two topics (a 2-D latent space) on the corpus.
lsi = models.LsiModel(
    corpus,
    id2word=dictionary,
    num_topics=2,
)

Suppose we want to sort the nine corpus documents in decreasing order of relevance to the query "Human computer interaction". First, convert the query into the LSI space:

In [4]:
# Query text -> lowercase whitespace tokens -> bag-of-words -> LSI vector.
doc = "Human computer interaction"
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[vec_bow]  # the query expressed in the LSI space
print(vec_lsi)

[(0, 0.4618210045327158), (1, -0.070027665279000062)]


### Initializing Query Structures
Prepare for similarity queries.

In [5]:
# Project the corpus into the LSI space, then build the similarity index over it.
corpus_lsi = lsi[corpus]
index = similarities.MatrixSimilarity(corpus_lsi)



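Index persistence is handled via the standard `save()` and `load()` methods. Iterating over the loaded index yields one row per indexed document: its similarity to each of the nine documents in the index.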

In [6]:
# Round-trip the index through disk, then print one row per indexed document:
# that document's similarity to every document in the index.
index_path = "deerwester.index"
index.save(index_path)
index = similarities.MatrixSimilarity.load(index_path)
for doc_sims in index:
    print(list(doc_sims))

[1.0, 0.91421592, 0.999982, 0.99478287, 0.87990767, -0.18518142, -0.16756734, -0.16003224, -0.011704311]
[0.91421592, 1.0, 0.91662991, 0.86810708, 0.99696732, 0.22892299, 0.24630509, 0.25370079, 0.39449945]
[0.999982, 0.91662991, 1.0, 0.99415314, 0.88274169, -0.17928416, -0.16165146, -0.154109, -0.0057068467]
[0.99478287, 0.86810708, 0.99415314, 1.0, 0.82684523, -0.28446588, -0.26726568, -0.25989753, -0.11365126]
[0.87990767, 0.99696732, 0.88274169, 0.82684523, 1.0, 0.30398417, 0.32098264, 0.32820731, 0.46481341]
[-0.18518142, 0.22892299, -0.17928416, -0.28446588, 0.30398417, 0.99999994, 0.99983984, 0.99967402, 0.98480445]
[-0.16756734, 0.24630509, -0.16165146, -0.26726568, 0.32098264, 0.99983984, 1.0, 0.99997079, 0.9877544]
[-0.16003224, 0.25370079, -0.154109, -0.25989753, 0.32820731, 0.99967402, 0.99997079, 1.0, 0.98891723]
[-0.011704311, 0.39449945, -0.0057068467, -0.11365126, 0.46481341, 0.98480445, 0.9877544, 0.98891723, 1.0]


### Performing Queries
To obtain similarities of the query document against the nine index documents:

In [7]:
# Query the index with the LSI-space query vector.
sims = index[vec_lsi]
# Show every score paired with its document number: (document_number, document_similarity).
print([(doc_id, score) for doc_id, score in enumerate(sims)])

[(0, 0.99809301), (1, 0.93748635), (2, 0.99844527), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.098794639), (8, 0.050041765)]


Sort these similarities into descending order.

In [8]:
from pprint import pprint

# Rank a fresh query result rather than `sims` itself. The original rebinding
# `sims = sorted(enumerate(sims), ...)` made this cell fail on a second run,
# because `sims` would then already hold (doc, score) tuples and negating a
# tuple raises TypeError. `sims` still ends up bound to the sorted list of
# (document_number, similarity_score) 2-tuples, highest score first.
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
pprint(sims)

[(2, 0.99844527),
 (0, 0.99809301),
 (3, 0.9865886),
 (1, 0.93748635),
 (4, 0.90755945),
 (8, 0.050041765),
 (7, -0.098794639),
 (6, -0.10639259),
 (5, -0.12416792)]
