In [1]:
from gensim import corpora, models, similarities

In [2]:
import logging


In [3]:
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
 documents = ["Shipment of gold damaged in a fire",
"Delivery of silver arrived in a silver truck",
"Shipment of gold arrived in a truck"]

In [5]:
texts = [[word for word in document.lower().split()] for document in 
documents] 

In [6]:
 print texts

[['shipment', 'of', 'gold', 'damaged', 'in', 'a', 'fire'], ['delivery', 'of', 'silver', 'arrived', 'in', 'a', 'silver', 'truck'], ['shipment', 'of', 'gold', 'arrived', 'in', 'a', 'truck']]


In [7]:
dictionary = corpora.Dictionary(texts)
print dictionary

Dictionary(11 unique tokens: [u'a', u'damaged', u'gold', u'fire', u'of']...)


In [10]:
print dictionary.token2id

{u'a': 0, u'damaged': 1, u'gold': 2, u'fire': 3, u'of': 4, u'delivery': 7, u'truck': 8, u'shipment': 5, u'in': 6, u'arrived': 9, u'silver': 10}


In [12]:
corpus = [dictionary.doc2bow(text) for text in texts]
print corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(0, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2)], [(0, 1), (2, 1), (4, 1), (5, 1), (6, 1), (8, 1), (9, 1)]]


In [13]:
tfidf = models.TfidfModel(corpus)

In [14]:
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print doc

[(1, 0.6633689723434505), (2, 0.2448297500958463), (3, 0.6633689723434505), (5, 0.2448297500958463)]
[(7, 0.4355066251613605), (8, 0.16073253746956623), (9, 0.16073253746956623), (10, 0.871013250322721)]
[(2, 0.5), (5, 0.5), (8, 0.5), (9, 0.5)]


In [15]:
print tfidf

TfidfModel(num_docs=3, num_nnz=21)


In [16]:
print tfidf.dfs

{0: 3, 1: 1, 2: 2, 3: 1, 4: 3, 5: 2, 6: 3, 7: 1, 8: 2, 9: 2, 10: 1}


In [17]:
print tfidf.idfs

{0: 0.0, 1: 1.5849625007211563, 2: 0.5849625007211562, 3: 1.5849625007211563, 4: 0.0, 5: 0.5849625007211562, 6: 0.0, 7: 1.5849625007211563, 8: 0.5849625007211562, 9: 0.5849625007211562, 10: 1.5849625007211563}


In [18]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics = 2 )

In [19]:
lsi.print_topics(2)

[u'-0.438*"gold" + -0.438*"shipment" + -0.366*"truck" + -0.366*"arrived" + -0.345*"damaged" + -0.345*"fire" + -0.297*"silver" + -0.149*"delivery" + 0.000*"a" + 0.000*"of"',
 u'-0.728*"silver" + -0.364*"delivery" + 0.364*"damaged" + 0.364*"fire" + 0.134*"gold" + 0.134*"shipment" + -0.134*"arrived" + -0.134*"truck" + -0.000*"a" + 0.000*"in"']

In [20]:
corpus_lsi = lsi[corpus_tfidf]
for doc in corpus_lsi:
    print doc


[(0, -0.67211468809878616), (1, 0.54880682119355884)]
[(0, -0.44124825208697815), (1, -0.83594920480339074)]
[(0, -0.80401378963792713)]


In [28]:
lda = models.LdaModel(corpus_tfidf, id2word= dictionary, num_topics = 2)
lda.print_topics(2)



[u'0.132*silver + 0.105*truck + 0.105*shipment + 0.100*fire + 0.099*arrived + 0.099*gold + 0.095*damaged + 0.092*delivery + 0.058*a + 0.058*of',
 u'0.114*gold + 0.108*damaged + 0.108*shipment + 0.103*arrived + 0.103*fire + 0.097*truck + 0.094*silver + 0.082*delivery + 0.064*a + 0.064*of']

In [21]:
index = similarities.MatrixSimilarity(lsi[corpus])



In [22]:
query = 'gold silver truck'
query_bow = dictionary.doc2bow(query.lower().split())
print query_bow

[(2, 1), (8, 1), (10, 1)]


In [23]:
query_lsi = lsi[query_bow]
print query_lsi

[(0, -1.101283574862848), (1, -0.72812283398049538)]


In [24]:
sims = index[query_lsi]
print list(enumerate(sims))


[(0, 0.40757114), (1, 0.93163693), (2, 0.83416492)]


In [25]:
sort_sims = sorted(enumerate(sims), key= lambda item: -item[1] )
print sort_sims

[(1, 0.93163693), (2, 0.83416492), (0, 0.40757114)]
