Gensim uses Python's standard `logging` module to report events at various priority levels. The cells below turn logging on so that these messages are shown.

In [1]:
import logging 

In [2]:
# Show log messages of level INFO and above, each formatted as
# "<time> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


In [3]:
from gensim import corpora, models, similarities

2017-04-13 10:55:14,499 : INFO : 'pattern' package not found; tag filters are not available for English


In [16]:
# Toy corpus: 9 documents (one inner list per document) over 12 features
# (feature ids 0 to 11). Each document is a sparse list of
# (feature_id, value) pairs; features that are absent are simply omitted.
corpus = [
    [(0, 1.0), (1, 1.0), (2, 1.0)],
    [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
    [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
    [(0, 1.0), (4, 2.0), (7, 1.0)],
    [(3, 1.0), (5, 1.0), (6, 1.0)],
    [(9, 1.0)],
    [(9, 1.0), (10, 1.0)],
    [(9, 1.0), (10, 1.0), (11, 1.0)],
    [(8, 1.0), (10, 1.0), (11, 1.0)],
]

In [19]:
# Fit the TF-IDF transformation: this pass over `corpus` collects the
# document frequencies from which the IDF weights are calculated.
tfidf = models.TfidfModel(corpus)

2017-04-13 13:42:00,138 : INFO : collecting document frequencies
2017-04-13 13:42:00,140 : INFO : PROGRESS: processing document #0
2017-04-13 13:42:00,165 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [20]:
# A query document in the same sparse form as the corpus:
# (feature_id, value) pairs for features 0 and 4.
vec = [(0, 1), (4, 1)]
# Indexing the fitted model with a vector applies the transformation and
# gives back (feature_id, weight) pairs.
print(tfidf[vec])

[(0, 0.8075244024440723), (4, 0.5898341626740045)]


In [21]:
# Transform the whole corpus via TF-IDF and index it, in preparation for
# similarity queries. num_features=12 matches the feature ids 0..11 used by
# the corpus defined above.
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)

2017-04-13 13:42:37,761 : INFO : creating sparse index
2017-04-13 13:42:37,816 : INFO : creating sparse matrix from corpus
2017-04-13 13:42:37,851 : INFO : PROGRESS: at document #0
2017-04-13 13:42:38,485 : INFO : created <9x12 sparse matrix of type '<class 'numpy.float32'>'
	with 28 stored elements in Compressed Sparse Row format>


To query the similarity of our query vector `vec` against every document in the corpus:

In [22]:
# Score the TF-IDF-weighted query against every indexed document.
sims = index[tfidf[vec]]
# enumerate() pairs each score with the position of its document in the corpus.
print(list(enumerate(sims)))

[(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


# Corpora and Vector Spaces

In [23]:
# Toy corpus of nine short documents, each given as a plain string.
# (The pasted ">>>" interpreter prompts were removed so the cell is valid
# Python on its own.)
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [31]:
from collections import defaultdict
from pprint import pprint  # pretty-printer

# remove common words and tokenize: lowercase each document, split it on
# whitespace and drop every token that is in the stoplist
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# count how often each token occurs across the whole corpus
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# remove words that appear only once
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [32]:
# Build the token -> integer id mapping from the tokenized documents.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
print(dictionary)

2017-04-13 14:55:45,501 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-04-13 14:55:45,519 : INFO : built Dictionary(12 unique tokens: ['response', 'system', 'minors', 'computer', 'time']...) from 9 documents (total 29 corpus positions)
2017-04-13 14:55:45,543 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
2017-04-13 14:55:45,911 : INFO : saved /tmp/deerwester.dict


Dictionary(12 unique tokens: ['response', 'system', 'minors', 'computer', 'time']...)


In [33]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{'response': 5, 'system': 7, 'minors': 11, 'computer': 0, 'time': 4, 'survey': 6, 'interface': 1, 'trees': 9, 'human': 2, 'user': 3, 'graph': 10, 'eps': 8}


In [34]:
new_doc = "Human computer interaction"
# doc2bow() converts the tokens into sparse (token_id, count) pairs. The ids
# come from dictionary.token2id, so the exact numbers printed depend on that
# mapping.
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[(0, 1), (2, 1)]


[(0, 1), (1, 1)]

In [37]:
# Convert every tokenized document into its sparse bag-of-words vector.
# NOTE: this rebinds `corpus`, replacing the hand-written corpus defined
# earlier in the notebook.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
print(corpus)

2017-04-13 14:58:54,708 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2017-04-13 14:58:54,710 : INFO : saving sparse matrix to /tmp/deerwester.mm
2017-04-13 14:58:54,712 : INFO : PROGRESS: saving document #0
2017-04-13 14:58:54,713 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2017-04-13 14:58:54,715 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(1, 1), (3, 1), (7, 1), (8, 1)], [(2, 1), (7, 2), (8, 1)], [(3, 1), (4, 1), (5, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(6, 1), (10, 1), (11, 1)]]


### Corpus Streaming – One Document at a Time

In [40]:
class MyCorpus(object):
    """Stream the corpus from ``mycorpus.txt`` one document at a time.

    Each iteration step yields the bag-of-words vector of a single line of
    the file, so the whole corpus is never held in memory at once. The
    token -> id mapping is taken from the notebook-level ``dictionary``.
    """

    def __iter__(self):
        # `with` closes the file as soon as iteration finishes (or the
        # generator is closed) instead of leaving the handle open.
        with open('mycorpus.txt') as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

In [41]:
# Creating the object reads nothing yet: MyCorpus only opens the file inside
# __iter__, so documents are produced one at a time when it is iterated.
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
# Printing shows only the default object repr, not the documents themselves.
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x7f0f558f92e8>
