# Jonathan Halverson
# Tuesday, February 7, 2017
# Gensim: Corpora and vector spaces

In [26]:
import logging

# Emit INFO-level records with a timestamp so that gensim's progress messages
# (for example while the dictionary is being built) appear in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [27]:
from gensim import corpora

# Toy corpus: nine short titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [28]:
# Tokenize: lowercase each document, split it on whitespace, and drop the
# common words listed in the stoplist.
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

In [29]:
# Remove words that appear only once in the corpus.
from collections import defaultdict

# First pass: count every token across all documents.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Second pass: keep only the tokens seen more than once; token order within
# each document is preserved.
texts = [[token for token in text if frequency[token] > 1] for text in texts]

In [30]:
from pprint import pprint  # pretty-printer

# One filtered token list per document.
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [31]:
# Alternative method to remove words that appear only once in the corpus.
# Counter accepts any iterable, so the per-document token lists are flattened
# with a generator rather than reduce(): reduce is not a builtin on Python 3,
# it raises TypeError on an empty corpus, and repeated list concatenation
# copies the growing list at every step.
# Note: texts was already filtered by the defaultdict-based cell, so here this
# pass leaves it unchanged.
from collections import Counter
c = Counter(token for text in texts for token in text)
texts = [[token for token in text if c[token] > 1] for text in texts]

In [32]:
from pprint import pprint  # pretty-printer

# Token lists after the Counter-based filtering.
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [36]:
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = corpora.Dictionary(texts)
print(dictionary)

2017-02-07 17:08:08,153 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-02-07 17:08:08,155 : INFO : built Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...) from 9 documents (total 29 corpus positions)


Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)


In [38]:
# Show which integer id was assigned to each token.
print(dictionary.token2id)

{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}


In [39]:
# Convert a new document to a bag-of-words vector: a list of
# (token_id, count) pairs for the tokens the dictionary knows.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[(1, 1), (2, 1)]


In [43]:
# Bag-of-words vector for every document in the corpus; a token that occurs
# twice in a document gets a count of 2.
corpus = [dictionary.doc2bow(text, return_missing=False) for text in texts]
pprint(corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(0, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (6, 2), (8, 1)],
 [(3, 1), (4, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(5, 1), (10, 1), (11, 1)]]


One can write a class that defines the `__iter__` method so that it yields one document at a time.
