## Gensim corpora usage

In [1]:
import gensim 
from gensim import corpora
import numpy as np

In [2]:
# Toy corpus: nine short titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

#### Now run your usual text-processing pipeline: normalization, tokenization, etc.

In [3]:
def text_pre_process(doc):
    """Turn one raw document string into a list of tokens.

    Delegates to ``gensim.utils.simple_preprocess`` with its default settings
    and returns whatever token list that call produces.
    """
    tokens = gensim.utils.simple_preprocess(doc)
    return tokens

In [4]:
# Tokenize every document; preview only the first two to keep the output short.
text = list(map(text_pre_process, documents))
text[:2]

[['human',
  'machine',
  'interface',
  'for',
  'lab',
  'abc',
  'computer',
  'applications'],
 ['survey',
  'of',
  'user',
  'opinion',
  'of',
  'computer',
  'system',
  'response',
  'time']]

#### Gensim dictionary object

In [5]:
# Build the token <-> integer-id vocabulary from the tokenized documents,
# then persist it to disk so it can be reloaded without rebuilding.
dictionary = corpora.Dictionary(documents=text)
dictionary.save('vocab.dict')

In [6]:
# Show the summary repr, then the full token -> id mapping, each followed by
# the same blank-line separator.
for shown in (dictionary, dictionary.token2id):
    print(shown)
    print('\n')
# Reverse lookup: integer id -> token.
first_token = dictionary[0]
print('id 2 token: {}'.format(first_token))

Dictionary(41 unique tokens: ['error', 'user', 'survey', 'eps', 'iv']...)


{'error': 21, 'user': 14, 'survey': 11, 'eps': 15, 'iv': 35, 'relation': 24, 'of': 8, 'to': 25, 'measurement': 22, 'management': 16, 'opinion': 9, 'trees': 29, 'applications': 1, 'generation': 27, 'in': 32, 'binary': 26, 'paths': 34, 'perceived': 23, 'engineering': 19, 'quasi': 38, 'graph': 31, 'the': 17, 'computer': 2, 'lab': 6, 'ordering': 37, 'abc': 0, 'system': 12, 'random': 28, 'minors': 36, 'for': 3, 'machine': 7, 'intersection': 33, 'unordered': 30, 'and': 18, 'response': 10, 'human': 4, 'interface': 5, 'widths': 40, 'well': 39, 'testing': 20, 'time': 13}


id 2 token: abc


#### use dictionary to convert document to bow
- The function doc2bow() simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a bag-of-words--a sparse vector, in the form of [(word_id, word_count), ...].

In [7]:
new_doc = "Human computer interaction"
# Tokenize with the same helper that built the vocabulary, so a new document is
# normalized exactly like the original documents before the id lookup (an
# ad-hoc lower().split() can disagree on punctuation or very short tokens).
new_vec = dictionary.doc2bow(text_pre_process(new_doc))
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored
# Map each (token_id, count) pair back to its token for a readable view.
for token_id, count in new_vec:
    print('{} : {}'.format(dictionary[token_id], count))

[(2, 1), (4, 1)]
computer : 1
human : 1


#### Convert the documents to a corpus

In [8]:
# Convert every tokenized document to its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, text))
# Store in Matrix Market format on disk, for later use.
corpora.MmCorpus.serialize('corpus.mm', corpus)

#### load corpus

In [9]:
# Re-open the corpus from the file written by MmCorpus.serialize above; note
# that this rebinds `corpus` from the in-memory list to an MmCorpus object.
corpus = corpora.MmCorpus('corpus.mm')
print(corpus)

MmCorpus(9 documents, 41 features, 65 non-zero entries)


##### you can easily turn your corpus into a dense np array

In [10]:
# corpus2dense lays documents out as columns; transpose on the spot to get the
# usual document-term matrix (one row per document, one column per term).
dense_bow = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary)).T

In [11]:
# First document of the corpus loaded from 'corpus.mm', as (token_id, count) pairs.
corpus[0]

[(0, 1.0),
 (1, 1.0),
 (2, 1.0),
 (3, 1.0),
 (4, 1.0),
 (5, 1.0),
 (6, 1.0),
 (7, 1.0)]

In [12]:
# Translate the ids of the first document back into tokens; counts are not shown.
for token_id, _count in corpus[0]:
    print(dictionary[token_id])

abc
applications
computer
for
human
interface
lab
machine


In [13]:
# Row 0 of the document-term matrix, i.e. the dense vector of the first document.
dense_bow[0]

array([1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.], dtype=float32)

#### Convert a dense matrix back to a gensim corpus

In [14]:
# dense_bow was transposed to a document-term matrix, so documents are rows.
# Dense2Corpus assumes documents are columns unless told otherwise; without
# documents_columns=False every term column would be treated as a document.
corpus_gensim = gensim.matutils.Dense2Corpus(dense_bow, documents_columns=False)

In [15]:
# NOTE(review): this indexes `corpus` (the MmCorpus loaded earlier), not the
# `corpus_gensim` object built in the previous cell — confirm which was meant.
corpus[0]

[(0, 1.0),
 (1, 1.0),
 (2, 1.0),
 (3, 1.0),
 (4, 1.0),
 (5, 1.0),
 (6, 1.0),
 (7, 1.0)]