<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/gensim_corpora_vector_spaces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline
import logging
from pprint import pprint  # pretty-printer
from collections import defaultdict
from gensim import corpora # mwahaha!
from smart_open import open  # for transparently opening remote files
from six import iteritems
import gensim # why then before we imported gensim?
import numpy as np
import scipy.sparse

In [2]:
# Surface gensim's progress messages (timestamp : level : message) at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Toy corpus of nine short titles, one per line of the text block below.
# splitlines() turns the block into a plain list of nine strings.
documents = """\
Human machine interface for lab abc computer applications
A survey of user opinion of computer system response time
The EPS user interface management system
System and human system engineering testing of EPS
Relation of user perceived response time to error measurement
The generation of random binary unordered trees
The intersection graph of paths in trees
Graph minors IV Widths of trees and well quasi ordering
Graph minors A survey""".splitlines()

In [4]:
# Tokenise: lower-case each document, split on whitespace, drop the stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    texts.append([word for word in document.lower().split() if word not in stoplist])

# Tally how often every token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only the tokens that were seen more than once.
texts = [[token for token in text if frequency[token] > 1] for text in texts]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [5]:
# Build the token -> integer-id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
print(dictionary)  # prints a short summary: number of unique tokens plus a few examples

2020-07-26 15:49:39,355 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-07-26 15:49:39,356 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2020-07-26 15:49:39,357 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-07-26 15:49:39,363 : INFO : saved /tmp/deerwester.dict


Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [6]:
# Show the mapping from each token to its integer id.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [7]:
# Vectorise an unseen document against the existing vocabulary.
new_doc = "Human computer interaction"
new_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(new_tokens)
# the word "interaction" does not appear in the dictionary and is ignored
print(new_vec)

[(0, 1), (1, 1)]


In [8]:
# Turn every tokenised document into its sparse bag-of-words vector.
corpus = list(map(dictionary.doc2bow, texts))
# store to disk, for later use
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
print(corpus)

2020-07-26 15:49:39,398 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-07-26 15:49:39,400 : INFO : saving sparse matrix to /tmp/deerwester.mm
2020-07-26 15:49:39,402 : INFO : PROGRESS: saving document #0
2020-07-26 15:49:39,405 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2020-07-26 15:49:39,406 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


The code below shows how to stream a corpus so that the documents are never all held in RAM at the same time.

In [9]:
class MyCorpus(object):
    """Corpus that streams bag-of-words vectors, one document per line of a remote file.

    Nothing is read when the object is created; the file is opened afresh on
    every iteration, so the corpus can be iterated more than once. Each line is
    vectorised with the module-level ``dictionary``, looked up at iteration time.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(...)`` for each line of the corpus file."""
        # `with` closes the stream once it is exhausted (or the generator is
        # closed early); previously the handle was left open.
        with open('https://radimrehurek.com/gensim/mycorpus.txt') as lines:
            for line in lines:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

Interestingly, a gensim corpus can be any object that returns documents when it is iterated over---be it a Pandas DataFrame, a list, an array, or some other format.

In [10]:
# This flexibility allows you to create your own corpus classes that stream the
# documents directly from disk, network, database, dataframes... The models
# in Gensim are implemented such that they don't require all vectors to reside
# in RAM at once. You can even create the documents on the fly!

This explains the somewhat tricky way of splitting the documents and corpora that I complained about yesterday. What I assumed was a lack of automation is, in fact, a consequence of the flexibility of gensim's corpus-management functions.

In [11]:
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
# Printing shows only the object itself; nothing is read until the corpus is iterated.
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x7fbcff253ac8>


In [12]:
# Iterating calls MyCorpus.__iter__, which yields one bag-of-words vector per line of the file.
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


In [13]:
# Collect statistics about all tokens in a single streaming pass. The `with`
# block closes the remote file as soon as the dictionary has been built.
with open('https://radimrehurek.com/gensim/mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

# Ids of the stop words that actually occur in the corpus. `stoplist` is the
# hand-written set defined earlier in this notebook, not a list shipped by gensim.
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
# Ids of tokens that occur in exactly one document: `dictionary.dfs` holds
# per-token document frequencies, so this is a document count, not a raw count.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and single-document words
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)

2020-07-26 15:49:39,737 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-07-26 15:49:39,738 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions)


Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


Corpora are saved to disk one document at a time, mirroring the way they are read into memory. Below are several different ways of saving a corpus to disk. The documentation suggests that gensim can also be used as an efficient corpus-conversion tool: a corpus is read in one format and immediately saved in another, all through gensim's simple interface.

In [14]:
# A tiny two-document corpus; each document is a list of (id, value) pairs.
corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it

# Write the corpus to disk in Matrix Market format.
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)

2020-07-26 15:49:39,747 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-07-26 15:49:39,749 : INFO : saving sparse matrix to /tmp/corpus.mm
2020-07-26 15:49:39,750 : INFO : PROGRESS: saving document #0
2020-07-26 15:49:39,752 : INFO : saved 2x2 matrix, density=25.000% (1/4)
2020-07-26 15:49:39,753 : INFO : saving MmCorpus index to /tmp/corpus.mm.index


In [15]:
# Save the same corpus in three further on-disk formats: SVMlight, Blei's LDA-C
# and List-Of-Words. The order is kept so the log output matches.
for corpus_format, target_path in (
    (corpora.SvmLightCorpus, '/tmp/corpus.svmlight'),
    (corpora.BleiCorpus, '/tmp/corpus.lda-c'),
    (corpora.LowCorpus, '/tmp/corpus.low'),
):
    corpus_format.serialize(target_path, corpus)

2020-07-26 15:49:39,763 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-07-26 15:49:39,765 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index
2020-07-26 15:49:39,767 : INFO : no word id mapping provided; initializing from corpus
2020-07-26 15:49:39,768 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
2020-07-26 15:49:39,770 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
2020-07-26 15:49:39,772 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
2020-07-26 15:49:39,774 : INFO : no word id mapping provided; initializing from corpus
2020-07-26 15:49:39,778 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low
2020-07-26 15:49:39,781 : INFO : saving LowCorpus index to /tmp/corpus.low.index


In [16]:
# Load the corpus back from the Matrix Market file written earlier in this notebook.
corpus = corpora.MmCorpus('/tmp/corpus.mm')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-07-26 15:49:39,791 : INFO : loaded corpus index from /tmp/corpus.mm.index
2020-07-26 15:49:39,793 : INFO : initializing cython corpus reader from /tmp/corpus.mm
2020-07-26 15:49:39,794 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries


In [17]:
# Printing an MmCorpus shows only a summary (document, feature and non-zero counts), not the vectors.
print(corpus)

MmCorpus(2 documents, 2 features, 1 non-zero entries)


In [18]:
# One way of printing a corpus: load it entirely into memory (fine for this two-document example).
print(list(corpus))  # list() consumes the iterable and collects every document in a plain Python list

[[(1, 0.5)], []]


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [19]:
# Another way of doing it: print one document at a time, making use of the
# streaming interface, so the loop only ever handles a single document.
for doc in corpus:
    print(doc)

[(1, 0.5)]
[]


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [20]:
# Format conversion: re-save the corpus just loaded from Matrix Market in Blei's LDA-C format
# (same target path as the earlier BleiCorpus.serialize call in this notebook).
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)

2020-07-26 15:49:39,830 : INFO : no word id mapping provided; initializing from corpus
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-07-26 15:49:39,832 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
2020-07-26 15:49:39,834 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
2020-07-26 15:49:39,836 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index


In [21]:
# Seed the generator so the example matrix is identical on every run of the notebook.
rng = np.random.default_rng(0)
numpy_matrix = rng.integers(10, size=[5, 2])  # random matrix (values 0-9) as an example
# Dense2Corpus treats each column as a document by default, so this 5x2 matrix
# becomes a corpus of 2 documents over 5 features.
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
# To convert back, pass the number of features (rows) explicitly:
# numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=numpy_matrix.shape[0])

In [22]:
# scipy.sparse.random defaults to density=0.01; for a 5x2 matrix (10 cells) that
# gives zero stored entries, so the "random" example was always empty.
# Request a usable density and fix the seed so the example is reproducible.
scipy_sparse_matrix = scipy.sparse.random(5, 2, density=0.5, random_state=0)  # random sparse matrix as example
corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
# Pass num_terms so the round trip keeps all 5 rows even when trailing rows are
# empty; otherwise the row count is inferred from the largest id seen.
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus, num_terms=scipy_sparse_matrix.shape[0])