In [1]:
import pprint

# 1 core concepts

### definition

    document: some text
    corpus: a collection of documents
    vector: a mathematically convenient representation of a document
    model: an algorithm for transforming vectors from one representation to another

In [2]:
# Toy corpus: nine short title-like strings. Each string is treated as one
# document by the cells that follow.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [3]:
# defaultdict is no longer used in this cell; it stays imported in case other cells rely on it.
from collections import Counter, defaultdict

# Very common words that should not take part in the analysis.
stoplist = set('for a of the and to in'.split(' '))

# Lowercase each document, split it on whitespace and filter out the stopwords.
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

# Count how often every token occurs across the whole corpus. Counter tallies
# the flattened token stream directly and, like the defaultdict(int) it
# replaces, reports 0 for a token it has never seen.
frequency = Counter(token for text in texts for token in text)

# Only keep words that appear more than once in the corpus.
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [4]:
from gensim import corpora
# Build the vocabulary from the frequency-filtered corpus; each distinct token
# gets an integer id (see dictionary.token2id).
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [5]:
# token2id is the mapping from token string to integer id.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [6]:
# Same mapping, pretty-printed one entry per line with the keys sorted.
pprint.pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


### Vector

In [8]:
# Vectorise a sentence that was not part of the corpus.
new_doc = "Human computer interaction"
new_tokens = new_doc.lower().split()
# doc2bow yields (token_id, count) pairs; "interaction" is not in the
# dictionary, so it does not show up in the result.
new_vec = dictionary.doc2bow(new_tokens)
print(new_vec)

[(0, 1), (1, 1)]


In [9]:
# Represent the whole corpus: one bag-of-words vector per processed document.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))
pprint.pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


### Model

In [10]:
from gensim import models

# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)

# Push the string "system minors" through the fitted model:
# tokenise -> bag-of-words -> TF-IDF weights.
words = "system minors".lower().split()
words_bow = dictionary.doc2bow(words)
print(tfidf[words_bow])

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


In [15]:
# tfidf is a model object; evaluating the bare name displays its repr.
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x1505cd390>

In [13]:
from gensim import similarities

# Build a similarity index over the TF-IDF-transformed corpus.
# num_features is the size of the vector space, i.e. the number of tokens in
# the dictionary. Deriving it from the dictionary (12 tokens here) instead of
# hard-coding 12 keeps the index correct if the vocabulary ever changes.
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

In [16]:
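# index is a similarity index: indexing it with a TF-IDF query vector returns that query's similarity to every document in the corpus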

In [18]:
# Score every corpus document against the query "system engineering".
query_document = 'system engineering'.split()
query_bow = dictionary.doc2bow(query_document)
query_tfidf = tfidf[query_bow]
sims = index[query_tfidf]  # one similarity score per indexed document
pprint.pprint(list(enumerate(sims)))

[(0, 0.0),
 (1, 0.32448703),
 (2, 0.41707572),
 (3, 0.7184812),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0)]


In [20]:
# The same scores, listed from most similar to least similar.
ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
for document_number, score in ranked:
    print(document_number, score)

3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


# 2 Corpora and Vector Spaces

In [21]:
import logging
# Show INFO-level log records, each prefixed with a timestamp and its level;
# the outputs of the following cells include the resulting log lines.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [25]:
# Build a dictionary from `texts` (stopwords removed, but no frequency filter),
# giving 35 unique tokens instead of the 12 in the earlier dictionary.
# NOTE(review): this rebinds `dictionary`, which earlier cells pair with
# `tfidf` and `index`. Re-running those cells after this one would mix two
# different id spaces; consider a distinct name for this dictionary.
# NOTE(review): the '/tmp/...' path is hard-coded; confirm it is acceptable on
# the target platform.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
# loaded_dict = corpora.Dictionary.load('/tmp/deerwester.dict') # load the dictionary
print(dictionary)

2021-03-10 18:08:07,163 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-03-10 18:08:07,164 : INFO : built Dictionary(35 unique tokens: ['abc', 'applications', 'computer', 'human', 'interface']...) from 9 documents (total 52 corpus positions)
2021-03-10 18:08:07,165 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
2021-03-10 18:08:07,166 : INFO : saved /tmp/deerwester.dict


Dictionary(35 unique tokens: ['abc', 'applications', 'computer', 'human', 'interface']...)


In [26]:
# Convert every document in `texts` to a bag-of-words vector, then write the
# vectors to disk in Matrix Market format.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
# loaded_corpus = corpora.MmCorpus('/tmp/deerwester.mm') # load the corpus
print(corpus)

2021-03-10 18:08:08,272 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2021-03-10 18:08:08,274 : INFO : saving sparse matrix to /tmp/deerwester.mm
2021-03-10 18:08:08,274 : INFO : PROGRESS: saving document #0
2021-03-10 18:08:08,275 : INFO : saved 9x35 matrix, density=16.190% (51/315)
2021-03-10 18:08:08,276 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(2, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(4, 1), (10, 1), (12, 1), (13, 1), (14, 1)], [(3, 1), (10, 2), (13, 1), (15, 1), (16, 1)], [(8, 1), (11, 1), (12, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(24, 1), (26, 1), (27, 1), (28, 1)], [(24, 1), (26, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)], [(9, 1), (26, 1), (30, 1)]]


### corpus streaming - one document at a time

In [33]:
from smart_open import open  # for transparently opening remote files

class MyCorpus:
    """Stream a corpus from a text file, one bag-of-words vector per line.

    The file is re-opened on every iteration, so the same instance can be
    iterated more than once and only one line is held in memory at a time.

    Parameters
    ----------
    path : str
        Location of the text file, one document per line. Defaults to the
        path originally hard-coded in this notebook, so ``MyCorpus()`` keeps
        working unchanged.
    vocab : object with a ``doc2bow(tokens)`` method, optional
        Dictionary used to vectorise each line. When omitted, the
        notebook-level ``dictionary`` is looked up at iteration time.
    """

    def __init__(self, path='/Users/wegzheng/tmp/1.txt', vocab=None):
        self.path = path
        self.vocab = vocab

    def __iter__(self):
        # Resolve the dictionary lazily so that a later rebinding of the
        # notebook-level `dictionary` is still honoured when none was passed in.
        vocab = dictionary if self.vocab is None else self.vocab
        # The context manager closes the file once iteration finishes or the
        # generator is closed, instead of leaving the handle open.
        with open(self.path) as lines:
            for line in lines:
                # assume there's one document per line, tokens separated by whitespace
                yield vocab.doc2bow(line.lower().split())

In [34]:
# Creating the object reads nothing; the file is only opened when it is iterated.
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x150f577d0>


In [35]:
# Each iteration step yields the bag-of-words vector for one line of the file.
# An empty list means none of that line's tokens are in `dictionary`.
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)

[]
[]
[]
[]
[(3, 2)]
[]
[(10, 1)]
