In [1]:
import numpy as np
import lda
from scipy.sparse import coo_matrix

In [2]:
# Toy corpus: maps a document name to its (already tokenized) list of terms.
# Terms may repeat within a document; insertion order of the documents is kept.
docs = dict([
    ('doc1', ['python', 'text', 'data', 'nlp', 'data', 'matrix', 'mining']),
    ('doc2', ['data', 'science', 'data', 'processing', 'cleaning', 'data']),
    ('doc3', ['r', 'data', 'science', 'text', 'mining', 'nlp']),
    ('doc4', ['programming', 'c', 'algorithms', 'data', 'structures']),
])

In [3]:
# Reduce every document to its set of distinct terms once, then derive both
# corpus-level quantities from those sets.
unique_terms_per_doc = [set(docterms) for docterms in docs.values()]

# vocabulary = union of the distinct terms of all documents
vocab = set().union(*unique_terms_per_doc)

# each (document, distinct term) pair is one non-zero cell of the
# document-term matrix, so summing the set sizes gives the non-zero count
n_nonzero = sum(len(unique_terms) for unique_terms in unique_terms_per_doc)

# document names, in the dict's iteration order
docnames = list(docs)

In [4]:
# vocabulary size and number of non-zero matrix entries, one per line
print(len(vocab), n_nonzero, sep='\n')
# bare last expression -> the notebook displays the vocabulary itself
vocab

14
21


{'algorithms',
 'c',
 'cleaning',
 'data',
 'matrix',
 'mining',
 'nlp',
 'processing',
 'programming',
 'python',
 'r',
 'science',
 'structures',
 'text'}

In [5]:
# Switch from Python containers to NumPy arrays for the vectorized steps below.
docnames = np.array(docnames)

# Sort the terms before building the array: iterating a set of strings yields
# an order that can differ between interpreter runs (string hashes are
# randomized per process), which would permute the matrix columns and make the
# fitted topics irreproducible even with a fixed random_state. A sorted
# vocabulary gives every term the same column index on every run.
vocab = np.array(sorted(vocab))

In [6]:
# matrix dimensions: one row per document, one column per vocabulary term
ndocs, nvocab = len(docnames), len(vocab)

# permutation that puts "vocab" into sorted order (needed by np.searchsorted)
vocab_sorter = np.argsort(vocab)

In [7]:
# Pre-allocate the three parallel COO buffers (contents are uninitialized until filled):
#   data[k] -> k-th non-zero term frequency
#   rows[k] -> row (document) index of the k-th entry
#   cols[k] -> column (vocabulary) index of the k-th entry
data, rows, cols = (np.empty(n_nonzero, dtype=np.intc) for _ in range(3))

In [8]:
# Fill the COO buffers (data, rows, cols) one document at a time.

# Map each document name to its row index once, instead of scanning
# `docnames` with np.where for every document inside the loop.
doc_rows = {str(name): row for row, name in enumerate(docnames)}

ind = 0     # current write position in the sparse matrix buffers
# go through all documents with their terms
for docname, terms in docs.items():
    if len(terms) == 0:
        # a document without terms contributes no non-zero entries
        continue

    # find indices into `vocab` such that, if the corresponding elements in
    # `terms` were inserted before the indices, the order of `vocab` would be
    # preserved -> array of indices of `terms` in `vocab`
    # (assumes every term is present in `vocab`, which holds when `vocab` was
    # built from these same documents)
    term_indices = vocab_sorter[np.searchsorted(vocab, terms, sorter=vocab_sorter)]

    # count the unique terms of the document and get their vocabulary indices
    uniq_indices, counts = np.unique(term_indices, return_counts=True)
    n_vals = len(uniq_indices)  # = number of unique terms
    ind_end = ind + n_vals      # `ind` to `ind_end` is the slice that we will fill with data

    data[ind:ind_end] = counts              # term frequencies
    cols[ind:ind_end] = uniq_indices        # column index: index in `vocab`
    rows[ind:ind_end] = doc_rows[docname]   # row index, broadcast over the slice

    ind = ind_end  # resume with next document -> add data to the end


In [9]:
# Assemble the sparse document-term matrix from the (value, (row, col)) triplets;
# cells that are not listed are implicitly zero.
dtm = coo_matrix(
    (data, (rows, cols)),
    shape=(ndocs, nvocab),
    dtype=np.intc,
)

In [11]:
import logging
# getLogger() without a name returns the root logger, so the level set here is
# the effective level for every logger that does not set its own.
logger = logging.getLogger()
# Only ERROR and above get through -- presumably to keep the model's progress
# logging out of the notebook output during fitting (NOTE(review): confirm intent).
logger.setLevel(logging.ERROR)

In [12]:
n_top_words = 3

# Fit the topic model on the document-term matrix (seeded via random_state).
model = lda.LDA(n_topics=3, n_iter=1000, random_state=1)

model.fit(dtm)

# one row per topic; each row holds that topic's per-term weights
topic_word = model.topic_word_

for topic_idx, topic_dist in enumerate(topic_word):
    # vocabulary indices ordered from highest to lowest weight for this topic
    ranked = np.argsort(topic_dist)[::-1]
    topic_words = np.array(vocab)[ranked[:n_top_words]]
    print('Topic {}: {}'.format(topic_idx, ' '.join(topic_words)))

  if sparse and not np.issubdtype(doc_word.dtype, int):


Topic 0: nlp mining r
Topic 1: data science text
Topic 2: algorithms programming c
