# LDA and HLDA

References:<br>
https://github.com/lda-project/lda <br>
https://datascience.blog.wzb.eu/2016/06/17/creating-a-sparse-document-term-matrix-for-topic-modeling-via-lda/ <br>
https://github.com/joewandy/hlda

# Process data set

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import string
import glob

In [2]:
stopset = stopwords.words('english') + ["com", "www", "http", "url", "org"]

In [3]:
corpus = []
vocab = set()
n_nonzero = 0

stemmer = PorterStemmer()
for filename in glob.glob('/home/juan/Documents/new_life/*.txt'):
    with open(filename) as f:
        try:
            doc = f.read().splitlines()
            doc = '. '.join(doc).translate(str.maketrans(dict.fromkeys(string.punctuation, " "))).translate(str.maketrans(dict.fromkeys('0123456789')))  # strip punctuations            
            tokens = word_tokenize(str(doc))
            filtered = []
            for w in tokens:
                w = stemmer.stem(w.lower())  # use Porter's stemmer
                if len(w) < 3:  # remove short tokens
                    continue
                if w in stopset:  # remove stop words
                    continue
                filtered.append(w)
            unique_terms = set(filtered)
            n_nonzero += len(unique_terms)
            vocab.update(unique_terms)
            corpus.append(filtered)            

        except UnicodeDecodeError:
            print ('Failed to load', filename)

# Document term matrix

In [4]:
import numpy as np
from scipy.sparse import coo_matrix
import lda

In [5]:
vocab = np.array(list(vocab))  
vocab_sorter = np.argsort(vocab)
ndocs = len(corpus)
nvocab = len(vocab)

In [6]:
data = np.empty(n_nonzero, dtype=np.intc)     # all non-zero term frequencies at data[k]
rows = np.empty(n_nonzero, dtype=np.intc)     # row index for kth data item (kth term freq.)
cols = np.empty(n_nonzero, dtype=np.intc)     # column index for kth data item (kth term freq.)

In [7]:
ind = 0     # current index in the sparse matrix data
# go through all documents with their terms
i = 0
for terms in corpus:
    # find indices into  such that, if the corresponding elements in  were
    # inserted before the indices, the order of  would be preserved
    # -> array of indices of  in 
    term_indices = vocab_sorter[np.searchsorted(vocab, terms, sorter=vocab_sorter)]

    # count the unique terms of the document and get their vocabulary indices
    uniq_indices, counts = np.unique(term_indices, return_counts=True)
    n_vals = len(uniq_indices)  # = number of unique terms
    ind_end = ind + n_vals  #  to  is the slice that we will fill with data

    data[ind:ind_end] = counts                  # save the counts (term frequencies)
    cols[ind:ind_end] = uniq_indices            # save the column index: index in 
    rows[ind:ind_end] = np.repeat(i, n_vals)  # save it as repeated value
    
    i += 1
    ind = ind_end  # resume with next document -> add data to the end

In [8]:
dtm = coo_matrix((data, (rows, cols)), shape=(ndocs, nvocab), dtype=np.intc)

# LDA

In [9]:
model = lda.LDA(n_topics=20, n_iter=1000, random_state=1)

import logging
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

model.fit(dtm)

topic_word = model.topic_word_
n_top_words = 5

for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

  if sparse and not np.issubdtype(doc_word.dtype, int):


Topic 0: dog cat breed align greyhound
Topic 1: plant speci flower leav jpg
Topic 2: bird speci statu iucn bat
Topic 3: last first journal titl genu
Topic 4: viru infect diseas cell use
Topic 5: speci statu iucn author genu
Topic 6: binomi speci beetl cerambycida insect
Topic 7: speci author genu speciesbox famili
Topic 8: speci marin clade imag famili
Topic 9: speci new india imag zealand
Topic 10: date state new web first
Topic 11: film episod batman seri titl
Topic 12: speci genu imag author fungi
Topic 13: sfn book jpg file death
Topic 14: convert australia abbr flower long
Topic 15: nbsp brown white dark spot
Topic 16: speci genu plant famili unrank
Topic 17: imag moth binomi lepidoptera speci
Topic 18: thi femal male also may
Topic 19: speci fish statu author genu


# HLDA

In [10]:
vocab = sorted(list(vocab))
vocab_index = {}
for i, w in enumerate(vocab):
    vocab_index[w] = i

In [12]:
print(len(corpus))
print(len(vocab))

296042
850757


In [13]:
new_corpus = []
for doc in corpus:
    new_doc = []
    for word in doc:
        word_idx = vocab_index[word]
        new_doc.append(word_idx)
    new_corpus.append(new_doc)

In [14]:
n_samples = 1000       # no of iterations for the sampler
alpha = 10.0          # smoothing over level distributions
gamma = 1.0           # CRP smoothing parameter; number of imaginary customers at next, as yet unused table
eta = 0.1             # smoothing over topic-word distributions
num_levels = 30        # the number of levels in the tree
display_topics = 200   # the number of iterations between printing a brief summary of the topics so far
n_words = 5           # the number of most probable words to print for each topic after model estimation
with_weights = False  # whether to print the words with the weights

In [None]:
from hlda.sampler import HierarchicalLDA
hlda = HierarchicalLDA(new_corpus, vocab, alpha=alpha, gamma=gamma, eta=eta, num_levels=num_levels)
hlda.estimate(n_samples, display_topics=display_topics, n_words=n_words, with_weights=with_weights)

SyntaxError: invalid syntax (<ipython-input-1-f8a50f1227b9>, line 1)

## 4. Visualise results

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from ipywidgets import widgets
from IPython.core.display import HTML, display

In [None]:
colour_map = {
    0: 'blue',
    1: 'red',
    2: 'green'
}

def show_doc(d=0):
    node = hlda.document_leaves[d]
    path = []
    while node is not None:
        path.append(node)
        node = node.parent
    path.reverse()

    n_words = 10
    with_weights = False
    for n in range(len(path)):
        node = path[n]
        colour = colour_map[n]
        msg = 'Level %d Topic %d: ' % (node.level, node.node_id)
        msg += node.get_top_words(n_words, with_weights)
        output = '<h%d><span style="color:%s">%s</span></h3>' % (n + 1, colour, msg)
        display(HTML(output))

    display(HTML('<hr/><h5>Processed Document</h5>'))

    doc = corpus[d]
    output = ''
    for n in range(len(doc)):
        w = doc[n]
        l = hlda.levels[d][n]
        colour = colour_map[l]
        output += '<span style="color:%s">%s</span> ' % (colour, w)
    display(HTML(output))

If you run this notebook locally, you'd be able to flip through the documents in the corpus and see the topic assignments of individual words of the document.

In [None]:
widgets.interact(show_doc, d=(0, len(corpus)-1))

## 5. Dump the hlda object for further use later

https://stackoverflow.com/questions/18474791/decreasing-the-size-of-cpickle-objects

In [None]:
import _pickle as cPickle
import gzip


def save_zipped_pickle(obj, filename, protocol=-1):
    with gzip.open(filename, 'wb') as f:
        cPickle.dump(obj, f, protocol)


def load_zipped_pickle(filename):
    with gzip.open(filename, 'rb') as f:
        loaded_object = cPickle.load(f)
        return loaded_object

save_zipped_pickle(hlda, 'bbc_hlda.p')