# LDA

In [3]:
import pandas as pd
import numpy as np

import os
import random
import bisect
import stopwords

### GLOBAL VARIABLES

In [5]:
eng_dict = []
#location of your English dictionary
dict_file = open("dictionary.txt",'r')
for line in dict_file:
    #I assume your dictionary's words are separated by whitespace,
    #but it it's commas you can do split(',')
    items = line.split()
    for word in items:
        eng_dict.append(word)

#word_index serves as a two-way lookup table: given a word, return its index,
#and vice-versa. Also has key "n_words" which gives number of words in
#vocabulary.
word_index = {"n_words":0}
#The coefficients alpha and beta are derived from the multinomial distribution
#assumed by the model. alpha=0.1 and beta=0.0002 are generally good.
#The lower alpha, the fewer topics per document
#alpha should be low but nonzero so that words can group by document as well.
alpha = 0.1
#The lower beta, more extreme each topic's word frequencies
beta = 0.0002

In [7]:
eng_dict[0:10]

['yeah',
 'explosions',
 'possibly',
 'sure',
 'simply',
 'Unfortunate',
 'get',
 'unexpected',
 'hella',
 'picture']

## Get file names

In [9]:
def get_fnames():
    #your function for creating a list of files to use
    names = []
    dir = "data/"
    exts = ["txt"]
    for root, dirs, files in os.walk(dir):
        for f in files:
            items = f.split('.')
            if len(items) > 1 and items[1] in exts:
                names.append(root+"/"+f)
    return names

In [10]:
get_fnames()

['data//doc0.txt',
 'data//doc1.txt',
 'data//doc10.txt',
 'data//doc100.txt',
 'data//doc1000.txt',
 'data//doc1001.txt',
 'data//doc1002.txt',
 'data//doc1003.txt',
 'data//doc1004.txt',
 'data//doc1005.txt',
 'data//doc1006.txt',
 'data//doc1007.txt',
 'data//doc1008.txt',
 'data//doc1009.txt',
 'data//doc101.txt',
 'data//doc1010.txt',
 'data//doc1011.txt',
 'data//doc1012.txt',
 'data//doc1013.txt',
 'data//doc1014.txt',
 'data//doc1015.txt',
 'data//doc1016.txt',
 'data//doc1017.txt',
 'data//doc1018.txt',
 'data//doc1019.txt',
 'data//doc102.txt',
 'data//doc1020.txt',
 'data//doc1021.txt',
 'data//doc1022.txt',
 'data//doc1023.txt',
 'data//doc1024.txt',
 'data//doc1025.txt',
 'data//doc1026.txt',
 'data//doc1027.txt',
 'data//doc1028.txt',
 'data//doc1029.txt',
 'data//doc103.txt',
 'data//doc1030.txt',
 'data//doc1031.txt',
 'data//doc1032.txt',
 'data//doc1033.txt',
 'data//doc1034.txt',
 'data//doc1035.txt',
 'data//doc1036.txt',
 'data//doc1037.txt',
 'data//doc1038.txt',


## Read doc

In [21]:
def read_doc(fname):
    #If you are using files that aren't plain text, you may have to change this
    f = open(fname, 'r')
    doc = []
    for line in f:
        if line[0] == "%":
            continue
        phrases = line.split()
        for phrase in phrases:
            p1 = phrase[-1]
            
            phrase = phrase.lower()
            
            if phrase not in word_index:
                n_words = word_index["n_words"]
                word_index[phrase] = n_words
                word_index[n_words] = phrase
                word_index["n_words"] += 1
            ind = word_index[phrase]
            doc.append(ind)
    return doc

In [25]:
doc = read_doc('data/doc0.txt')
doc

[0, 1, 2, 3, 4, 5, 6, 7]

## Read docs

In [13]:
def read_docs():
    #turn each document into a list of words
    docs = []
    fnames = get_fnames()
    for fname in fnames:
        docs.append(read_doc(fname))

    return docs

In [30]:
docs = read_docs()
docs[0]

[0, 1, 2, 3, 4, 5, 6, 7]

## Random choice

In [31]:
def random_choice(probs):
    #given a list of probabilities, randomly return an index i according to
    #that index's probability.
    partials = []
    psum = 0.
    for p in probs:
        psum += p
        partials.append(psum)

    choice = random.random()*psum
    #bisect_right does binary search to find minimal k where partials[k]>choice
    return bisect.bisect_right(partials, choice)


In [40]:
[random_choice([0.3,0.5,0.2]) for _ in range(25)]

[1, 1, 1, 1, 1, 2, 2, 1, 0, 0, 2, 2, 1, 1, 0, 0, 1, 2, 1, 1, 0, 0, 1, 2, 2]

## Probs

In [41]:
def probs(v, nkm, nkr, nk, n_topics):
    #get something proportional to the
    #probability for word n to be in
    #each topic k
    #given that there are n_topics topics. The word is vth from vocabulary.
    #nkm is number of words from this document in kth topic
    #nkr is number of times rth word from vocab appears in kth topic
    #nk is number of words in kth topic
    n_words = word_index["n_words"]
    res = [0]*n_topics   #a probability to be in each topic

    for k in range(n_topics):
        res[k] = (nkm[k]+alpha)*(nkr[k][v]+beta)/(nk[k]+n_words*beta)
    return res

## Get topics

In [42]:
def get_topics(iters, w_counts, docs, n_topics):
    #returns total number of words in each topic and relative distribution of
    #words in topic compared to distribution across all documents
    n_words = word_index["n_words"]
    zs = []
    nkr = [[0]*n_words for _ in range(n_topics)] #(k,r)th element is number of
    #times rth word from vocab appears in topic k
    nkm = [[0]*n_topics for _ in range(len(docs))]
    #(m,k)th element is number of words in document m are in kth topic
    nk  = [0]*n_topics #number of words in each topic

    n_words_total = 0 #number of words in all documents, INCLUDING repetition
    for i in range(len(docs)):
        zs.append([])
        for j in range(len(docs[i])):
            topic = random.randint(0,n_topics-1)
            zs[i].append(topic)
            ind = docs[i][j]
            nkm[i][topic] += 1
            nkr[topic][ind] += 1
            nk[topic] += 1
            n_words_total += 1
    for it in range(iters):
        print ("Iteration",it)
        for i in range(len(docs)):
            for j in range(len(docs[i])):
                ind = docs[i][j]
                k = zs[i][j]
                nkm[i][k] -= 1
                nkr[k][ind] -= 1
                nk[k] -= 1
                ps = probs(ind, nkm[i], nkr, nk, n_topics)
                newk = random_choice(ps)
                nkm[i][newk] += 1
                nkr[newk][ind] += 1
                nk[newk] += 1
                zs[i][j] = newk

    for k in range(n_topics):
        for v in range(n_words):
            nkr[k][v] /= nk[k]+0.
            nkr[k][v] -= (w_counts[v]+ 0.)/n_words_total
    return [[nk[k],nkr[k]] for k in range(n_topics)]

## Display topics

In [43]:
def display_topics(topics):
    #take output of topics and make it look pretty
    relevant = 10	#I only care about 10 most frequent words in topic
    n_words = word_index["n_words"]
    for nk,t in topics:
        top = [[-1,0]]*relevant
        for i in range(n_words):
            for j in range(relevant):
                if t[i] > top[j][1]:
                    top[j+1:] = top[j:-1]
                    top[j] = [word_index[i],t[i]]
                    break
        print ("\n"+"==TOPIC==", " with number of words =", nk)
        for rank in top:
            print (rank)

## Train

### Read docs

In [44]:
docs = read_docs()
n_topics = 12 #number of topics
n_words = word_index["n_words"]
w_counts = [0]*n_words

### Training process

In [45]:

#Number of iterations should probably be about 10 for decent convergence
#each iteration iterates over all meaningful words in each document, so this
#could be expensive for large document sets
iters = 10
for doc in docs:
    for word in doc:
        w_counts[word] += 1
topics = get_topics(iters, w_counts, docs , n_topics)
display_topics(topics)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9

==TOPIC==  with number of words = 7695
['vinci', 0.09089181838868673]
['code', 0.0902290186996749]
['da', 0.08855263710213704]
['awesome', 0.047306751783209394]
['was', 0.030813972479719245]
['left', 0.017850495288753074]
['movie', 0.017220176187295537]
['up', 0.017148297693269674]
['right', 0.015989518111817074]
['down', 0.014778630092705702]

==TOPIC==  with number of words = 8127
['potter', 0.03745678983385861]
['harry', 0.03290406435090057]
['i', 0.026089390582155834]
['to', 0.021589267242984253]
['think', 0.020501291217046053]
['reading', 0.01892916099463646]
['it', 0.017301580200554362]
['one', 0.01710820929563971]
['story', 0.016475452760618]
['is', 0.01588231774528528]

==TOPIC==  with number of words = 6587
['like', 0.06625496874115727]
['sucks', 0.03087029589066726]
['i', 0.02665545740628559]
['would', 0.02512045967536993]
['this', 0.024842379244129457]
['a