# LSA with EM

## Make a word count matrix

In [25]:
import numpy as np

num_docs = 20
num_words = 100
num_topics = 3
num_words_per_doc = 20


def generate_mat(num_docs, num_words, num_topics, num_words_per_doc, p, theta):
    mat = np.zeros((num_docs, num_words))
    delta = np.zeros((num_docs,num_words,num_topics))
    for d in range(num_docs):
        nwt = np.random.multinomial(num_words_per_doc, p[d,:])
        for t, n in np.ndenumerate(nwt):
            delta[d,:,t] = np.random.multinomial(n, theta[t[0],:])
            mat[d,:] += delta[d,:,t][0,:]
    return mat, delta
        
def generate_data(num_docs, num_words, num_topics, num_words_per_doc):
    p = np.zeros((num_docs, num_topics))
    for d in range(num_docs):
        t = d % num_topics
        p[d,t] = .8
        p[d,:t] = .2/(num_topics-1)
        p[d,t+1:] = .2/(num_topics-1)
        
    theta = np.ones((num_topics, num_words))
    n_useful_words = 5 * num_topics
    
    for w in range(n_useful_words):
        t = w % num_topics
        theta[t,w] = 100.
        theta[:t,w] = 10.
        theta[t+1:,w] = 10.
    for t in range(num_topics):
        theta[t,:] = theta[t,:] / np.sum(theta[t,:])
        
    mat,delta = generate_mat(num_docs, num_words, num_topics, num_words_per_doc, p, theta)
    return mat, delta, p, theta


In [28]:
mat, delta,p, theta = generate_data(num_docs,num_words,num_topics,num_words_per_doc)
print(get_loglik(mat,p,theta))
print(get_qopt(mat,delta,p,theta))

-1171.6163899854826
-1316.9999313461783


In [29]:
gamma = np.zeros((num_docs,num_words,num_topics))
estep(mat, gamma, num_topics, p, theta)
print(mat[0,3])
print(gamma[0,3,:])
mat2=np.sum(gamma,axis=2)
mat2-mat

3.0
[2.92682927 0.03658537 0.03658537]


array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 4.4408921e-16, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 4.4408921e-16, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]])

In [30]:
print(p[0,:])
print(theta[:,0])
print([p[0,t]*theta[t,0] for t in range(num_topics)])
tmp=p[0,:]*theta[:,0]
print(sum(tmp))
tmp / sum(tmp)

[0.8 0.1 0.1]
[0.1459854  0.01459854 0.01459854]
[0.11678832116788321, 0.0014598540145985403, 0.0014598540145985403]
0.11970802919708029


array([0.97560976, 0.01219512, 0.01219512])

In [31]:
def init_params(num_docs, num_words, num_topics):
    p = np.zeros((num_docs, num_topics))
    for d in range(num_docs):
        p[d,:] = np.random.dirichlet(np.ones((num_topics)))
        
    theta = np.zeros((num_topics, num_words))
    for t in range(num_topics):
        theta[t,:] = np.random.dirichlet(np.ones((num_words)))
        
    return p, theta

def guarded_log(x):
    return 0 if x == 0 else np.log(x)

def estep(mat, gamma, num_topics, p, theta):
    num_docs, num_words = mat.shape
    for d in range(num_docs):
        for w in range(num_words):
            nwd = mat[d,w]
            if nwd == 0:
                gamma[d,w,:] *= 0.0
                next
                
            tmp = p[d,:] * theta[:,w]
            denom = sum(tmp)
            if denom == 0:
                gamma[d,w,:] *= 0.0
                next
                
            gamma[d,w,:] = nwd * tmp / denom
                    
                
def mstep(mat, num_topics, gamma, pseudo_count=0.01):
    num_docs, num_words = mat.shape
    p = np.zeros((num_docs, num_topics))
    
    for d in range(num_docs):
        s_d = np.sum(gamma[d,:,:] + pseudo_count, axis=0)
        denom = np.sum(s_d)
        p[d,:] = s_d / np.sum(s_d)
        
    theta = np.zeros((num_topics, num_words))
    
    for t in range(num_topics):
        s_t = np.sum(gamma[:,:,t] + pseudo_count, axis=0)
        theta[t,:] = s_t / np.sum(s_t)
    return p, theta
            
def get_loglik(mat, p, theta):
    num_docs, num_words = mat.shape
    num_topics = p.shape[1]
    res = 0
    for d in range(num_docs):
        for w in range(num_words):
            nwd = mat[d,w]
            if nwd == 0:
                next
                
            tsum = 0
            for t in range(num_topics):
                tsum += p[d,t] * theta[t,w]
            res += nwd * guarded_log(tsum)            
    return res
    
def get_qopt(mat, gamma, p, theta):
    num_docs, num_words, num_topics = gamma.shape
    res = 0.0
    for d in range(num_docs):
        for w in range(num_words):
            for t in range(num_topics):
                res += gamma[d,w,t] * guarded_log(p[d,t])
                res += gamma[d,w,t] * guarded_log(theta[t,w])
    return res

def check_convergence(cur_it, cur_llik, new_llik, max_iter, eps):
    return cur_it >= max_iter or np.abs(new_llik - cur_llik) < eps

def check_gamma(mat, gamma):
    emat = np.sum(gamma,axis=2)
    print(np.mean(np.abs(mat-emat)))

def plsa_em(mat, num_topics=10, max_iter=1000, eps=1e-6):
    num_docs, num_words = mat.shape
    print_template = "It: {0:d}, loglik: {1:.5f}, old_q: {2:.5f}, new_q: {3:.5f}"
    p, theta = init_params(num_docs, num_words, num_topics)
    gamma = np.zeros((num_docs, num_words, num_topics))
    
    cur_llik = get_loglik(mat, p, theta)
    
    i = 0
    while True:
        estep(mat, gamma, num_topics, p, theta)
#        check_gamma(mat, gamma)
        old_q = get_qopt(mat, gamma, p, theta)
        new_p, new_theta = mstep(mat, num_topics, gamma)
        new_llik = get_loglik(mat, new_p, new_theta)
        new_q = get_qopt(mat, gamma, new_p, new_theta)
        
        print(print_template.format(i, new_llik, old_q, new_q))
        if check_convergence(i, cur_llik, new_llik, max_iter, eps):
            break

        i += 1
        p, theta = new_p, new_theta
    return p, theta


In [32]:
p_hat, theta_hat = plsa_em(mat, num_topics=3, max_iter=20, eps=1e-3)

It: 0, loglik: -1263.60237, old_q: -2259.40364, new_q: -1557.24709
It: 1, loglik: -1251.57073, old_q: -1586.01830, new_q: -1580.38329
It: 2, loglik: -1240.73552, old_q: -1580.62095, new_q: -1575.32147
It: 3, loglik: -1225.86391, old_q: -1559.56369, new_q: -1551.68801
It: 4, loglik: -1205.75114, old_q: -1521.32931, new_q: -1509.91550
It: 5, loglik: -1182.71711, old_q: -1467.18978, new_q: -1453.08790
It: 6, loglik: -1161.67767, old_q: -1406.07877, new_q: -1392.22251
It: 7, loglik: -1145.77408, old_q: -1351.50258, new_q: -1340.56900
It: 8, loglik: -1134.64307, old_q: -1310.01184, new_q: -1302.25815
It: 9, loglik: -1126.90491, old_q: -1279.79561, new_q: -1274.38192
It: 10, loglik: -1121.58238, old_q: -1257.57727, new_q: -1253.80107
It: 11, loglik: -1118.13336, old_q: -1241.37212, new_q: -1238.85912
It: 12, loglik: -1116.10815, old_q: -1230.22501, new_q: -1228.71130
It: 13, loglik: -1115.02971, old_q: -1223.19301, new_q: -1222.38000
It: 14, loglik: -1114.49499, old_q: -1219.05875, new_q: -1

In [33]:
print(p[0,:])
print(p_hat[0,:])

[0.8 0.1 0.1]
[0.13908266 0.13139563 0.72952171]


## Datasete

Based on https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925


In [34]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)


In [2]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [9]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import nltk
nltk.download('wordnet')

stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
    

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hcorrada/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
processed_docs = []
for doc in newsgroups_train.data:
    processed_docs.append(preprocess(doc))

In [48]:
dictionary = gensim.corpora.Dictionary(processed_docs[:1000])
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=10000)

In [49]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1a2cf42780>

In [50]:
corpus = [dictionary.doc2bow(doc) for doc in processed_docs[:1000]]

In [51]:
doc = corpus[10]
for i in range(len(doc)):
    print ("Word {} (\"{}\") appears {} times.".format(doc[i][0],
                                                      dictionary[doc[i][0]],
                                                      doc[i][1]))

Word 10 ("model") appears 1 times.
Word 22 ("clock") appears 1 times.
Word 33 ("keyword") appears 1 times.
Word 44 ("summari") appears 1 times.
Word 56 ("email") appears 1 times.
Word 71 ("opinion") appears 1 times.
Word 93 ("worth") appears 1 times.
Word 106 ("nice") appears 1 times.
Word 160 ("hard") appears 1 times.
Word 255 ("owner") appears 1 times.
Word 265 ("bike") appears 2 times.
Word 266 ("motor") appears 1 times.
Word 267 ("paint") appears 1 times.
Word 268 ("recommend") appears 1 times.
Word 269 ("run") appears 1 times.
Word 270 ("sell") appears 1 times.


In [52]:
from gensim.matutils import corpus2csc
doc_mat = corpus2csc(corpus)
doc_mat

<1020x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 34157 stored elements in Compressed Sparse Column format>

In [54]:
res = plsa_em(doc_mat.T, num_topics=8)

It: 0, loglik: -356164.39166, old_q: -445294.19088, new_q: -444593.49228
It: 1, loglik: -356212.38503, old_q: -456918.65519, new_q: -458299.64060
It: 2, loglik: -356263.07078, old_q: -461133.32131, new_q: -461501.82351
It: 3, loglik: -356277.04034, old_q: -462430.57888, new_q: -462554.76819
It: 4, loglik: -356265.98999, old_q: -462891.15270, new_q: -462931.05887
It: 5, loglik: -356236.33629, old_q: -463017.95787, new_q: -463019.14347
It: 6, loglik: -356190.64085, old_q: -462969.68858, new_q: -462947.75935
It: 7, loglik: -356129.92763, old_q: -462804.70669, new_q: -462765.72119
It: 8, loglik: -356055.03684, old_q: -462547.13968, new_q: -462493.93246
It: 9, loglik: -355967.42125, old_q: -462209.55656, new_q: -462144.12826
It: 10, loglik: -355869.52741, old_q: -461802.44518, new_q: -461726.97359
It: 11, loglik: -355764.71032, old_q: -461338.41137, new_q: -461255.56957
It: 12, loglik: -355656.73868, old_q: -460833.14921, new_q: -460745.94736
It: 13, loglik: -355549.18129, old_q: -460304.25

KeyboardInterrupt: 