In [7]:
import numpy as np
import time
import codecs
import jieba
import re

In [8]:
# Preprocessing (word segmentation, remove stop words, assign a number to each word, the document is represented by a list of word numbers)
def preprocessing(stopwords_path='stopwords.dic', dataset_path='newsgroups.json'):
    """Tokenize the corpus and encode every document as a list of word ids.

    Each line of the dataset file is treated as one document. Lines are
    segmented with ``jieba.cut``; every token is lower-cased and stripped, and
    kept only if it is longer than one character, contains no ASCII digit and
    is not a stop word. Word ids are handed out in order of first appearance,
    starting at 0.

    Args:
        stopwords_path: UTF-8 text file with one stop word per line.
        dataset_path: UTF-8 text file with one document per line.

    Returns:
        A tuple ``(docs, word2id, id2word)`` where ``docs`` is a list with one
        list of word ids per input line (possibly empty), ``word2id`` maps
        word -> id and ``id2word`` maps id -> word.
    """
    # A set gives O(1) membership tests; the check runs once per token.
    with codecs.open(stopwords_path, 'r', 'utf-8') as stopword_file:
        stopwords = {line.strip() for line in stopword_file}

    # NOTE(review): the dataset is consumed as raw text lines even though the
    # default file is named *.json. Tokens such as 'nsubject' / 'nlines' in the
    # recorded topic output suggest escaped "\n" sequences end up glued to
    # words - confirm the file format and consider parsing it as JSON first.
    with codecs.open(dataset_path, 'r', 'utf-8') as dataset_file:
        documents = [line.strip() for line in dataset_file]

    word2id = {}
    id2word = {}
    docs = []
    contains_digit = re.compile('[0-9]')

    for document in documents:
        encoded = []
        # Word segmentation
        for token in jieba.cut(document):
            word = token.lower().strip()
            # Skip very short tokens, tokens with digits and stop words.
            if len(word) <= 1 or contains_digit.search(word) or word in stopwords:
                continue
            word_id = word2id.get(word)
            if word_id is None:
                # First occurrence: the next free id is the current vocabulary size.
                word_id = len(word2id)
                word2id[word] = word_id
                id2word[word_id] = word
            encoded.append(word_id)
        docs.append(encoded)
    return docs, word2id, id2word
    

In [9]:
# Initialization: draw a topic for every word from a multinomial whose weights come from the current counts (same formula as the Gibbs step; uniform only while the counts hold nothing but the priors), then update the counts of the sampled topic
def randomInitialize():
    """Draw a starting topic for every token and record it in the shared state.

    Walks the module-level ``docs`` token by token. For each token the topic
    weights are ``ndz[d, :] * nzw[:, w] / nz`` taken from the counts as they
    stand at that moment; one topic is sampled from the normalized weights and
    ``ndz``, ``nzw`` and ``nz`` are incremented for it right away. One list of
    sampled topics per document is appended to the module-level ``Z``.
    """
    for doc_index, tokens in enumerate(docs):
        assignments = []
        for word_id in tokens:
            weights = ndz[doc_index, :] * nzw[:, word_id] / nz
            probabilities = weights / weights.sum()
            # A single multinomial trial yields a one-hot vector; argmax is the drawn topic.
            topic = np.random.multinomial(1, probabilities).argmax()
            ndz[doc_index, topic] += 1
            nzw[topic, word_id] += 1
            nz[topic] += 1
            assignments.append(topic)
        Z.append(assignments)

In [10]:
# gibbs sampling
def gibbsSampling():
    """Run one sweep that resamples the topic of every token in ``docs``.

    For each token the current assignment stored in ``Z`` is removed from the
    module-level counts ``ndz``, ``nzw`` and ``nz``, a new topic is sampled
    from the normalized weights ``ndz[d, :] * nzw[:, w] / nz``, and the new
    assignment is written back to ``Z`` and added to the counts. All state is
    updated in place; nothing is returned.
    """
    for doc_index, tokens in enumerate(docs):
        for position, word_id in enumerate(tokens):
            old_topic = Z[doc_index][position]

            # Take this token out of the counts before computing its weights.
            ndz[doc_index, old_topic] -= 1
            nzw[old_topic, word_id] -= 1
            nz[old_topic] -= 1

            # Topic weights for this token given every other assignment.
            weights = ndz[doc_index, :] * nzw[:, word_id] / nz
            probabilities = weights / weights.sum()
            new_topic = np.random.multinomial(1, probabilities).argmax()

            # Store the new assignment and put the token back into the counts.
            Z[doc_index][position] = new_topic
            ndz[doc_index, new_topic] += 1
            nzw[new_topic, word_id] += 1
            nz[new_topic] += 1

In [11]:
def perplexity():
    """Return the perplexity of ``docs`` under the current count tables.

    For every token the likelihood is the sum over topics of
    ``(nzw[:, w] / nz) * (ndz[d, :] / row_total_of_ndz[d])``. The result is
    ``exp(-(sum of log-likelihoods) / number_of_tokens)``. Reads the
    module-level ``docs``, ``ndz``, ``nzw`` and ``nz``; modifies nothing.
    """
    doc_totals = ndz.sum(axis=1)
    token_count = 0
    log_likelihood = 0.0
    for doc_index, tokens in enumerate(docs):
        for word_id in tokens:
            word_given_topic = nzw[:, word_id] / nz
            topic_given_doc = ndz[doc_index, :] / doc_totals[doc_index]
            log_likelihood += np.log((word_given_topic * topic_given_doc).sum())
            token_count += 1
    return np.exp(-log_likelihood / token_count)

In [12]:
# --- Hyperparameters and run settings ---
alpha = 5               # prior pseudo-count added to every document-topic cell of ndz
beta = 0.1              # prior pseudo-count added to every topic-word cell of nzw
iterationNum = 5        # number of full Gibbs sweeps over the corpus
K = 10                  # number of topics
maxTopicWordsNum = 10   # how many top words to keep per topic
randomSeed = 0          # fixed seed so a fresh "Restart & Run All" repeats the same run

# Seed NumPy's global generator, which randomInitialize/gibbsSampling draw from.
np.random.seed(randomSeed)

# --- Corpus and count tables (read as globals by the functions defined above) ---
Z = []                                # per-document lists of topic assignments
docs, word2id, id2word = preprocessing()
N = len(docs)                         # number of documents
M = len(word2id)                      # vocabulary size
ndz = np.zeros([N, K]) + alpha        # document-topic counts, pre-filled with alpha
nzw = np.zeros([K, M]) + beta         # topic-word counts, pre-filled with beta
nz = np.zeros([K]) + M * beta         # per-topic totals, i.e. the row sums of nzw

# --- Training ---
randomInitialize()
for i in range(0, iterationNum):
    gibbsSampling()
    print(time.strftime('%X'), "Iteration: ", i, " Completed", " Perplexity: ", perplexity())

# --- Top words per topic ---
topicwords = []
for z in range(0, K):
    # argsort is ascending, so walk it backwards to get the highest counts first.
    ids = nzw[z, :].argsort()
    topicwords.append([id2word[j] for j in ids[::-1][:maxTopicWordsNum]])

00:35:17 Iteration:  0  Completed  Perplexity:  9714.22247673278
00:36:14 Iteration:  1  Completed  Perplexity:  9712.289454574417
00:37:07 Iteration:  2  Completed  Perplexity:  9710.751329046383
00:38:10 Iteration:  3  Completed  Perplexity:  9709.177530021467
00:39:09 Iteration:  4  Completed  Perplexity:  9707.924082904106


In [14]:
# Display the list of top words collected for each topic.
topicwords

[['nm', 'ni', 've', 'mail', 'nthat', 'time', 'usa', 'public', 'david', 'nto'],
 ['ibm',
  'memory',
  'won',
  'top',
  'time',
  'tv',
  'freenet',
  'america',
  'input',
  'designed'],
 ['nlines',
  'writes',
  'article',
  'people',
  'don',
  'university',
  'host',
  'posting',
  'windows',
  'cs'],
 ['nhouston',
  'gloves',
  'ireland',
  'modifications',
  'td',
  'asserts',
  'paslawski',
  'beware',
  'resurection',
  '+-----+'],
 ['computer',
  'bit',
  'mit',
  'write',
  'nhave',
  'nby',
  'atheists',
  'cut',
  'happened',
  'acs'],
 ['nof',
  'god',
  'ndistribution',
  'power',
  'info',
  'files',
  'list',
  'time',
  'answer',
  'nnntp'],
 ['christ',
  'coding',
  'seed',
  'nt',
  'humanity',
  'wrist',
  'sspx',
  'tif',
  'computer',
  'frej'],
 ['lifetime',
  'questor',
  'nicholas',
  'plays',
  'nrespect',
  'sepinwal',
  'method',
  'crude',
  'snefru',
  'pollution'],
 ['ax',
  'nsubject',
  'norganization',
  'nthe',
  'nin',
  'nx',
  'nmax',
  'nnntp',
  