In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import swifter
import numpy as np

In [2]:
# English stop words, de-duplicated. Kept as a list so that any other code
# relying on `stop_words` being a list keeps working.
stop_words = list(set(stopwords.words('english')))

# Built once: creating a lemmatizer per token is pure overhead.
_LEMMATIZER = WordNetLemmatizer()
# string.punctuation contains regex metacharacters (backslash, ']', '^', '-'),
# so it has to be escaped before being embedded in a character class.
_PUNCT_OR_DIGIT_RE = re.compile(r'[{}0-9]'.format(re.escape(string.punctuation)))
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9 ]+')


def preprocess_text(text):
    """Normalise a raw document into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. replace every ASCII punctuation character and digit with a space;
      3. replace every run of characters other than ASCII letters, digits and
         spaces (e.g. newlines, non-ASCII symbols) with a single space;
      4. tokenise with ``word_tokenize``;
      5. drop tokens found in the module-level ``stop_words``;
      6. lemmatise each remaining token with ``WordNetLemmatizer``.

    Args:
        text: the document as a ``str``.

    Returns:
        The processed tokens joined by single spaces (may be an empty string).
    """
    text = text.lower()
    text = _PUNCT_OR_DIGIT_RE.sub(' ', text)
    text = _NON_ALNUM_RE.sub(' ', text)
    # Snapshot the stop words into a set for O(1) membership tests; it is
    # rebuilt per call so later changes to `stop_words` are still honoured.
    stop_set = frozenset(stop_words)
    tokens = [tok for tok in word_tokenize(text) if tok not in stop_set]
    return ' '.join(_LEMMATIZER.lemmatize(tok) for tok in tokens)

In [3]:
# Fetch the full 20 Newsgroups corpus with headers, footers and quoted replies removed.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({"content": newsgroups["data"]})

# Normalise every post, then measure the length (in characters) of the cleaned text.
df["content"] = df["content"].swifter.apply(preprocess_text)
df["content_length"] = df["content"].str.len()

# Keep only posts whose cleaned length is strictly between 100 and 2000 characters.
df = df[df["content_length"].gt(100) & df["content_length"].lt(2000)]

# Keep just the text, renumber the rows from 0 and expose that numbering as an "id" column.
df = (
    df[["content"]]
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={"index": "id"})
)
documents_20newsgroup = df["content"].to_list()

Pandas Apply:   0%|          | 0/18846 [00:00<?, ?it/s]

In [3]:
def remove_punct(text):
    """Return ``text`` with ASCII punctuation and digits removed.

    Every character listed in ``string.punctuation`` is deleted, then every
    run of the digits 0-9 is deleted. Nothing is inserted in place of the
    removed characters, so surrounding whitespace is left as it was.

    Args:
        text: the input ``str``.

    Returns:
        The cleaned string.
    """
    punctuation_free = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', punctuation_free)

def count_remover(text, threshold=4):
    """Pass ``text`` through only if it has at least ``threshold`` words.

    Words are counted by splitting on whitespace.

    Args:
        text: the input ``str``.
        threshold: minimum number of whitespace-separated words required.

    Returns:
        ``text`` unchanged when it is long enough, otherwise ``pd.NaT``.
        NOTE(review): ``pd.NaT`` is pandas' datetime null; it is used here
        only as a marker that a later ``dropna()`` can remove.
    """
    word_count = len(text.split())
    return text if word_count >= threshold else pd.NaT

# Load the tweet dump (JSON Lines: one object per line) and keep only the
# text and the creation timestamp.
data = pd.read_json('tweets.json', lines=True)
df = data[["Text", "CreatedAt"]].rename(columns={"Text": "content", "CreatedAt": "time"})

# Strip @mentions. `regex=True` must be explicit: from pandas 2.0 the default
# is regex=False, which would treat the pattern as a literal string and
# silently leave every mention in place.
df['content'] = df['content'].str.replace(r'@\w+', '', regex=True)
# Strip URLs (anything starting with "http" up to the next whitespace).
df['content'] = df['content'].apply(lambda x: re.sub(r"http\S+", "", x))
df['content'] = df['content'].apply(remove_punct)
# count_remover swaps tweets with fewer than 4 words for a null marker;
# dropna() then removes those rows (and any row with a null in another column).
df['content'] = df['content'].apply(count_remover)
df = df.dropna()
df["content"] = df["content"].swifter.apply(preprocess_text)
df = df.dropna()
documents_EM_tweets = df["content"].tolist()

Pandas Apply:   0%|          | 0/14268 [00:00<?, ?it/s]

In [4]:
import time
import jieba

def preprocessing(documents):
    """Tokenise documents with jieba and encode the kept tokens as integer ids.

    Each token is lower-cased and stripped. A token is kept only if it is
    longer than one character, contains no digit, and is not in the
    module-level ``stop_words``. Ids are assigned in order of first
    appearance, starting at 0.

    Args:
        documents: iterable of document strings.

    Returns:
        A tuple ``(docs, word2id, id2word)`` where ``docs`` is a list with one
        list of word ids per input document (possibly empty), ``word2id`` maps
        each kept word to its id and ``id2word`` is the inverse mapping.
    """
    # Snapshot the stop words into a set: membership is tested once per token,
    # and scanning a list each time is needlessly linear.
    stop_set = frozenset(stop_words)

    word2id = {}
    id2word = {}
    docs = []

    for document in documents:
        encoded = []
        for token in jieba.cut(document):
            word = token.lower().strip()
            # Keep only words longer than one character that contain no digit
            # and are not stop words.
            if len(word) <= 1 or re.search('[0-9]', word) or word in stop_set:
                continue
            word_id = word2id.get(word)
            if word_id is None:
                # First occurrence: ids are dense, so the next free id is the
                # current vocabulary size.
                word_id = len(word2id)
                word2id[word] = word_id
                id2word[word_id] = word
            encoded.append(word_id)
        docs.append(encoded)
    return docs, word2id, id2word

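# Initialisation: draw a topic for every word from a multinomial over topics (uniform at the very first draw, when all counts equal their priors) and update the counts for each sampled topic.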
def randomInitialize():
    """Assign an initial topic to every word and record it in the count tables.

    Operates on module-level state: reads ``docs``; reads and updates ``ndz``,
    ``nzw`` and ``nz`` in place; appends one list of topic assignments per
    document to ``Z``.

    For each word the topic is drawn with weights
    ``ndz[d, :] * nzw[:, w] / nz`` (normalised to sum to 1), and the three
    count tables are incremented for the drawn topic before the next word is
    processed.
    """
    for doc_idx, doc in enumerate(docs):
        doc_topics = []
        for word_id in doc:
            weights = ndz[doc_idx, :] * nzw[:, word_id] / nz
            # One multinomial trial gives a one-hot vector; argmax is the drawn topic.
            topic = np.random.multinomial(1, weights / weights.sum()).argmax()
            doc_topics.append(topic)
            ndz[doc_idx, topic] += 1
            nzw[topic, word_id] += 1
            nz[topic] += 1
        Z.append(doc_topics)

# Gibbs sampling
def gibbsSampling():
    """Run one Gibbs sweep: resample the topic of every word in every document.

    Operates on module-level state: reads ``docs``; reads and updates ``Z``,
    ``ndz``, ``nzw`` and ``nz`` in place. Returns nothing.
    """
    for doc_idx, doc in enumerate(docs):
        assignments = Z[doc_idx]
        for pos, word_id in enumerate(doc):
            old_topic = assignments[pos]
            # Remove this word's current assignment from all counts.
            ndz[doc_idx, old_topic] -= 1
            nzw[old_topic, word_id] -= 1
            nz[old_topic] -= 1
            # Recompute the (unnormalised) probability of each topic for this word.
            weights = ndz[doc_idx, :] * nzw[:, word_id] / nz
            # Draw the new topic from the normalised distribution.
            new_topic = np.random.multinomial(1, weights / weights.sum()).argmax()
            assignments[pos] = new_topic
            # Add the new assignment back into all counts.
            ndz[doc_idx, new_topic] += 1
            nzw[new_topic, word_id] += 1
            nz[new_topic] += 1

def perplexity():
    """Compute the perplexity of ``docs`` under the current count tables.

    Reads the module-level ``docs``, ``ndz``, ``nzw`` and ``nz``. For every
    word occurrence the likelihood is the sum over topics of
    ``(nzw[:, w] / nz) * (ndz[d, :] / nd[d])``, where ``nd`` is the row sum of
    ``ndz``.

    Returns:
        ``exp(-log_likelihood / token_count)`` over all word occurrences.
    """
    doc_totals = np.sum(ndz, 1)
    token_count = 0
    log_likelihood = 0.0
    for doc_idx, doc in enumerate(docs):
        doc_topic_probs = ndz[doc_idx, :] / doc_totals[doc_idx]
        for word_id in doc:
            word_topic_probs = nzw[:, word_id] / nz
            log_likelihood = log_likelihood + np.log((word_topic_probs * doc_topic_probs).sum())
            token_count = token_count + 1
    return np.exp(log_likelihood / (-token_count))

In [5]:
# Encode the cleaned tweets as lists of word ids and build the vocabulary
# mappings (word -> id and id -> word) that the sampler cells read as globals.
docs, word2id, id2word = preprocessing(documents_EM_tweets)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/db/frmp2tzj72s37hdwvvdzwr7m0000gn/T/jieba.cache
Loading model cost 0.345 seconds.
Prefix dict has been built successfully.


In [9]:
import os
import pickle

# --- Hyper-parameters -------------------------------------------------------
alpha = 5               # added to every document-topic count (ndz)
beta = 0.1              # added to every topic-word count (nzw)
iterationNum = 50       # number of Gibbs sweeps
K = 30                  # number of topics
maxTopicWordsNum = 10   # number of top words exported per topic
SEED = 42               # fixes the sampler's random draws so reruns are reproducible

np.random.seed(SEED)

# --- State shared with randomInitialize / gibbsSampling / perplexity --------
Z = []                  # Z[d][i] = topic currently assigned to word i of document d

N = len(docs)
M = len(word2id)
ndz = np.zeros([N, K]) + alpha
nzw = np.zeros([K, M]) + beta
nz = np.zeros([K]) + M * beta

# --- Training ----------------------------------------------------------------
randomInitialize()
for i in range(0, iterationNum):
    gibbsSampling()
    print(time.strftime('%X'), "Iteration: ", i, " Completed", " Perplexity: ", perplexity())

# --- Extract the highest-weighted words of each topic ------------------------
topicwords = []
for z in range(0, K):
    # argsort is ascending; reverse it and keep only the leading entries rather
    # than building the full reversed vocabulary list for every topic.
    ids = nzw[z, :].argsort()[::-1][:maxTopicWordsNum]
    topicwords.append([id2word[j] for j in ids])

# --- Persist -------------------------------------------------------------------
# Create the output directory first so a missing folder cannot discard the
# result of the whole sampling run.
os.makedirs("results", exist_ok=True)
with open("results/lda_gibbs_EM_tweets_30_topics", "wb") as fp:   # Pickling
    pickle.dump(topicwords, fp)

10:54:17 Iteration:  0  Completed  Perplexity:  3581.7894045866046
10:54:18 Iteration:  1  Completed  Perplexity:  3580.1305484179575
10:54:19 Iteration:  2  Completed  Perplexity:  3579.588090697878
10:54:20 Iteration:  3  Completed  Perplexity:  3577.695454761753
10:54:22 Iteration:  4  Completed  Perplexity:  3577.728074228838
10:54:23 Iteration:  5  Completed  Perplexity:  3575.9518054147943
10:54:24 Iteration:  6  Completed  Perplexity:  3576.6773707918
10:54:25 Iteration:  7  Completed  Perplexity:  3577.083199278167
10:54:26 Iteration:  8  Completed  Perplexity:  3576.406269769372
10:54:27 Iteration:  9  Completed  Perplexity:  3575.2587023215215
10:54:28 Iteration:  10  Completed  Perplexity:  3574.3248961222007
10:54:30 Iteration:  11  Completed  Perplexity:  3573.460412861076
10:54:31 Iteration:  12  Completed  Perplexity:  3571.7440176716495
10:54:32 Iteration:  13  Completed  Perplexity:  3571.743058493773
10:54:33 Iteration:  14  Completed  Perplexity:  3572.962261402745
1