In [1]:
import jieba
import jieba.posseg as psg
import pandas as pd
import numpy as np
import re
from numba import jit
import time

In [2]:
dic_file = "word/dict.txt"
stop_file = "word/stopwords.txt"

In [28]:
def word_cut(text):
    jieba.load_userdict(dic_file) # 可以在jieba词典中添加新词
    jieba.initialize()
    try:
        stopword_list = open(stop_file,encoding ='utf-8')
    except:
        stopword_list = []
        print("error in stop_file")
    stop_list = []
    flag_list = ['n','nz','vn'] # 设定只需要名词、专有名词、动名词
    # 读取设定的停用词
    for line in stopword_list:
        line = re.sub(u'\n|\\r', '', line)
        stop_list.append(line)
    
    word_list = []
    #分词
    seg_list = psg.cut(text)
    for seg_word in seg_list:
        word = re.sub(u'[^\u4e00-\u9fa5]','',seg_word.word)
        find = 0
        # 不记录停用词以及小于两个字的词
        if word in stop_list or len(word) < 2:
            continue
        if seg_word.flag in flag_list:
            word_list.append(word) 
    return word_list

def text_handling(data):
    freq = {}
    word_list=[]
    for i in range(len(data)):
        word_list.append([])
        word_list[-1] = word_cut(data[i])
        for word in word_list[i]:
            if word in freq.keys():
                freq[word] += 1
            else:
                freq[word] = 1
    return word_list, freq

In [23]:
class LDATopicModel:
    
    def __init__(self, data, topic_num, alpha, beta):
        self.data = data
        self.text_num = len(data)
        self.topic_num = topic_num
        self.alpha = alpha
        self.beta = beta
        self.word_map = None
        self.word_list = None
        self.word_idx = None


    def preprocess(self, freq, drop_num):
        all_words = []
        
        for i in range(self.text_num):
            all_words = all_words + self.data[i]
            
        unique_words = list(set(all_words))
        unique_words.sort(key=lambda wd: freq[wd], reverse=True)
        self.word_list = unique_words[drop_num:]
        
        self.word_idx = {}
        for t, wd in enumerate(self.word_list):
            self.word_idx[wd] = t
            
        self.word_map = []
        for word_line in self.data:
            self.word_map.append([])
            for word in word_line:
                if word in self.word_list:
                    self.word_map[-1].append(self.word_idx[word])

    def fit(self, epochs):

        word_num = len(self.word_list)
        # 设置超参
        alpha = self.alpha * np.ones(self.topic_num)
        beta = self.beta * np.ones(word_num)
        # 初始化参数
        nd = np.zeros((self.text_num, self.topic_num)).astype(np.int64)
        mk = np.zeros((self.topic_num, word_num)).astype(np.int64)
        z  = [np.random.randint(0, self.topic_num, len(self.word_map[i])) for i in range(0, self.text_num)]
        new_z = [np.zeros(len(self.word_map[i])).astype(np.int64) for i in range(0, self.text_num)]
        # 统计变量
        for i, wordlist in enumerate(self.word_map):
            mk,nd = self.Gibbs_0(np.array(wordlist),mk,nd,i,np.array(z[i]))

        # 吉布斯采样
        for epoch in range(1, epochs+1):
            if epoch % 5 == 0:
                print('--第',epoch,'次迭代--')
            
            temp = (np.sum(mk, axis=1) + np.sum(beta))*(np.sum(nd) + np.sum(alpha))
            for i, wordlist in enumerate(self.word_map):
                new_z[i] = list(self.Gibbs_1(np.array(wordlist),temp,mk,beta,alpha,nd,i,np.array(new_z[i])))
                
            for i, wordlist in enumerate(self.word_map):
                mk,nd,z[i] = self.Gibbs_2(np.array(wordlist),mk,nd,np.array(z[i]),i,np.array(new_z[i]))

        # 结果
        self.phi = mk + np.broadcast_to(beta, (self.topic_num, word_num))
        self.phi = self.phi / np.transpose(np.broadcast_to(np.sum(self.phi, axis=1), (word_num, self.topic_num)))

        self.theta = nd + np.broadcast_to(alpha, (self.text_num, self.topic_num))
        self.theta = self.theta / np.transpose(np.broadcast_to(np.sum(self.theta, axis=1), (self.topic_num, self.text_num)))

            
    def topics_words(self, word_num):
        # 返回每个主题的前n个词语
        topic_word = np.argsort(-self.phi, axis=1)
        top_words = [[]] * self.topic_num
        for i in range(self.topic_num):
            top_words[i] = []
            for j in range(word_num):
                top_words[i].append(self.word_list[topic_word[i][j]])
        return top_words


    def text_topics(self, n):
        return np.argsort(-self.theta, axis=1)[:,:n]
    
    @staticmethod
    @jit(nopython=True)
    def Gibbs_0(wordlist,mk,nd,i,z):
        for j, word in enumerate(wordlist):
            nd[i, z[j]] += 1
            mk[z[j], word] += 1
        return mk, nd
    
    @staticmethod
    @jit(nopython=True)
    def Gibbs_1(wordlist,temp,mk,beta,alpha,nd,i,new_z):
        for j, word in enumerate(wordlist):
            p = (mk[:,word]+beta[word])*(nd[i,:]+alpha) / temp
            p = p / np.sum(p)
            if not (p > 0.).all():
                print(p)
            new_z[j] = np.argmax(np.random.multinomial(1, p))
        return new_z
    
    @staticmethod
    @jit(nopython=True)
    def Gibbs_2(wordlist, mk, nd, z, i, new_z):
        for j, word in enumerate(wordlist):
            mk[z[j], word] -= 1
            mk[new_z[j], word] += 1
            nd[i, z[j]] -= 1
            nd[i, new_z[j]] += 1
            z[j] = new_z[j]
        return mk, nd, z

In [5]:
data=pd.read_excel("data.xlsx")

In [29]:
word_list, freq = text_handling(data['content'])

In [37]:
lda = LDATopicModel(data=word_list,topic_num=8,alpha=0.3,beta=0.3)
start = time.time()
lda.preprocess(freq=freq,drop_num=0)
end1 = time.time()
lda.fit(epochs=50)
result = lda.topics_words(word_num=15)
end2 = time.time()
print("学习结束，预处理用时{}s, 迭代用时{}s。".format(end1-start,end2-end1))

--第 5 次迭代--
--第 10 次迭代--
--第 15 次迭代--
--第 20 次迭代--
--第 25 次迭代--
--第 30 次迭代--
--第 35 次迭代--
--第 40 次迭代--
--第 45 次迭代--
--第 50 次迭代--
学习结束，预处理用时2.664484977722168s, 迭代用时3.153074026107788s。


In [38]:
for i in range(8):
    print("Topic",i+1,":",result[i])

Topic 1 : ['游戏', '票房', '玩家', '公司', '电子竞技', '研究', '世界', '科学家', '任务', '航天飞机', '网游', '太空', '补偿', '技术', '消息']
Topic 2 : ['专家', '走势', '分析', '突破', '黄金', '股票', '大盘', '成本', '公司', '市场', '趋势', '股市', '风险', '后市', '投资']
Topic 3 : ['主持人', '电影', '比赛', '现场', '电子竞技', '总决赛', '观众', '演员', '项目', '战队', '赛区', '赛事', '冠军', '游戏', '影片']
Topic 4 : ['手机', '网站', '网游', '用户', '互联网', '政策', '电影', '公司', '网络', '视频', '预测', '市场', '业务', '文章', '票房']
Topic 5 : ['主队', '赔率', '市场', '海选', '公司', '客胜', '投资', '项目', '政策', '客队', '流感', '交通', '足彩', '博彩', '主场']
Topic 6 : ['比赛', '球队', '主场', '火箭', '球员', '客场', '篮板', '奇才', '联赛', '体育讯', '助攻', '本场', '湖人', '训练', '内线']
Topic 7 : ['项目', '建筑', '地产', '空间', '产品', '投资', '设计', '公司', '文化', '别墅', '市场', '国际', '房子', '主持人', '企业']
Topic 8 : ['学生', '大学', '电影', '学校', '专业', '国家', '移民', '教育', '孩子', '国际', '留学生', '研究', '记者', '影片', '银行']
