In [1]:
import csv
import os
from collections import Counter
from random import seed

import gensim
import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, ldamodel
from pip import main
from scipy import stats
from tqdm import tqdm

In [None]:
# Seed NumPy's global random generator so that the np.random.choice
# sampling done in read_dict_corpus() is repeatable between runs.
np.random.seed(seed=23)


def read_dict_corpus():
    """Build the gensim dictionary and the TF-IDF weighted training corpus.

    The function:

    1. reads the unique, non-missing values of the ``seg`` column of
       ``all_text.csv`` and draws 10,000,000 documents from them with
       ``np.random.choice``; each document is split on whitespace;
    2. drops stop words (``stw_2000.txt``) and one-character tokens, writes
       the surviving tokens of each non-empty document as one line of
       ``data/train_data_2022.04.30.txt`` and feeds them to a ``Dictionary``;
    3. prunes the dictionary with ``filter_extremes`` and saves it to
       ``data/dict_2022.04.30.txt``;
    4. converts every sampled document to bag-of-words, discards documents
       whose bag is empty, and wraps the result in a TF-IDF transformation.

    Returns:
        tuple: ``(dictionary, corpus)`` where ``corpus`` is the object
        returned by ``tfidf[corpus]`` (the TF-IDF view of the bag-of-words
        list).
    """
    print("读取文件中 >>> >>>")
    # dropna(): an empty "seg" cell is read as float NaN, which has no
    # .split() and would crash the list comprehension below.
    unique_docs = pd.read_csv("all_text.csv")["seg"].dropna().unique()
    # NOTE(review): np.random.choice samples WITH replacement by default, so
    # the same document can be drawn several times even though the input was
    # de-duplicated -- confirm this is intended.
    data = [seg.split() for seg in np.random.choice(unique_docs, 10000000)]
    print("文本量：", len(data))

    # Stop-word list; the context manager closes the file handle.
    with open('stw_2000.txt', encoding='utf-8') as stw_file:
        stw = {line.strip() for line in stw_file}
    print("读取完成！")

    dictionary = Dictionary()

    os.makedirs("data", exist_ok=True)
    # Explicit UTF-8: the tokens are Chinese text and the platform default
    # encoding is not guaranteed to be able to represent them.
    with open("data/train_data_2022.04.30.txt", "w", encoding="utf-8") as f:
        for doc in tqdm(data):
            words = [w for w in doc if w not in stw and len(w) > 1]
            if words:
                f.write(" ".join(words) + "\n")
                dictionary.add_documents([words])

    dictionary.filter_extremes(no_below=100, no_above=0.5, keep_n=50000)
    dictionary.save_as_text("data/dict_2022.04.30.txt")

    # Build the bag-of-words corpus. Tokens that are not in the (filtered)
    # dictionary are ignored by doc2bow, so stop words never show up here;
    # documents left with an empty bag are dropped.
    n_docs = len(data)
    corpus = [bow for bow in map(dictionary.doc2bow, data) if bow]
    # Document count before and after dropping empty bags.
    print(n_docs, len(corpus))

    del data  # release the token lists before fitting TF-IDF

    # Fit TF-IDF on the bag-of-words corpus and re-weight every term with it.
    tfidf = models.TfidfModel(corpus)
    corpus = tfidf[corpus]

    return dictionary, corpus


def my_lda(dictionary, corpus, num_t):
    """Train an LDA model, save it, and export each topic's top terms.

    The model is saved to ``model/lda_{num_t}.model`` and the 50 most
    probable terms of every topic are written to ``data/topic_{num_t}.csv``
    with the columns ``topic_id``, ``term`` and ``prob``.

    Args:
        dictionary: Dictionary handed to ``LdaModel`` as ``id2word``.
        corpus: Corpus handed to ``LdaModel`` as the training data.
        num_t: Number of topics to fit.
    """
    print("lda training ... # of Topics:", num_t)
    trained = ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_t,
        passes=2,
        alpha='auto',
        eta='auto',
    )

    # Persist the fitted model; it can be reloaded later with
    # models.ldamodel.LdaModel.load(<same path>).
    model_path = f'model/lda_{num_t}.model'
    trained.save(model_path)
    print("保存模型！")

    # Dump (topic, term, probability) triples, one CSV row per term.
    topic_csv_path = f"data/topic_{num_t}.csv"
    with open(topic_csv_path, "w", encoding="utf-8", newline='') as out:
        writer = csv.DictWriter(out, fieldnames=["topic_id", "term", "prob"])
        writer.writeheader()
        for topic_id in range(num_t):
            top_terms = trained.show_topic(topic_id, topn=50)
            writer.writerows(
                {"topic_id": topic_id, "term": term, "prob": prob}
                for term, prob in top_terms
            )
            
            

def apply_my_lda(data_pth, lda_model_pth):
    """Assign each document of one CSV file to its dominant LDA topic.

    Every non-missing ``seg`` value is split on whitespace, converted to
    bag-of-words with the dictionary saved at ``data/dict_2022.04.30.txt``
    and scored with the model; the topic with the highest probability is
    counted for that document.

    Args:
        data_pth: Path of a CSV file with a ``seg`` column (whitespace
            separated tokens) and a ``blog_time`` column.
        lda_model_pth: Path of the saved ``LdaModel`` to apply.

    Returns:
        pandas.DataFrame: a single row (index ``-1``) with the column
        ``date`` (the ``blog_time`` of the first row of the file) followed
        by ``topic0`` ... ``topic{K-1}``, where ``K`` is the model's
        ``num_topics`` and each value is the number of documents assigned
        to that topic.

    Raises:
        ValueError: if the CSV file contains no rows.
    """
    # Honour the caller-supplied path; it used to be ignored in favour of a
    # hard-coded 'model/lda_6.model'.
    lda = models.ldamodel.LdaModel.load(lda_model_pth)
    dictionary = Dictionary.load_from_text("data/dict_2022.04.30.txt")

    df = pd.read_csv(data_pth)
    if df.empty:
        raise ValueError(f"no rows in {data_pth}")
    blog_time = df['blog_time'].iloc[0]

    topic_counts = Counter()
    for text in df['seg']:
        # An empty cell is read as float NaN and cannot be tokenised, so it
        # is left out of the counts.
        if not isinstance(text, str):
            continue
        doc_bow = dictionary.doc2bow(text.split())
        doc_topics = dict(lda.get_document_topics(doc_bow))
        # get_document_topics drops topics below its minimum probability;
        # guard against max() on an empty mapping.
        if not doc_topics:
            continue
        # Topic with the highest probability for this document.
        topic_counts[max(doc_topics, key=doc_topics.get)] += 1

    result_dict = {'date': blog_time}
    for topic_id in range(lda.num_topics):
        result_dict[f'topic{topic_id}'] = topic_counts[topic_id]

    return pd.DataFrame(result_dict, index=[-1])

    
if __name__ == "__main__":
    # Training pipeline (run once; uncomment to retrain the models):
    # dictionary, corpus = read_dict_corpus()
    # my_lda(dictionary, corpus, 4)
    # my_lda(dictionary, corpus, 6)

    # Apply the saved 6-topic model to every CSV file under base_pth.
    base_pth = 'data/break_text/'
    # Keep only *.csv entries (os.listdir also returns directories such as
    # .ipynb_checkpoints) and sort them so the output row order does not
    # depend on the arbitrary order os.listdir returns.
    csv_list = sorted(
        os.path.join(base_pth, name)
        for name in os.listdir(base_pth)
        if name.lower().endswith('.csv')
    )

    columns = ['date', 'topic0', 'topic1', 'topic2', 'topic3', 'topic4', 'topic5']

    # DataFrame.append was removed in pandas 2.0: collect the per-file rows
    # and concatenate them once at the end.
    frames = []
    for _csv in csv_list:
        print(f"正在执行:{_csv}")
        frames.append(apply_my_lda(_csv, "model/lda_6.model"))

    if frames:
        target_csv = pd.concat(frames)
    else:
        # No input files: still write a header-only CSV.
        target_csv = pd.DataFrame(columns=columns)

    target_csv.to_csv('data/lda_classification.csv', index=False)