## （以自己为例，中文全流程总结）



### 1. 预处理之分词

In [None]:
import jieba
import jieba.posseg as pseg
import re

In [None]:
# Load the user-supplied dictionary into jieba.
jieba.load_userdict('your_userdict_txt')

# Read the stop-word list, one entry per line. The context manager closes the
# file handle, which the bare open() call used to leave open.
with open('your_stopwords_txt', 'r', encoding='utf8') as stopword_file:
    stopwords = [w.strip() for w in stopword_file]

In [None]:
# Segment the input file line by line into lists of tokens.

# Strip ASCII letters and digits before segmentation. The previous pattern had
# a second alternative, `/d+`, a typo for `\d+` (already covered by `0-9`) that
# deleted a literal "/" followed by "d" characters instead.
ascii_alnum = re.compile(r'[A-Za-z0-9]+')

# Set membership is O(1); the stop-word list would be scanned once per token.
stopword_set = set(stopwords)

tr = []  # one list of tokens per retained input line

# The context manager closes the input file, which was previously left open.
with open('your_txt_to_process', 'r', encoding='utf-8') as fr:
    for w in fr:
        # Remove every whitespace character (leading, trailing and inner).
        w = "".join(w.split())
        if not w:  # blank line
            continue

        w = ascii_alnum.sub('', w)

        # Precise-mode segmentation, then drop stop words. No tab check is
        # needed: all whitespace was removed from the line above.
        tokens = [
            word
            for word in jieba.lcut(w, cut_all=False)
            if word not in stopword_set
        ]

        # Skip lines with nothing left, as blank lines are skipped; appending
        # [''] here used to put an empty-string token into the corpus.
        if tokens:
            tr.append(tokens)

In [None]:
# Write the token lists to disk as one Python-literal string.

serialized = str(tr)
with open('your_out_txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(serialized)

### 结巴自带算法看下关键词

In [None]:
from jieba import analyse
# Alias for jieba.analyse.extract_tags. The visible cells call
# jieba.analyse.extract_tags directly rather than through this alias.
tfidf = analyse.extract_tags

In [None]:
# Read the segmented output back and extract the top 50 weighted keywords,
# restricted to the listed part-of-speech tags.
with open('your_out_txt', 'r', encoding='utf-8') as file:
    texts = file.readlines()

# Join the lines into plain text: str(texts) would hand jieba the repr of the
# list (extra brackets, quotes and escaped newlines) instead of its content.
keywords = jieba.analyse.extract_tags(
    "".join(texts),
    topK=50,
    withWeight=True,
    allowPOS=('nr', 'ns', 'nt', 'nz', 'n', 'vn', 'v'),
)
# No explicit close() is needed: the with-block above already closed the file.

### 2. 导入gensim 建 LDA

In [None]:
from gensim import corpora, models, similarities
from gensim.models import LdaModel
from gensim.models import ldaseqmodel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary, bleicorpus
import pyLDAvis.gensim
import numpy
from gensim.matutils import hellinger

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager

%matplotlib inline
# Select the SimHei sans-serif font (presumably so Chinese labels render) and
# enlarge the default font size for all plots.
matplotlib.rcParams.update({
    'font.sans-serif': ['Simhei'],
    'font.size': 22,
})

In [None]:
# Load the saved segmentation output as a list of lines.
with open('your_out_txt', mode='r', encoding='utf-8') as saved:
    texts = list(saved)

In [None]:
# Parse every saved line back into Python lists and flatten them into a
# single list of word lists.

import ast

tr = []
for item in texts:
    tr.extend(ast.literal_eval(item))

#### 选择合适的topic k

In [None]:
# Build the gensim dictionary from the token lists, then convert each
# document to its bag-of-words representation.
dictionary = corpora.Dictionary(documents=tr)
corpus = list(map(dictionary.doc2bow, tr))
# Placeholder for the dynamic topic model: fill it in here if DTM is needed.
time_slice = list()

In [None]:
# Baseline LDA model with a fixed number of 10 topics.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
)

In [None]:
# Score lda_model with the c_v coherence measure.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tr,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)

In [None]:
# Score the same lda_model again, this time with the UMass coherence measure.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tr,
    dictionary=dictionary,
    coherence="u_mass",
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3,
                             coherence='c_v', random_state=None):
    """
    Train one LDA model per candidate topic count and score its coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive, as in range())
    start : First number of topics to try
    step : Increment between successive topic counts
    coherence : Coherence measure passed to CoherenceModel (default 'c_v')
    random_state : Seed passed to LdaModel; None keeps its default seeding

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary,
                         num_topics=num_topics, random_state=random_state)
        model_list.append(model)
        scorer = CoherenceModel(model=model, texts=texts,
                                dictionary=dictionary, coherence=coherence)
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
# Candidate topic counts, defined once so that the training sweep and the
# plot's x-axis always use the same values.
start = 2
limit = 40
step = 6

model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary, corpus=corpus, texts=tr,
    start=start, limit=limit, step=step,
)

# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence: ("coherence_values") without a trailing comma
# is a plain string, so the legend was being built from its characters.
plt.legend(("coherence_values",), loc='best')
plt.show()

#### 可视化LDA

In [None]:
# Retrain LDA with the selected number of topics.
# NOTE(review): `k` is not assigned anywhere in the visible notebook; set it
# to the chosen topic count before running this cell.
lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=k)

In [None]:
# Prepare the pyLDAvis data for lda_model and display it in the notebook.
pyLDAvis.enable_notebook()
vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

In [None]:
# Save the prepared visualization to an HTML file.
pyLDAvis.save_html(vis_data, 'your_output.html')

#### dtm 动态

In [None]:
# Fit the dynamic topic model over the corpus.
# NOTE(review): `time_slice` is initialised as an empty list where it is
# defined and `k` is not assigned in the visible notebook; presumably both
# need real values before this cell is run.
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=k)

In [None]:
import pandas as pd

# Build one DataFrame from the printed topics of each of the three time
# periods (time indices 0, 1 and 2).

frame1, frame2, frame3 = (
    pd.DataFrame(ldaseq.print_topics(time=period)) for period in range(3)
)

In [None]:
# Stack the per-period frames row-wise into a single result table.
frames = [frame1, frame2, frame3]
project_dtm_result = pd.concat(frames, axis=0)

In [None]:
# Save the combined result to a CSV file, encoded with the utf_8_sig codec.

project_dtm_result.to_csv('paperDtmResult1.csv', encoding='utf_8_sig')

### 3. Yellowbrick可视化

#### Token frequency

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

from yellowbrick.text import FreqDistVisualizer

# Build a document-term count matrix from the raw lines in `texts`.
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

In [None]:
# Token frequency plot configured with n=30 and vertical orientation.
visualizer = FreqDistVisualizer(features=features, n=30, orient='v', size = (700,480), color = 'k')
visualizer.fit(docs)
# Tick styling is applied after fit() and before show().
plt.xticks(rotation=60)
plt.tick_params(labelsize = 16)
visualizer.show()

#### Dispersion plot


In [None]:
from yellowbrick.text import DispersionPlot

In [None]:
# Example: dispersion plot of a few chosen target words, fitted on the token
# lists in `tr`.

target_words = ['城市','路灯','大数据','农业', '物流']
visualizer = DispersionPlot(target_words)
visualizer.fit(tr)
visualizer.show()