In [None]:
import pandas as pd
import jieba
from jieba import posseg
from jieba import analyse
import pyLDAvis 
import pyLDAvis.sklearn
import ipywidgets
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import gensim
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary
from gensim.models import AuthorTopicModel
import matplotlib.pyplot as plt 
from wordcloud import WordCloud
from collections import Counter
from pylab import mpl
import nltk
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
# import codecs
# from textrank4zh import TextRank4Keyword, TextRank4Sentence
# Add 'pm2.5' to jieba's dictionary.
# NOTE(review): presumably meant to keep the term as a single token even though
# it contains a dot — confirm against the actual segmentation output.
jieba.add_word('pm2.5')

In [None]:
# Column names applied to every source CSV; they replace whatever header the
# file itself carries.
PAPER_COLUMNS = ['作者', '文章名称', '关键词', '摘要', '发表位置', '时间', '单位或其他', '下载量', '被引量']


def _load_papers(path, limit=1000):
    """Read one CSV export as an all-string frame with the shared column names.

    The path is handed straight to ``pd.read_csv`` so pandas opens and closes
    the file itself; the previous ``open(...)`` handles were never closed.

    Args:
        path: CSV file to read (decoded as UTF-8).
        limit: Number of leading rows to keep.

    Returns:
        DataFrame whose values are all ``str`` (missing cells become the
        literal string ``'nan'``) and whose columns are ``PAPER_COLUMNS``.
    """
    frame = pd.read_csv(path, encoding='utf-8').astype(str).head(limit)
    frame.columns = PAPER_COLUMNS
    return frame


data1 = _load_papers('大气.csv')
data2 = _load_papers('水环境.csv')
data3 = _load_papers('土环境.csv')

In [None]:
# Write each loaded frame back out; "utf_8_sig" prepends a BOM to the file.
for out_name, frame in (('1.csv', data1), ('2.csv', data2), ('3.csv', data3)):
    frame.to_csv(out_name, encoding="utf_8_sig")

In [None]:
# Stack the three frames row-wise under a fresh 0..n-1 index.
frames = [data1, data2, data3]
data = pd.concat(frames, axis=0, ignore_index=True)
data

In [None]:
# Drop rows whose author or keyword field is missing. The frames were cast
# with astype(str), so a missing cell is the literal string 'nan'.
# A boolean mask replaces the old `Index | Index` expression: using `|` as a
# set union on Index objects is deprecated and no longer means "union" in
# pandas 2.x.
missing_mask = (data['作者'] == 'nan') | (data['关键词'] == 'nan')
data = data[~missing_mask]
# reset_index() keeps the previous row labels in an 'index' column, as before.
data = data.drop_duplicates().reset_index()
data

In [None]:
# 处理单一文本分词
# Load the stop-word list, one entry per line. Only the trailing newline is
# removed, so entries that are themselves whitespace or punctuation survive.
# A set is used because `stop` is only consulted with `in` / `not in`, once
# per token, and set membership is O(1).
with open(r'hit_stopwords.txt', 'r', encoding='utf-8') as f:
    stop = {line.strip('\n') for line in f}
def chinese_word_cut(text):
    """Segment one text with jieba and return the kept tokens joined by spaces.

    A token is kept when it is not in the module-level ``stop`` collection and
    its part-of-speech flag is one of the selected tags.

    Args:
        text: Raw text. The literal string ``'nan'`` (what ``astype(str)``
            produces for a missing cell) is treated as empty.

    Returns:
        Space-separated string of the surviving lower-cased tokens.
    """
    # Only a cell that is entirely 'nan' is a missing value. The previous
    # blanket replace('nan', '') also cut "nan" out of real words such as
    # "nanometer".
    if text.strip() == 'nan':
        return ''
    text = text.replace('\n', '').lower()
    pos = {'n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd', 'x', 'eng', 'ns'}  # POS tags to keep
    kept = [
        token.word
        for token in posseg.cut(text)  # word segmentation with POS tagging
        if token.word not in stop and token.flag in pos  # stop-word + POS filter
    ]
    return ' '.join(kept)

In [None]:
def key_word_cut(text):
    """Turn a '/'-separated keyword string into a space-separated one.

    Newline characters are removed first; every '/' then becomes a single
    space, which is what splitting on '/' and re-joining with ' ' produces.
    """
    without_newlines = text.replace('\n', '')
    return without_newlines.replace('/', ' ')

In [None]:
# Derive the two tokenised columns used by the rest of the notebook:
# keywords from the '关键词' column, segmented abstract text from '摘要'.
data['key_words'] = data['关键词'].apply(key_word_cut)
data['content_cutted'] = data['摘要'].apply(chinese_word_cut)

In [None]:
def date_process(text):
    """Return the first four characters of ``text`` after removing newlines and tabs."""
    cleaned = text.replace('\n', '').replace('\t', '')
    return cleaned[:4]
# Keep the first four cleaned characters of the '时间' column as 'date'.
data['date'] = data['时间'].apply(date_process)

In [None]:
# 'date' holds the first four characters of the time field, so parse it
# explicitly as a year instead of relying on format inference. Values that are
# not a four-digit year become NaT rather than aborting the cell.
data['date'] = pd.to_datetime(data['date'], format='%Y', errors='coerce')
# Index by date so later cells can select rows by year.
data = data.set_index('date')

In [None]:
# Rows dated 2015. Selecting rows by a partial date string must go through
# .loc; the `data['2015']` form was deprecated and then removed in pandas 2.0,
# where it is looked up as a column name instead.
data.loc['2015']

In [None]:
# Count every keyword across all papers and list them by frequency.
all_keywords = ' '.join(data.key_words.tolist()).split()
cnt = Counter(all_keywords)
cnt.most_common()

In [None]:
# Number of papers per publication year, 2014-2020.
# Rows are counted by comparing the DatetimeIndex year directly. This avoids
# the removed `data['<year>']` row lookup, yields 0 for a year with no rows,
# and replaces the old `list(set(...))[0]` trick, which picked an arbitrary
# element of a set of per-column counts.
years = list(range(2014, 2021))
count = [int((data.index.year == year).sum()) for year in years]
plt.bar(years, count, facecolor='g', edgecolor='r')
plt.show()

In [None]:
# Keyword frequencies over the whole corpus (lower-cased); the 20 most common
# keywords are drawn as a ring chart.
lowered_keywords = ' '.join(data.key_words.tolist()).lower().split()
cnt = Counter(lowered_keywords)
topic = dict(cnt.most_common(20))
# Font setting for the labels (KaiTi).
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.figure(figsize=(5, 5))
plt.pie(
    topic.values(),
    labels=topic.keys(),
    radius=1,
    wedgeprops=dict(width=0.4, edgecolor='w'),
)
plt.show()

In [None]:
# 自定义关键字  按年份统计发文数量
%matplotlib inline
toplist = ['pm2.5',  '空气质量', '大气污染', '雾霾', '细颗粒物', '水污染']
topicdict = {}
years = []
for year in range(2014, 2021):
    years.append(year)
    cnt = Counter((' '.join((data[str(year)].key_words).tolist()).lower().split()))
    cnt
    topic = dict(Counter(cnt).most_common())
    for word in toplist:
        if word not in topicdict.keys():
            topicdict[word] = []
        try:
            topicdict[word].append(topic[word])
        except:
            topicdict[word].append(0)

plt.figure(figsize=(15,7))
for word in topicdict.keys():
    plt.plot(years,topicdict[word],label=word,linewidth=2.0,linestyle='--', )
plt.legend()
plt.show()

In [None]:
# Bar chart of the ten most frequent values in the '发表位置' column.
venue_counts = data['发表位置'].value_counts()
qikan = venue_counts[:10]
name, value = qikan.index.tolist(), qikan.values
plt.bar(name, value, facecolor='g', edgecolor='r')
plt.xticks(rotation=45)
plt.show()

In [None]:
# 根据关键字查找 
def get_paper(label, regex=False):
    """Return the titles of papers whose keyword string contains ``label``.

    Args:
        label: Text to look for in the module-level ``data['key_words']``.
        regex: When False (default) ``label`` is matched literally, so a
            keyword such as 'pm2.5' no longer matches 'pm2x5' through the
            regex wildcard '.'. Pass True to search with a regular expression.

    Returns:
        The '文章名称' column restricted to the matching rows.
    """
    matches = data['key_words'].str.contains(label, regex=regex)
    return data[matches]['文章名称']

In [None]:
# Titles of papers whose keywords mention '空气污染'.
get_paper(label='空气污染')

In [None]:
# TextRank over the concatenated keyword strings; the same analysis can be run
# on a per-year or other subset of papers.
allow_pos = ('n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd', 'x', 'eng')  # POS tags to keep
keywords = jieba.analyse.textrank(' '.join(data.key_words.tolist()), topK=100,
                                  withWeight=True, allowPOS=allow_pos)
# With withWeight=True the result is a list of (word, weight) pairs. Building
# the frame from it directly also yields an empty two-column table when
# nothing is extracted, where the previous from_dict/rename chain raised.
df = pd.DataFrame(keywords, columns=['key_words', 'weights'])
df

In [None]:
# TextRank over the concatenated segmented abstracts; the same analysis can be
# run on a per-year or other subset of papers.
allow_pos = ('n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd', 'x', 'eng')  # POS tags to keep
keywords = jieba.analyse.textrank(' '.join(data.content_cutted.tolist()), topK=100,
                                  withWeight=True, allowPOS=allow_pos)
# With withWeight=True the result is a list of (word, weight) pairs. Building
# the frame from it directly also yields an empty two-column table when
# nothing is extracted, where the previous from_dict/rename chain raised.
df = pd.DataFrame(keywords, columns=['key_words', 'weights'])
df

In [None]:
# Word cloud of all keywords.
result = ' '.join(data.key_words.tolist())
wc = WordCloud(font_path=r'msyh.ttf', background_color='white', width=800, height=600,
               max_font_size=100, max_words=100,
               regexp='(?u)\\w+\\.\\w+|\\w\\w+', collocations=False)

wc.generate(result)
# Save at the configured pixel size; this is sharper than the inline rendering.
wc.to_file(r"key_words.png")

# Show the image. Name and size go into ONE figure call: the previous pair of
# plt.figure calls created a sized-but-empty figure and then drew on a second,
# default-sized one.
plt.figure("词云图", figsize=(16, 12))
plt.imshow(wc)       # render the word cloud as an image
plt.axis("off")      # hide the axes
plt.show()

In [None]:
# Word cloud of the segmented abstracts.
result = ' '.join(data.content_cutted.tolist())
wc = WordCloud(font_path=r'msyh.ttf', background_color='white', width=800, height=600,
               max_font_size=100, max_words=100,
               regexp='(?u)\\w+\\.\\w+|\\w\\w+', collocations=False)

wc.generate(result)
# Saved under its own name: this cell used to write "key_words.png" as well,
# silently overwriting the keyword cloud produced by the previous cell.
wc.to_file(r"content_words.png")

# Show the image. Name and size go into ONE figure call: the previous pair of
# plt.figure calls created a sized-but-empty figure and then drew on a second,
# default-sized one.
plt.figure("词云图", figsize=(16, 12))
plt.imshow(wc)       # render the word cloud as an image
plt.axis("off")      # hide the axes
plt.show()

In [None]:
#    kmeans文本聚类，效果一般，数据相关性比较大 改用文本分类可检索
# n_features = 50
# tf_vectorizer = CountVectorizer(strip_accents='unicode',
#                                 max_features=n_features,
#                                 token_pattern='(?u)\\b\\w+\\.\\w+\\b|(?u)\\b\\w\\w+\\b')
# tfidfv = TfidfTransformer()
# tf = tf_vectorizer.fit_transform(data.key_words)
# tfidf = tfidfv.fit_transform(tf)
# word = tf_vectorizer.get_feature_names() #获取词袋模型中的所有词语
# weight = tfidf.toarray()

# inter = {}
# for num in range(3, 100, 3):
#     num_clusters = num 

#     km = KMeans(n_clusters=num_clusters, max_iter=1000, precompute_distances=True)

#     km.fit(weight)

#     clusters = km.labels_.tolist()
#     # 样本距其最近的聚类中心的平方距离之和，用来评判分类的准确度，值越小越好
#     # k-means的超参数n_clusters可以通过该值来评估
#     print("inertia: {}".format(km.inertia_))
#     inter[num] = km.inertia_
# print(inter)
# 聚类越多，值越小   此处还有问题，可尝试采用文本分类  手动标签

In [None]:
# LDA topics from the keyword strings; the same can be done per year or for
# any other grouping of papers.
doclist = data.key_words.values
texts = [doc.split() for doc in doclist]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Train one model per random seed and record the summed topic coherence.
model_list = []
for seed in range(3):
    candidate = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3,
                                       passes=10, iterations=1000, random_state=seed)
    coherence = sum(entry[1] for entry in candidate.top_topics(corpus))
    model_list.append((candidate, coherence))

# Model selection: keep the model with the highest summed coherence.
print(model_list)
lda_gensim, tc = max(model_list, key=lambda pair: pair[1])
print('Topic coherence: %.3e' % tc)

In [None]:
import pyLDAvis.gensim
# Print the top five words per topic, then open the pyLDAvis view of the
# currently selected model.
print(lda_gensim.print_topics(num_topics=10, num_words=5))
vis_data = pyLDAvis.gensim.prepare(lda_gensim, corpus, dictionary)
pyLDAvis.show(vis_data)

In [None]:
# LDA topics from the segmented abstracts; the same can be done per year or
# for any other grouping of papers.
doclist = data.content_cutted.values
texts = [doc.split() for doc in doclist]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Train one model per random seed and record the summed topic coherence.
model_list = []
for seed in range(3):
    candidate = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3,
                                       passes=10, iterations=1000, random_state=seed)
    coherence = sum(entry[1] for entry in candidate.top_topics(corpus))
    model_list.append((candidate, coherence))

# Model selection: keep the model with the highest summed coherence.
print(model_list)
lda_gensim, tc = max(model_list, key=lambda pair: pair[1])
print('Topic coherence: %.3e' % tc)

In [None]:
import pyLDAvis.gensim
# Print the top five words per topic, then open the pyLDAvis view of the
# currently selected model.
print(lda_gensim.print_topics(num_topics=10, num_words=5))
vis_data = pyLDAvis.gensim.prepare(lda_gensim, corpus, dictionary)
pyLDAvis.show(vis_data)

In [None]:
# tf_vectorizer = CountVectorizer(strip_accents='unicode',
#                                 max_features=n_features,
#                                 token_pattern='(?u)\\b\\w+\\.\\w+\\b|(?u)\\b\\w\\w+\\b')
# tf = tf_vectorizer.fit_transform(data.key_words)
# tf_features_names = tf_vectorizer.get_feature_names()
# n_topics = 3
# lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1000,
#                                     random_state=0)
# lda.fit(tf)

In [None]:
# tf_vectorizer = CountVectorizer(strip_accents='unicode',
#                                 max_features=n_features,
#                                 token_pattern='(?u)\\b\\w+\\.\\w+\\b|(?u)\\b\\w\\w+\\b')
# tf = tf_vectorizer.fit_transform(data.content_cutted)
# tf_features_names = tf_vectorizer.get_feature_names()
# n_topics = 3
# lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1000,
#                                     random_state=0)
# lda.fit(tf)

In [None]:
# data.to_csv('1.csv', encoding="utf_8_sig")

In [None]:
# Tokenise the keyword strings and build the gensim dictionary from them.
doc_complete = data.key_words.values
texts = [doc.split() for doc in doc_complete]
dictionary_aut = corpora.Dictionary(texts)
# Convert every tokenised document into its bag-of-words vector with that
# dictionary (the document-term matrix).
doc_term_matrix = [dictionary_aut.doc2bow(tokens) for tokens in texts]

In [None]:
# Map every author to the positions of the documents they appear on.
# Positions come from enumerate(), so they follow the row order of `data`.
aut_name = data.作者.tolist()
author2doc = {}
for doc_id, line in enumerate(aut_name):
    for name in line.split('/'):
        name = name.strip()
        if not name:
            # A leading, trailing or doubled '/' yields an empty name;
            # it is not an author.
            continue
        doc_ids = author2doc.setdefault(name, [])
        # doc_id only grows, so comparing with the last entry is enough to
        # avoid listing a document twice when a name repeats on one paper.
        if not doc_ids or doc_ids[-1] != doc_id:
            doc_ids.append(doc_id)

author2doc

In [None]:
# Invert token -> id into the id -> token mapping that is passed as id2word.
mi = {token_id: token for token, token_id in dictionary_aut.token2id.items()}
# model = AuthorTopicModel(corpus=doc_term_matrix,author2doc=author2doc,num_topics=3 , id2word=mi, passes=10, iterations=2000, random_state=12)
# model.update(doc_term_matrix, author2doc)
# author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
# print(author_vecs)
# model = AuthorTopicModel(corpus=doc_term_matrix, num_topics=3, id2word=mi, \
#                 author2doc=author2doc, chunksize=2000, passes=1, eval_every=0, \
#                 iterations=1, random_state=1)

# Model selection.
# If the model does not seem to have converged, it can be trained further
# incrementally with `model.update(corpus, author2doc)`.
# Train with different random_state seeds and keep the model with the highest
# topic coherence.
model_list = []
for seed in range(1):
    model = AuthorTopicModel(corpus=doc_term_matrix, author2doc=author2doc, id2word=mi,
                             num_topics=3, passes=1, iterations=10,
                             gamma_threshold=1e-10, eval_every=0, random_state=seed)
    coherence = sum(entry[1] for entry in model.top_topics(doc_term_matrix))
    model_list.append((model, coherence))

# Model evaluation: topic coherence.
print(model_list)
model, tc = max(model_list, key=lambda pair: pair[1])
print('Topic coherence: %.3e' % tc)

In [None]:
# from gensim.models import atmodel
# doc2author = atmodel.construct_doc2author(model.corpus, model.author2doc)

# # Compute the per-word bound.
# # Number of words in corpus.
# corpus_words = sum(cnt for document in model.corpus for _, cnt in document)

# # Compute bound and divide by number of words.
# perwordbound = model.bound(model.corpus, author2doc=model.author2doc, \
#                            doc2author=model.doc2author) / corpus_words
# print(perwordbound)

# # 话题一致性指标计算 
# top_topics = model.top_topics(model.corpus)

In [None]:
# Hand-written label for each of the three topics.
topic_labels = ['农药', '环境污染', 'pm2.5']

# Print the label of every topic followed by the words listed for it.
for topic in model.show_topics(num_topics=3):
    topic_id = topic[0]
    print('Label: ' + topic_labels[topic_id])
    words = ''.join(word + ' ' for word, prob in model.show_topic(topic_id))
    print('Words: ' + words)
    print()


from pprint import pprint

def show_author(name):
    """Print an author's name, their document ids and their labelled topic weights.

    Reads the module-level ``model`` and ``topic_labels``.
    """
    print('\n%s' % name)
    print('Docs:', model.author2doc[name])
    print('Topics:')
    labelled = []
    for topic in model[name]:
        labelled.append((topic_labels[topic[0]], topic[1]))
    pprint(labelled)

# Which documents belong to this author, and which topics they are assigned.
show_author(name='朱琳')

In [None]:
# The author's topic entries, first via indexing and then via the explicit
# accessor.
target_author = '朱琳'
print(model[target_author])
model.get_author_topics(target_author)

In [None]:
# # 相似作者推荐
# from gensim.similarities import MatrixSimilarity
# import numpy as np

# # Generate a similarity object for the transformed corpus.
# index = MatrixSimilarity(model[list(model.id2author.values())])

# # Get similarities to some author.
# author_name = '朱琳'
# sims = index[model[author_name]]
# idxs = np.argsort(sims)[-11:-1]
# for idx in idxs.tolist():
#     print(model.id2author[idx], sims[idx], model[model.id2author[idx]] )

In [None]:
# 相似作者
from gensim import matutils
import pandas as pd

# Topic distribution of every author, in the order of model.id2author.values().
author_vecs = []
for author in model.id2author.values():
    author_vecs.append(model.get_author_topics(author))

def similarity(vec1, vec2):
    '''Similarity of two sparse topic vectors, computed as 1 / (1 + Hellinger distance).

    Both vectors are expanded to dense vectors of length model.num_topics
    before the distance is taken.
    '''
    dense1 = matutils.sparse2full(vec1, model.num_topics)
    dense2 = matutils.sparse2full(vec2, model.num_topics)
    return 1.0 / (1.0 + matutils.hellinger(dense1, dense2))

def get_sims(vec):
    '''Return the similarity of `vec` to every entry of author_vecs, in order.'''
    sims = []
    for other in author_vecs:
        sims.append(similarity(vec, other))
    return sims

def get_table(name, top_n=10, smallest_author=1):
    '''
    Build a table of author names, similarities to `name`, and author sizes.

    Authors with fewer than `smallest_author` documents are left out.
    Returns the `top_n` most similar authors as a dataframe.

    '''
    # Similarity of the requested author to every author vector.
    sims = get_sims(model.get_author_topics(name))

    # One (author name, similarity, document count) row per author kept.
    table = []
    for author_id, sim in enumerate(sims):
        author_name = model.id2author[author_id]
        author_size = len(model.author2doc[author_name])
        if author_size < smallest_author:
            continue
        table.append((author_name, sim, author_size))

    # Sort by similarity, highest first, and cut to the requested length.
    ranked = pd.DataFrame(table, columns=['作者', '相似度', '文档数量'])
    ranked = ranked.sort_values('相似度', ascending=False)
    return ranked[:top_n]
# Authors most similar to '朱琳', limited to those with at least 3 documents.
get_table(name='朱琳', smallest_author=3)

In [None]:
from sklearn.manifold import TSNE

# Ids of the authors to plot.
smallest_author = 0  # Ignore authors with documents less than this.
authors = [
    author_id
    for author, author_id in model.author2id.items()
    if len(model.author2doc[author]) >= smallest_author
]

# Reduce the selected rows of model.state.gamma to two dimensions.
tsne = TSNE(n_components=2, random_state=0)
_ = tsne.fit_transform(model.state.gamma[authors, :])  # Result stored in tsne.embedding_

# Tell Bokeh to display plots inside the notebook.
from bokeh.io import output_notebook

output_notebook()

from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

x, y = tsne.embedding_[:, 0], tsne.embedding_[:, 1]
author_names = [model.id2author[author_id] for author_id in authors]

# Radius of each point corresponds to the number of documents attributed to that author.
scale = 0.1
author_sizes = [len(model.author2doc[author]) for author in author_names]
radii = [scale_me * scale for scale_me in author_sizes]

source = ColumnDataSource(
    data={
        'x': x,
        'y': y,
        'author_names': author_names,
        'author_sizes': author_sizes,
        'radii': radii,
    }
)

# Add author names and sizes to mouse-over info.
tooltips = [
    ("author", "@author_names"),
    ("size", "@author_sizes"),
]
hover = HoverTool(tooltips=tooltips)

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [None]:
# Lexical dispersion plot of selected terms over the concatenated abstracts.
mpl.rcParams['font.sans-serif'] = ['SimHei']  # font setting for the labels
ntext = nltk.Text(' '.join(data['content_cutted'].tolist()).split())
# Call the plot directly: wrapping it in print() only printed the call's
# return value underneath the figure. The trailing ';' suppresses the cell's
# own echo of that return value.
ntext.dispersion_plot(['污染', '大气', '颗粒物', 'pm2.5']);