In [1]:
import os
import jieba
import numpy as np
import pandas as pd
import heapq
import time
from functools import reduce
from sklearn.decomposition  import LatentDirichletAllocation 
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def nlargest(array: np.ndarray, n: int) -> list:
    """
    找一个数组中前K大的数，当然是堆排序啦
    """
    return array.argsort()[::-1][:n]

def find_doc_from_verb(feature_names, doc2verb_matrix, titles, i):
    word = feature_names[i]
    print('word: {0} \n'.format(word))
    word_vec = doc2verb_matrix[:, i]
    largest = nlargest(word_vec, 20)
    s = set()
    ten_largest = []
    for n in largest:
        if titles[n] not in s:
            s.add(titles[n])
            ten_largest.append(titles[n])
        if len(ten_largest) == 11:
            break
    for j in ten_largest:
        print(j)

In [3]:
%%time
excel = pd.ExcelFile(r'.\..\..\data\爬虫数据.xlsx')
sheets = excel.sheet_names
docs = []
for name in sheets:
    doc = {'name': name, 'title': [], 'texts': []}
    df = excel.parse(name)
    for _, row in df.iterrows():
        if type(row.desc) is str:
            doc['title'].append(row.title)
            doc['texts'].append(row.desc)
    docs.append(doc)

Wall time: 1.32 s


In [4]:
%%time
# load stopwords
stopwords_dir_path = r'C:\Users\zjxua\GitHub\CAS-NLP\data\stopwords'
stopwords_filelist = [os.path.join(stopwords_dir_path, p) for p in os.listdir(stopwords_dir_path)]
stopwords = list(reduce(lambda x, y: x + y, [open(p).read().split('\n') for p in stopwords_filelist]))

Wall time: 1.5 ms


In [5]:
tf_vectorizer = CountVectorizer(tokenizer=lambda text: jieba.lcut(text), max_df=0.9, min_df=20, stop_words=stopwords)

all_texts  = list(reduce(lambda x, y: x + y, [doc['texts'] for doc in docs]))
all_titles = list(reduce(lambda x, y: x + y, [doc['title'] for doc in docs]))

tf_matrix = tf_vectorizer.fit_transform(all_texts)
print(tf_matrix.shape)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zjxua\AppData\Local\Temp\jieba.cache
Loading model cost 0.766 seconds.
Prefix dict has been built succesfully.


(7352, 10688)


In [6]:
%%time
lda = LatentDirichletAllocation(n_topics=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

docres = lda.fit_transform(tf_matrix)

Wall time: 30 s


In [7]:
doc2verb = docres.dot(lda.components_)
print('全部标题:\t {0} '.format(len(all_titles)))
print('话题 * 词:\t {0} '.format(lda.components_.shape))
print('文档 * 话题:\t {0} '.format(docres.shape))
print('文档 * 词:\t {0} '.format(doc2verb.shape))

全部标题:	 7352 
话题 * 词:	 (10, 10688) 
文档 * 话题:	 (7352, 10) 
文档 * 词:	 (7352, 10688) 


In [8]:
%%time
find_doc_from_verb(tf_vectorizer.get_feature_names(), doc2verb, all_titles, 1111)

word: 中西部 

中央城市工作会议：2020年完成棚户区改造
３年内农业信贷担保体系框架将覆盖全国
京津冀交通规划近期印发深入推进区域运输一体化
助力区域经济国家级经开区谋转型升级
城市发展着力提高持续性宜居性
北京林业大学与鲁能集团展开全方位深度合作
鲁能公布新版发展战略聘贝克汉姆担任形象大使
依托园区转化创新成果涞水产业新城对接京津
科技部将再建山东半岛等一批国家自创区
非首都功能批发市场疏解升级见成效
机器人产业五年路线图出炉 支持企业直接融资并购
Wall time: 16 ms


In [None]:
while True:
    now = time.time()
    i = int(input('输入一个数字'))
    find_doc_from_verb(tf_vectorizer.get_feature_names(), doc2verb, all_titles, i)
    print()
    print('耗时 ', time.time() - now)