In [1]:
import pandas as pd
import numpy as np
import sys,codecs
import jieba.posseg
import jieba.analyse
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
"""
       TF-IDF权重：
           1、CountVectorizer 构建词频矩阵
           2、TfidfTransformer 构建tfidf权值计算
           3、文本的关键字
           4、对应的tfidf矩阵
"""
# 数据预处理操作：分词，去停用词，词性筛选
def dataPrepos(text, stopkey):
    l = []
    pos = ['n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd']  # 定义选取的词性
    seg = jieba.posseg.cut(text)  # 分词
    for i in seg:
        if i.word not in stopkey and i.flag in pos:  # 去停用词 + 词性筛选
            l.append(i.word)
    return l

# tf-idf获取文本top10关键词
def getKeywords_tfidf(data,stopkey,topK):
    idList, titleList, abstractList = data['id'], data['title'], data['abstract']
    corpus = [] # 将所有文档输出到一个list中，一行就是一个文档
    for index in range(len(idList)):
        text = '%s。%s' % (titleList[index], abstractList[index]) # 拼接标题和摘要
        text = dataPrepos(text,stopkey) # 文本预处理
        text = " ".join(text) # 连接成字符串，空格分隔
        corpus.append(text)

    # 1、构建词频矩阵，将文本中的词语转换成词频矩阵
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus) # 词频矩阵,a[i][j]:表示j词在第i个文本中的词频
    # 2、统计每个词的tf-idf权值
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    # 3、获取词袋模型中的关键词
    word = vectorizer.get_feature_names()
    # 4、获取tf-idf矩阵，a[i][j]表示j词在i篇文本中的tf-idf权重
    weight = tfidf.toarray()
    # 5、打印词语权重
    ids, titles, keys = [], [], []
    for i in range(len(weight)):
#        print(u"-------这里输出第", i+1 , u"篇文本的词语tf-idf------")
        ids.append(idList[i])
        titles.append(titleList[i])
        df_word,df_weight = [],[] # 当前文章的所有词汇列表、词汇对应权重列表
        for j in range(len(word)):
#            print(word[j],weight[i][j])
            df_word.append(word[j])
            df_weight.append(weight[i][j])
        df_word = pd.DataFrame(df_word,columns=['word'])
        df_weight = pd.DataFrame(df_weight,columns=['weight'])
        word_weight = pd.concat([df_word, df_weight], axis=1) # 拼接词汇列表和权重列表
        word_weight = word_weight.sort_values(by="weight",ascending = False) # 按照权重值降序排列
        keyword = np.array(word_weight['word']) # 选择词汇列并转成数组格式
        word_split = [keyword[x] for x in range(0,topK)] # 抽取前topK个词汇作为关键词
        word_split = " ".join(word_split)
        keys.append(word_split)
    result = pd.DataFrame({"id": ids, "title": titles, "key": keys},columns=['id','title','key'])
    return result

In [67]:
data = pd.read_csv(f'posts/{comp[7]}.csv')
result = getKeywords_tfidf(data,stopwords,20)
result.to_csv(f"result/{comp[7]}.csv",index=False,encoding='utf_8_sig')

In [2]:
def read_file(path, store_list):
    with open(f'{path}.txt',errors='ignore',encoding = "utf-8") as f:
        for line in f:
            words = line.strip()
            if words not in store_list:
                store_list.append(words)

### load stopwords 中文停用词列表

In [3]:
stopwords = []

path_baidu = 'stopwords/baidu_stopwords'
path_siculab = 'stopwords/scu_stopwords'
path_noram = 'stopwords/cn_stopwords'

read_file(path_baidu,stopwords)
read_file(path_siculab,stopwords)
read_file(path_noram,stopwords)

In [4]:
stopwords.append('知乎')
stopwords.append('一个')
stopwords.append('-')

In [5]:
len(stopwords)

2079

In [None]:
# def main():
#     # 读取数据集
#     dataFile = 'data/sample_data.csv'
#     data = pd.read_csv(dataFile)
#     # 停用词表
#     stopkey = [w.strip() for w in codecs.open('data/stopWord.txt', 'r').readlines()]
#     # tf-idf关键词抽取
#     result = getKeywords_tfidf(data,stopkey,10)
#     result.to_csv("result/keys_TFIDF.csv",index=False)

In [11]:
ls = [172,98,16,28,136,10,20,20,163,17,71,5,49,53,38,311,139,45,21,109,769,156,181,18,8]
comp = ['teng-xun-70','teng-xun-ke-ji','tian-mei-gong-zuo-shi-21','weda-hui','teng-xun-yun-4','teng-yun-zhi-ku','teng-xun-yi-dian','teng-xun-dong-man','teng-xun-li-cai-tong-48','wei-xin-93-75','teng-xun-yan-jiu-yuan-28','teng-xun-fang-shui-qiang','wei-xin-zhi-fu-30','teng-xun-da-xue','teng-xun-you-xi-an-quan','teng-xun-ji-zhu-gong-cheng','teng-xun-wetest-74','teng-xun-da-shu-ju','teng-xun-qq-60','teng-xun-bugly','teng-xun-yun-ji-zhu-she-qu','teng-xun-shou-hu-zhe-ji-hua','teng-xun-an-quan-lian-he-shi-yan-shi','teng-xun-fan-yi-jun','qq-yin-yue']

In [15]:
for i in comp:
    data = pd.read_csv(f'tenx_posts/{i}.csv')
    result = getKeywords_tfidf(data,stopwords,10)
    result.to_csv(f"result/{i}.csv",index=False,encoding='utf_8_sig')
    

In [14]:
# # 把 文章储存在csv 文件里
# for i in range(len(comp)):
#     id_ = 0
#     temp = {'id':[],'title':[],'abstract':[]}
#     path = f'tenx_posts/{comp[i]}'
#     for n in range(ls[i]):
#         try:
#             with open(f'{path}/{n}.txt',encoding = "gb18030") as f:
#                 con = 0
#                 for line in f:
#                     if con == 0:
#                         if line.strip() != '':
#                             temp['title'].append(line.strip())
#                         else:
#                             temp['title'].append('')
#                     elif con == 2:
#                         temp['abstract'].append(line.strip())
#                     con+=1
#                 if con != 3:
#                     temp['abstract'].append('')
#                 temp['id'].append(id_)
#                 id_ += 1

#         except:
#             pass

#     temp = pd.DataFrame(temp)
#     temp.to_csv(f"tenx_posts/{comp[i]}.csv",index=False,encoding='utf_8_sig')
# # result = getKeywords_tfidf(temp,stopwords,10)
# # result

### 去除停用词

In [8]:
import jieba

  # 对句子进行中文分词
def seg_depart(sentence): # stopwords
    sentence_depart = jieba.cut(sentence)

    # 输出结果为outstr
    outstr = ''
    # 去停用词
    for word in sentence_depart:
        if word not in stopwords:
            outstr += word
            outstr += " "
    return outstr



### load articles

In [9]:
def topic_visual(file_name,n_features,n_topics,n_top_words): # 文件名，关键词个数，主题个数, 打印主题个数
    # load file & separate words
    temp = []
    path = f'posts/{file_name}'
    with open(f'{path}.txt',encoding = "gb18030") as f: # ①把编码方式utf-8 修改为gb18030, ②把原来的txt文件重新打开另存为的时候，把编码方式修改为utf-8，然后代码的encoding=‘utf-8’保持不变即可
        for line in f:
            words = line.strip()
            temp.append(seg_depart(words))
    
    # 文本向量转换
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                    max_features=n_features,
                                    stop_words=stopwords,
                                    max_df = 0.5,
                                    min_df = 1)
    tf = tf_vectorizer.fit_transform(temp)
    # fit LDA
    from sklearn.decomposition import LatentDirichletAllocation
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
    return lda, tf, tf_vectorizer

In [10]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [9]:
comp = ['wan-mei-shi-jie-48-2','zhi-fu-bao-72-4','teng-xun-ke-ji','jing-dong-46-34','teng-xun-70','a-li-ba-ba-23-79','da-zhong-dian-ping-83']

In [22]:
#! pip install pyldavis
import pyLDAvis
import pyLDAvis.sklearn

for i in comp:
    print(i,':')
    lda, tf, tf_vectorizer = topic_visual(i,50,10,10)
    #pyLDAvis.enable_notebook()
    #pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    print('\n')

wan-mei-shi-jie-48-2 :
Topic #0:
动画 2019 作品 2018 制作 市场 企业 合作 行业 业务
Topic #1:
场景 制作 工作 专业 成功 玩家 团队 文化 技术 洪恩
Topic #2:
冰糖 雪梨 观众 专业 该剧 青春 制作 团队 电视剧 成功
Topic #3:
行业 发展 业务 文创 企业 文化 未来 领域 技术 内容
Topic #4:
冰糖 黎语 雪梨 青春 梦想 爱情 少年 甜蜜 观众 电视剧
Topic #5:
团队 市场 研发 产品 工作 技术 玩家 手游 洪恩 行业
Topic #6:
作品 雪梨 该剧 观众 剧集 冰糖 15 专业 青春 带来
Topic #7:
青春 观众 全世界 优酷 剧集 热血 该剧 电视剧 爱情 文化
Topic #8:
王雨蕴 合作 市场 产品 行业 团队 企业 梦想 工作 未来
Topic #9:
带来 玩家 手游 成功 内容 雪梨 冰糖 黎语 观众 团队


zhi-fu-bao-72-4 :
Topic #0:
蚂蚁 区块 金服 数据 团队 系统 业务 场景 工作 公司
Topic #1:
蚂蚁 阿里 森林 工程师 杭州 数据 中国 公司 互联网 科技
Topic #2:
账户 手机 信息 系统 支付 平台 希望 10 团队 提供
Topic #3:
支付 中国 手机 11 场景 全球 体验 服务 希望 上线
Topic #4:
保障 风险 相互 蚂蚁 金服 平台 能力 金融 系统 服务
Topic #5:
服务 程序 提供 平台 上线 超过 业务 生活 蚂蚁 体验
Topic #6:
公司 支持 项目 希望 时间 未来 10 工作 生活 提供
Topic #7:
城市 电子 杭州 全国 服务 未来 支持 时间 上线 超过
Topic #8:
信用 商家 体验 产品 11 解决 10 风险 未来 能力
Topic #9:
中国 全球 互联网 发展 科技 平台 生活 服务 工作 信息


teng-xun-ke-ji :
Topic #0:
5g 网络 亿元 全球 2017 人工智能 2014 资本 手机 华为
Topic #1:
手机 oppo facebook 华为 苹果 渠道 小米 品牌 国内 销售
Topic #2:
滴滴 ofo 合并 城市 融资 竞争 资本 

In [21]:
name = 'bai-du-81-39'
print(name,':')
lda, tf, tf_vectorizer = topic_visual(name,100,5,5)


bai-du-81-39 :
Topic #0:
拒绝 蝙蝠 保护 公共卫生 野味
Topic #1:
复工 指数 地图 城市 全国
Topic #2:
网络 黑产 报告 复工 企业
Topic #3:
黑产 规模 网络 犯罪 治理
Topic #4:
热度 产业 知识 教育 公共卫生


In [173]:
# ls = [76,296,98,568,172,45,79,7]
# comp = ['wan-mei-shi-jie-48-2','zhi-fu-bao-72-4','teng-xun-ke-ji','jing-dong-46-34','teng-xun-70','a-li-ba-ba-23-79','da-zhong-dian-ping-83','bai-du-81-39']

# def merge(name,l):
#     temp = []
#     path = f'posts/{name}'
#     title = []
#     for n in range(l):
#         try:
#             with open(f'{path}/{n}.txt',encoding = "gb18030") as f:
#                 con = 0
#                 post = ''
#                 for line in f:
#                     if con == 0:
#                         post += line.strip()
#                     elif con == 2:
#                         post += line.strip()
#                     con+=1
#                 temp.append(post)
#         except:
#             pass
#     for n in range(l):
#         with open(f'{path}.txt','w',encoding = "gb18030") as f:
#             for i in temp:
#                 f.write(i+'\n')


# for i in range(len(ls)):
#     merge(comp[i],ls[i])

In [2]:
comp = ['wan-mei-shi-jie-48-2','zhi-fu-bao-72-4','teng-xun-ke-ji','jing-dong-46-34','teng-xun-70','a-li-ba-ba-23-79','da-zhong-dian-ping-83']
names = ['完美世界','支付宝','腾讯科技','京东','腾讯','阿里巴巴','大众点评']

In [16]:
from jieba.analyse import *
for i in range(len(comp)):
    file = pd.read_csv(f'result/{comp[i]}.csv')
    data = ''
    for j in file['key']:
        data += j
    data = seg_depart(data)
    df1 = {'keyword':[],'weight':[]}
    for keyword, weight in extract_tags(data, topK=10, withWeight=True):
        df1['keyword'].append(keyword)
        df1['weight'].append(weight)
    df2 = {'keyword':[],'weight':[]}
    for keyword, weight in textrank(data, withWeight=True):
        df2['keyword'].append(keyword)
        df2['weight'].append(weight)
    df1 = pd.DataFrame(df1)
    df2 = pd.DataFrame(df2)
    df1.to_csv(f"result/{comp[i]}_tag.csv",index=False,encoding='utf_8_sig')
    df2.to_csv(f"result/{comp[i]}_textrank.csv",index=False,encoding='utf_8_sig')

In [17]:

for i in range(len(comp)):
    file = pd.read_csv(f'result/{comp[i]}.csv')
    data = ''
    for j in file['title']:
        data += j
    data = seg_depart(data)
    df1 = {'keyword':[],'weight':[]}
    for keyword, weight in extract_tags(data, topK=10, withWeight=True):
        df1['keyword'].append(keyword)
        df1['weight'].append(weight)
    df2 = {'keyword':[],'weight':[]}
    for keyword, weight in textrank(data, withWeight=True):
        df2['keyword'].append(keyword)
        df2['weight'].append(weight)
    df1 = pd.DataFrame(df1)
    df2 = pd.DataFrame(df2)
    df1.to_csv(f"result/{comp[i]}_title_tag.csv",index=False,encoding='utf_8_sig')
    df2.to_csv(f"result/{comp[i]}_title_textrank.csv",index=False,encoding='utf_8_sig')

### 查看各个公司文章的关键字|
['腾讯科技', '腾讯', '微信支付', '腾讯技术工程', '腾讯安全联合实验室', '腾讯医典', '腾讯云技术社区']

'[    0          ,      1     ,      2           ,        3               ,                4             ,            5          ,            6]'

['teng-xun-ke-ji', 'teng-xun-70', 'wei-xin-zhi-fu-30', 'teng-xun-ji-zhu-gong-cheng', 'teng-xun-an-quan-lian-he-shi-yan-shi', 'teng-xun-yi-dian', 'teng-xun-yun-ji-zhu-she-qu']

In [3]:
tokens = ['teng-xun-ke-ji', 'teng-xun-70', 'wei-xin-zhi-fu-30', 'teng-xun-ji-zhu-gong-cheng', 'teng-xun-an-quan-lian-he-shi-yan-shi', 'teng-xun-yi-dian', 'teng-xun-yun-ji-zhu-she-qu']

In [42]:
x = tokens[-3]
x

'teng-xun-an-quan-lian-he-shi-yan-shi'

In [43]:
data = pd.read_csv(f"result/{x}_tag.csv")
data

Unnamed: 0,keyword,weight
0,腾讯,0.206628
1,木马,0.186027
2,挖矿,0.183858
3,攻击,0.152561
4,病毒,0.143831
5,漏洞,0.139813
6,电脑,0.094576
7,数据安全,0.09392
8,企业,0.081863
9,网络安全,0.070754


In [44]:
data = pd.read_csv(f"result/{x}_textrank.csv")
data

Unnamed: 0,keyword,weight
0,攻击,1.0
1,企业,0.82311
2,病毒,0.753241
3,漏洞,0.746579
4,木马,0.739186
5,挖矿,0.667
6,电脑,0.547735
7,数据,0.516221
8,业务,0.394532
9,实验室,0.392453


In [45]:
data = pd.read_csv(f"result/{x}_title_tag.csv")
data

Unnamed: 0,keyword,weight
0,腾讯,0.419259
1,2018,0.175314
2,挖矿,0.154499
3,木马,0.140689
4,病毒,0.124317
5,产业,0.103035
6,报告,0.094605
7,电脑,0.093295
8,勒索,0.087588
9,专家,0.082151


In [46]:
data = pd.read_csv(f"result/{x}_title_textrank.csv")
data

Unnamed: 0,keyword,weight
0,企业,1.0
1,木马,0.995104
2,病毒,0.93121
3,报告,0.887339
4,挖矿,0.844452
5,全球,0.70423
6,攻击,0.659433
7,电脑,0.633901
8,产业,0.585073
9,漏洞,0.5441
