reference: https://github.com/LeanManager/NLP_Technical_Founders/blob/master/Unsupervised%20Learning%20in%20NLP.ipynb

## 1. Read in Data

In [1]:
import pandas as pd

data = pd.read_parquet('../data/train.parquet', engine='pyarrow')

In [2]:
data.sample(20)

Unnamed: 0,NEWS_TITLE,NEWS_ORIGIN_SOURCE,NEWS_ID
45171721,二维码支付从叫停到认可 支付市场格局将如何重构？,,24257553
34610996,乐视搭上联合国开发计划署 共同推进全球可持续发展,,11685134
35745983,散户如何正确开展“抗灾自救”？,,13119429
34508167,摊上尿毒症花钱一辈子,齐鲁晚报,11546820
47214012,Win10更新14977又现BUG：微软表示那只能等下次更新,,27673826
41665505,永清环保4月7日涨停,,19953960
40792793,在皖全国人大代表抵京 积极传递“安徽好声音”,,18839469
34316140,沪指翻红涨0.1%收复4400点,,11309296
33117104,“金融商城”解燃眉之急,,9792408
47128044,"A股市场风格轮动周报:金融板块正强势,短期内可继续关注",,27523491


In [3]:
data = data.sample(100000)

## 2. Preprocessing data
1. 去标点符号
2. 分词
3. 移除stopwords

In [4]:
import re
punc = ''' "()\[\]\{\}！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.'''

def puncRemover(s):
    return re.sub(r'[%s]'%punc, "", s)

line = "测试。。去 除标点。。，、！"
# print(re.sub(r'[%s]'%punc, "", line))
print(puncRemover(line))

测试去除标点


In [5]:
news_title = data.NEWS_TITLE
news_title = news_title.apply(puncRemover)

In [6]:
news_title

45529827                        中兴赵先明新主张
36754943                    哪些是工业40的细分领域
36385443                           三步踩的妻
41062438    美国拍卖3个月短期国库券收益率为0335%认购比率347
42344517         4月26日新三板募资12亿9家企业发布定增预案
                        ...             
40792630             前瞻两会积极财政政策再发力减税费成焦点
33812364             三股价值被市场严重低估未来股价翻番在即
38853522                   官员感谢八项规定是可喜信号
37609623              今是昨非10月13日商品期货操作建议
37242086              捷豹首款SUV价格首曝光售276万起
Name: NEWS_TITLE, Length: 100000, dtype: object

In [7]:
import jieba

In [8]:
news_title = news_title.apply(lambda x: list(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/x7/bnm7sy4j0bj1fs53ln11sg_80000gn/T/jieba.cache
Loading model cost 0.668 seconds.
Prefix dict has been built successfully.


In [9]:
news_title

45529827                                     [中兴, 赵先明, 新, 主张]
36754943                           [哪些, 是, 工业, 40, 的, 细分, 领域]
36385443                                        [三步, 踩, 的, 妻]
41062438    [美国, 拍卖, 3, 个, 月, 短期, 国库券, 收益率, 为, 0335%, 认购, ...
42344517    [4, 月, 26, 日新, 三板, 募资, 12, 亿, 9, 家, 企业, 发布, 定增...
                                  ...                        
40792630                [前瞻, 两会, 积极, 财政政策, 再, 发力, 减税, 费成, 焦点]
33812364            [三股, 价值, 被, 市场, 严重, 低估, 未来, 股价, 翻番, 在, 即]
38853522                          [官员, 感谢, 八项, 规定, 是, 可喜, 信号]
37609623                 [今是昨非, 10, 月, 13, 日, 商品, 期货, 操作, 建议]
37242086                 [捷豹, 首款, SUV, 价格, 首, 曝光, 售, 276, 万起]
Name: NEWS_TITLE, Length: 100000, dtype: object

In [10]:
list(jieba.cut(data.NEWS_TITLE.iloc[0]))

['中兴', '赵先明', '新', '主张']

In [11]:
cn_stopwords = pd.read_csv('../data/cn_stopwords.txt', header=None)

cn_stopwords = set(cn_stopwords.iloc[:,0])

In [12]:
"在" in cn_stopwords

True

In [13]:
news_title = news_title.apply(lambda x: [i for i in x if i not in cn_stopwords])

In [14]:
news_title

45529827                                 [中兴, 赵先明, 新, 主张]
36754943                                 [工业, 40, 细分, 领域]
36385443                                       [三步, 踩, 妻]
41062438    [美国, 拍卖, 月, 短期, 国库券, 收益率, 0335%, 认购, 比率, 347]
42344517    [月, 26, 日新, 三板, 募资, 12, 亿, 家, 企业, 发布, 定增, 预案]
                                ...                      
40792630               [前瞻, 两会, 积极, 财政政策, 发力, 减税, 费成, 焦点]
33812364                 [三股, 价值, 市场, 严重, 低估, 未来, 股价, 翻番]
38853522                         [官员, 感谢, 八项, 规定, 可喜, 信号]
37609623             [今是昨非, 10, 月, 13, 日, 商品, 期货, 操作, 建议]
37242086             [捷豹, 首款, SUV, 价格, 首, 曝光, 售, 276, 万起]
Name: NEWS_TITLE, Length: 100000, dtype: object

## 3.1 Bag of words on the dataset

In [15]:
import gensim

In [16]:
dictionary = gensim.corpora.Dictionary(news_title)

In [17]:
count = 0

for k,v in dictionary.iteritems():
    print(k,v)
    count += 1
    if count > 10:
        break

0 中兴
1 主张
2 新
3 赵先明
4 40
5 工业
6 细分
7 领域
8 三步
9 妻
10 踩


In [18]:
# remove very common and very rare words
dictionary.filter_extremes(no_below=3, no_above=0.7, keep_n = 100000)

In [19]:
dictionary.num_docs

100000

In [20]:
# doc2bow(doc)
# Convert document (a list of words) into the bag-of-words format = list of (token_id, token_count) 2-tuples. 
# Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). 
# No further preprocessing is done on the words in document; apply tokenization, stemming etc. before calling this method.

In [21]:
bow_corpus = [dictionary.doc2bow(doc) for doc in news_title]

In [22]:
bow_corpus[1]

[(4, 1), (5, 1), (6, 1), (7, 1)]

In [23]:
news_title.iloc[1]

['工业', '40', '细分', '领域']

In [24]:
example_num = 244
example_doc = bow_corpus[example_num]
print("original news:", news_title.iloc[example_num])
for i in range(len(example_doc)):
    print(f"word ({example_doc[i][0]}, {dictionary[example_doc[i][0]]}) appears {example_doc[i][1]} times")

original news: ['2016', '年', '月份', '居民消费', '价格', '同比', '上涨', '23%']
word (171, 年) appears 1 times
word (295, 价格) appears 1 times
word (315, 2016) appears 1 times
word (337, 月份) appears 1 times
word (776, 上涨) appears 1 times
word (777, 同比) appears 1 times
word (1357, 23%) appears 1 times
word (1358, 居民消费) appears 1 times


## 3.2 TF-IDF on document set

In [25]:
from gensim import corpora, models 

tfidf = models.TfidfModel(bow_corpus)
print(tfidf)

TfidfModel<num_docs=100000, num_nnz=763903>


In [26]:
corpus_tfidf = tfidf[bow_corpus]
print(corpus_tfidf[1])

[(4, 0.4545313910971169), (5, 0.4443384361593428), (6, 0.6334341764277164), (7, 0.44127736492157643)]


## 4.1 Running LDA using bag of words

In [27]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                      num_topics=10,
                                      id2word=dictionary,
                                      passes=2,
                                      workers=8)

In [28]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.043*"月" + 0.033*"日" + 0.012*"价格" + 0.012*"年" + 0.009*"10" + 0.008*"最新" + 0.007*"12" + 0.007*"2016" + 0.007*"动态" + 0.006*"投资"


Topic: 1 
Words: 0.009*"股东" + 0.008*"公告" + 0.008*"基金" + 0.008*"股份" + 0.007*"机构" + 0.006*"股" + 0.006*"中国" + 0.006*"有限公司" + 0.005*"投资" + 0.005*":"


Topic: 2 
Words: 0.015*"新" + 0.015*"月" + 0.011*"日" + 0.008*"三板" + 0.007*"价格" + 0.007*"生意" + 0.006*"社" + 0.005*"公司" + 0.005*"市场" + 0.004*"动态"


Topic: 3 
Words: 0.014*":" + 0.007*"市场" + 0.006*"美联储" + 0.006*"中国" + 0.006*"加息" + 0.006*"元" + 0.006*"," + 0.005*"收盘" + 0.005*"美元" + 0.005*"震荡"


Topic: 4 
Words: 0.016*"月" + 0.010*"市场" + 0.008*"元" + 0.008*"日" + 0.007*"资金" + 0.006*"快讯" + 0.005*"亿元" + 0.005*"涨停" + 0.005*"人民币" + 0.005*"央行"


Topic: 5 
Words: 0.015*"中国" + 0.007*"股" + 0.007*"发展" + 0.005*":" + 0.005*"公司" + 0.005*"年" + 0.005*"银行" + 0.004*"股份" + 0.004*"亿" + 0.004*"创新"


Topic: 6 
Words: 0.017*":" + 0.011*"," + 0.010*"行业" + 0.006*"投资" + 0.005*"沪" + 0.005*"月" + 0.005*"业绩" + 0.004*"增长" + 0.004*"年" + 0.

## 4.2 Running LDA using TF-IDF

In [29]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=10, 
                                             id2word = dictionary, 
                                             passes = 2, 
                                             workers=8)

In [30]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.004*"月" + 0.003*"日" + 0.003*"中国" + 0.003*":" + 0.002*"市场" + 0.002*"股" + 0.002*"新" + 0.002*"板块" + 0.002*"动态" + 0.002*"公司"


Topic: 1 Word: 0.005*"月" + 0.004*"日" + 0.004*"中国" + 0.004*"市场" + 0.003*"经济" + 0.003*"价格" + 0.003*":" + 0.002*"黄金" + 0.002*"上海" + 0.002*"美国"


Topic: 2 Word: 0.004*"月" + 0.004*"中国" + 0.003*"日" + 0.003*"公告" + 0.003*":" + 0.003*"市场" + 0.002*"有限公司" + 0.002*"股份" + 0.002*"新" + 0.002*"价格"


Topic: 3 Word: 0.004*"-" + 0.003*"月" + 0.003*"新" + 0.003*"中国" + 0.003*":" + 0.003*"年" + 0.003*"日" + 0.002*"10" + 0.002*"股东" + 0.002*"股份"


Topic: 4 Word: 0.004*"沪" + 0.004*"月" + 0.004*":" + 0.003*"指" + 0.003*"中国" + 0.002*"年" + 0.002*"市场" + 0.002*"," + 0.002*"期货" + 0.002*"日"


Topic: 5 Word: 0.007*"年" + 0.006*"月" + 0.004*"2015" + 0.004*"日" + 0.004*"2016" + 0.004*"万元" + 0.004*":" + 0.004*"提示" + 0.003*"-" + 0.003*"最新"


Topic: 6 Word: 0.004*"中国" + 0.002*":" + 0.002*"新" + 0.002*"机构" + 0.002*"公司" + 0.002*"年" + 0.002*"月" + 0.002*"投资" + 0.002*"市场" + 0.002*"基金"


Topic: 7 Word