reference: https://github.com/LeanManager/NLP_Technical_Founders/blob/master/Unsupervised%20Learning%20in%20NLP.ipynb

## 1. Read in Data

In [2]:
import pandas as pd

# Read the training split into a DataFrame using the pyarrow parquet engine.
train_path = '../data/train.parquet'
data = pd.read_parquet(train_path, engine='pyarrow')

In [3]:
data.sample(20)

Unnamed: 0,NEWS_TITLE,NEWS_ORIGIN_SOURCE,NEWS_ID
32782666,连平：上半年还将降准1-2次 短期降息可能性不大,金融界网站,9387658
38220229,传阿里巴巴拟8000万美元投资美国电商Boxed,,15887876
42272942,天恒联手春雨医生首发5G健康体系 倡导互联网+健康社区,,20650101
40231129,苹果iPhone 6S Plus(全网通)售5550元,,18197459
42285498,"强强联合,助力中国电气产业在＂一带一路＂中走出去",,20664890
36276140,环保部出动无人机开展巡查 打击违法排污企业,,13716146
44628753,我国首个无人船研发测试基地落户珠海,,23399593
47040124,奥特佳新能源科技股份有限公司关于控股股东股份解除质押的公告,,27364706
40266387,美防长公布新财年国防预算申请 打击IS经费涨50%,,18238635
43569960,今年石油年均价不会高于每桶50美元,中国经济网—《经济日报》,22015713


In [4]:
# Work on a random subset of at most 100000 rows.
# - random_state pins the draw, so the same rows (and therefore the same
#   downstream tokens, dictionary ids and topics) come back after a kernel restart.
# - min() guards against a frame with fewer than 100000 rows, for which
#   DataFrame.sample (without replacement) raises ValueError.
data = data.sample(n=min(100000, len(data)), random_state=42)

## 2. Preprocessing data
1. 去标点符号
2. 分词
3. 移除stopwords

In [5]:
import re

# Characters stripped from titles before tokenisation: the ASCII space, double
# quote, brackets and full stop, plus full-width / CJK punctuation.
# This is a raw string on purpose: the backslashes in front of [ ] { } must reach
# the regex engine untouched. In a normal string literal "\[" is an invalid escape
# sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# NOTE(review): ASCII ':' ',' '/' and '-' are NOT in this set, so they survive
# cleaning and can show up later as tokens.
punc = r''' "()\[\]\{\}！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.'''

# Compile the character class once instead of re-formatting the pattern on every call.
_PUNC_RE = re.compile(r'[%s]' % punc)


def puncRemover(s):
    """Return ``s`` with every character listed in ``punc`` removed.

    Parameters
    ----------
    s : str
        A news title. Anything that is not a ``str`` (for example ``None`` or a
        float NaN from a missing title) is treated as an empty title.

    Returns
    -------
    str
        The title without the listed punctuation and without ASCII spaces;
        ``""`` for non-string input.
    """
    if not isinstance(s, str):
        # Missing values would otherwise make the regex call raise TypeError
        # in the middle of a Series.apply over the whole column.
        return ""
    return _PUNC_RE.sub("", s)


# Quick visual check of the cleaner on a small sample sentence.
line = "测试。。去 除标点。。，、！"
print(puncRemover(line))

测试去除标点


In [6]:
# Run the punctuation cleaner over every headline; the result is a Series of
# cleaned title strings that keeps the index of `data`.
news_title = data["NEWS_TITLE"].apply(puncRemover)

In [7]:
news_title

42909977                   南海区人民医院举办舞会迎护士节
39257559                     万科独董海闻钜盛华增持H股
46867837               柳工盈利能力有所提高国际化转型不断提速
40344242         天津市委常委等5名省部级官员被建议给予党纪政纪处分
41433122    3月28日ishares黄金持仓持平/白银持仓增加2664吨
                         ...              
43694664                    周二机构强推买入6股极度低估
44952856                金字火腿终止重大资产重组27日起复牌
32795400                         四公司首发申请获准
38880886                       日本式教育何以输出海外
33440554                  小伙6年喊睡不着原来是矛盾性失眠
Name: NEWS_TITLE, Length: 100000, dtype: object

In [8]:
import jieba

In [9]:
# Segment each cleaned title into words; jieba.lcut returns the segments of
# jieba.cut as a list, so every Series element becomes a list of tokens.
news_title = news_title.apply(jieba.lcut)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/x7/bnm7sy4j0bj1fs53ln11sg_80000gn/T/jieba.cache
Loading model cost 0.519 seconds.
Prefix dict has been built successfully.


In [10]:
news_title

42909977                        [南海区, 人民, 医院, 举办, 舞会, 迎, 护士节]
39257559                          [万科, 独董, 海闻, 钜, 盛华, 增持, H股]
46867837                  [柳工, 盈利, 能力, 有所提高, 国际化, 转型, 不断, 提速]
40344242    [天津, 市委常委, 等, 5, 名, 省部级, 官员, 被, 建议, 给予, 党纪政纪, 处分]
41433122    [3, 月, 28, 日, ishares, 黄金, 持仓, 持平, /, 白银, 持仓, ...
                                  ...                        
43694664                       [周二, 机构, 强推, 买入, 6, 股, 极度, 低估]
44952856                   [金字, 火腿, 终止, 重大, 资产重组, 27, 日起, 复牌]
32795400                                  [四, 公司, 首发, 申请, 获准]
38880886                                [日本式, 教育, 何以, 输出, 海外]
33440554                   [小伙, 6, 年, 喊, 睡不着, 原来, 是, 矛盾性, 失眠]
Name: NEWS_TITLE, Length: 100000, dtype: object

In [11]:
list(jieba.cut(data.NEWS_TITLE.iloc[0]))

['南海区', '人民', '医院', '举办', '舞会', '迎', '护士节']

In [12]:
# Load the stop-word list as plain text (assumed to hold one entry per line, which
# is how the previous code consumed it: first column, no header).
# Going through pd.read_csv would apply CSV rules to the entries themselves: an
# entry containing ',' is split or rejected, a '"' entry opens a quoted field that
# swallows the following lines, and entries such as "NA" or "null" become NaN.
# 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM if present.
with open('../data/cn_stopwords.txt', encoding='utf-8-sig') as stopword_file:
    # Strip only the line terminator so entries keep any inner/leading whitespace;
    # blank lines are skipped.
    cn_stopwords = {
        entry
        for entry in (raw_line.rstrip('\r\n') for raw_line in stopword_file)
        if entry
    }

In [13]:
"在" in cn_stopwords

True

In [14]:
def _drop_stopwords(tokens):
    """Return the tokens of one title that are not in ``cn_stopwords``, order kept."""
    return [token for token in tokens if token not in cn_stopwords]


news_title = news_title.apply(_drop_stopwords)

In [15]:
news_title

42909977                        [南海区, 人民, 医院, 举办, 舞会, 迎, 护士节]
39257559                          [万科, 独董, 海闻, 钜, 盛华, 增持, H股]
46867837                  [柳工, 盈利, 能力, 有所提高, 国际化, 转型, 不断, 提速]
40344242             [天津, 市委常委, 名, 省部级, 官员, 建议, 给予, 党纪政纪, 处分]
41433122    [月, 28, 日, ishares, 黄金, 持仓, 持平, /, 白银, 持仓, 增加,...
                                  ...                        
43694664                          [周二, 机构, 强推, 买入, 股, 极度, 低估]
44952856                   [金字, 火腿, 终止, 重大, 资产重组, 27, 日起, 复牌]
32795400                                  [四, 公司, 首发, 申请, 获准]
38880886                                    [日本式, 教育, 输出, 海外]
33440554                         [小伙, 年, 喊, 睡不着, 原来, 矛盾性, 失眠]
Name: NEWS_TITLE, Length: 100000, dtype: object

## 3.1 Bag of words on the dataset

In [16]:
import gensim

In [17]:
# Build the token <-> integer id mapping from the tokenised titles.
dictionary = gensim.corpora.Dictionary(documents=news_title)

In [18]:
# Show the first 11 (token_id, token) pairs of the dictionary, then stop.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 举办
1 人民
2 医院
3 南海区
4 护士节
5 舞会
6 迎
7 H股
8 万科
9 增持
10 海闻


In [19]:
# Prune the vocabulary: drop tokens that occur in fewer than 3 titles or in more
# than 70% of all titles, then keep at most the 100000 most frequent of the rest.
dictionary.filter_extremes(keep_n=100000, no_above=0.7, no_below=3)

In [20]:
dictionary.num_docs

100000

In [21]:
# doc2bow(doc)
# Convert document (a list of words) into the bag-of-words format = list of (token_id, token_count) 2-tuples. 
# Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). 
# No further preprocessing is done on the words in document; apply tokenization, stemming etc. before calling this method.

In [22]:
# Encode every tokenised title as a bag-of-words vector: a list of
# (token_id, token_count) pairs, one list per title.
bow_corpus = list(map(dictionary.doc2bow, news_title))

In [23]:
bow_corpus[1]

[(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]

In [24]:
news_title.iloc[1]

['万科', '独董', '海闻', '钜', '盛华', '增持', 'H股']

In [25]:
# Inspect one title next to its bag-of-words encoding. Tokens that are missing
# from the dictionary (presumably pruned by filter_extremes) do not appear in
# the encoded vector, so fewer words than tokens may be listed.
example_num = 244
example_doc = bow_corpus[example_num]
print("original news:", news_title.iloc[example_num])
for token_id, token_count in example_doc:
    print(f"word ({token_id}, {dictionary[token_id]}) appears {token_count} times")

original news: ['你品', '咖啡', '品什']
word (1368, 咖啡) appears 1 times


In [26]:
bow_corpus[1]

[(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]

## 3.2 TF-IDF on document set

In [27]:
from gensim import corpora, models 

# Fit the TF-IDF weighting on the bag-of-words corpus and show its summary line.
tfidf = models.TfidfModel(corpus=bow_corpus)
print(tfidf)

TfidfModel<num_docs=100000, num_nnz=763788>


In [28]:
# Apply the fitted TF-IDF model to the whole corpus, then look at the weighted
# vector of the title at position 1.
corpus_tfidf = tfidf[bow_corpus]
second_title_weights = corpus_tfidf[1]
print(second_title_weights)

[(4, 0.32176819421611325), (5, 0.2693261910783195), (6, 0.24957520787128146), (7, 0.48719090186963676), (8, 0.40888078438286046), (9, 0.4357968346219557), (10, 0.40888078438286046)]


## 4.1 Running LDA using bag of words

In [27]:
# Fit a 10-topic LDA model on the raw bag-of-words counts (2 passes, 8 workers).
# random_state seeds the model's random initialisation; without it every run
# produces different topics, so the topic listing printed afterwards could not
# be regenerated.
# NOTE(review): with several worker processes, chunk scheduling may still cause
# small run-to-run differences — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=10,
                                       id2word=dictionary,
                                       passes=2,
                                       workers=8,
                                       random_state=42)

In [28]:
# List every topic of the bag-of-words model with its top weighted words,
# followed by two blank lines as a separator.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(f"Topic: {topic_id} \nWords: {topic_terms}", end="\n\n\n")

Topic: 0 
Words: 0.043*"月" + 0.033*"日" + 0.012*"价格" + 0.012*"年" + 0.009*"10" + 0.008*"最新" + 0.007*"12" + 0.007*"2016" + 0.007*"动态" + 0.006*"投资"


Topic: 1 
Words: 0.009*"股东" + 0.008*"公告" + 0.008*"基金" + 0.008*"股份" + 0.007*"机构" + 0.006*"股" + 0.006*"中国" + 0.006*"有限公司" + 0.005*"投资" + 0.005*":"


Topic: 2 
Words: 0.015*"新" + 0.015*"月" + 0.011*"日" + 0.008*"三板" + 0.007*"价格" + 0.007*"生意" + 0.006*"社" + 0.005*"公司" + 0.005*"市场" + 0.004*"动态"


Topic: 3 
Words: 0.014*":" + 0.007*"市场" + 0.006*"美联储" + 0.006*"中国" + 0.006*"加息" + 0.006*"元" + 0.006*"," + 0.005*"收盘" + 0.005*"美元" + 0.005*"震荡"


Topic: 4 
Words: 0.016*"月" + 0.010*"市场" + 0.008*"元" + 0.008*"日" + 0.007*"资金" + 0.006*"快讯" + 0.005*"亿元" + 0.005*"涨停" + 0.005*"人民币" + 0.005*"央行"


Topic: 5 
Words: 0.015*"中国" + 0.007*"股" + 0.007*"发展" + 0.005*":" + 0.005*"公司" + 0.005*"年" + 0.005*"银行" + 0.004*"股份" + 0.004*"亿" + 0.004*"创新"


Topic: 6 
Words: 0.017*":" + 0.011*"," + 0.010*"行业" + 0.006*"投资" + 0.005*"沪" + 0.005*"月" + 0.005*"业绩" + 0.004*"增长" + 0.004*"年" + 0.

## 4.2 Running LDA using TF-IDF

In [29]:
# Fit a second 10-topic LDA model, this time on the TF-IDF weighted corpus
# (2 passes, 8 workers).
# random_state seeds the model's random initialisation; without it every run
# produces different topics, so the topic listing printed afterwards could not
# be regenerated.
# NOTE(review): with several worker processes, chunk scheduling may still cause
# small run-to-run differences — confirm if exact reproducibility is required.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf,
                                             num_topics=10,
                                             id2word=dictionary,
                                             passes=2,
                                             workers=8,
                                             random_state=42)

In [30]:
# List every topic of the TF-IDF model with its top weighted words,
# followed by two blank lines as a separator.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print(f"Topic: {topic_id} Word: {topic_terms}", end="\n\n\n")

Topic: 0 Word: 0.004*"月" + 0.003*"日" + 0.003*"中国" + 0.003*":" + 0.002*"市场" + 0.002*"股" + 0.002*"新" + 0.002*"板块" + 0.002*"动态" + 0.002*"公司"


Topic: 1 Word: 0.005*"月" + 0.004*"日" + 0.004*"中国" + 0.004*"市场" + 0.003*"经济" + 0.003*"价格" + 0.003*":" + 0.002*"黄金" + 0.002*"上海" + 0.002*"美国"


Topic: 2 Word: 0.004*"月" + 0.004*"中国" + 0.003*"日" + 0.003*"公告" + 0.003*":" + 0.003*"市场" + 0.002*"有限公司" + 0.002*"股份" + 0.002*"新" + 0.002*"价格"


Topic: 3 Word: 0.004*"-" + 0.003*"月" + 0.003*"新" + 0.003*"中国" + 0.003*":" + 0.003*"年" + 0.003*"日" + 0.002*"10" + 0.002*"股东" + 0.002*"股份"


Topic: 4 Word: 0.004*"沪" + 0.004*"月" + 0.004*":" + 0.003*"指" + 0.003*"中国" + 0.002*"年" + 0.002*"市场" + 0.002*"," + 0.002*"期货" + 0.002*"日"


Topic: 5 Word: 0.007*"年" + 0.006*"月" + 0.004*"2015" + 0.004*"日" + 0.004*"2016" + 0.004*"万元" + 0.004*":" + 0.004*"提示" + 0.003*"-" + 0.003*"最新"


Topic: 6 Word: 0.004*"中国" + 0.002*":" + 0.002*"新" + 0.002*"机构" + 0.002*"公司" + 0.002*"年" + 0.002*"月" + 0.002*"投资" + 0.002*"市场" + 0.002*"基金"


Topic: 7 Word