In [None]:
import pandas as pd
import numpy as np
import jieba
import re
import collections
import jieba.analyse
import codecs
import nltk
import jieba.posseg as pseg
from nltk.corpus import stopwords 

## Import Data

In [None]:
# Load the diary file as a tab-separated table and cast every column to str.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0. Malformed rows are still skipped
# with a warning, as before.
# NOTE(review): despite its name, `path` holds the loaded DataFrame, not a file path.
path = pd.read_csv('diary_pick.txt', sep='\t', on_bad_lines='warn').astype(str)
path.head()

## Jieba

In [None]:
# Segment every diary line with jieba and save the space-joined tokens.
# Both files are managed by one `with` statement, so the output file is
# flushed and closed even if segmentation raises part-way through.
# The input is read as UTF-8 explicitly (the same file is parsed with pandas'
# default UTF-8 decoding when it is loaded as a table) instead of relying on
# the platform's default encoding.
with open("diary_pick.txt", "r", encoding="utf-8") as f, \
        codecs.open("pick_seg.dataset", "w", "utf-8") as wf:
    for line in f:
        words = jieba.cut(line)
        wf.write(" ".join(words))

In [None]:
# Show the segmentation result of every diary, one file line at a time.
# The segmented file is written as UTF-8, so it is read back as UTF-8
# explicitly rather than with the locale's default encoding.
with open("pick_seg.dataset", encoding="utf-8") as fn:
    for line in fn:
        print(line)

## Word cloud

In [None]:
# Word cloud of the segmented diaries.
import matplotlib.pyplot as plt
from wordcloud import WordCloud


# Read the segmented text. The file is written as UTF-8, so decode it as
# UTF-8, and use `with` so the handle is closed after reading.
with open("pick_seg.dataset", encoding="utf-8") as seg_file:
    text = seg_file.read()

# Stop words to leave out of the cloud (just three simple examples).
# NOTE(review): this rebinds the name `stopwords`, hiding the
# `nltk.corpus.stopwords` object imported at the top of the notebook.
stopwords = {}.fromkeys(["分行","顧客","本行"])

wc = WordCloud(font_path="NotoSerifCJKtc-Black.otf",  # a font with CJK glyphs is required
               background_color="white",              # background colour
               max_words=100,                         # maximum number of words shown
               stopwords=stopwords)                   # words to exclude

# Build the word cloud from the text.
wc.generate(text)

# Visualise. The figure must be created BEFORE drawing: calling plt.figure()
# after imshow() opens a second, empty figure, so the requested size and dpi
# would never apply to the word cloud.
plt.figure(figsize=(10,6), dpi = 100)
plt.imshow(wc)
plt.axis("off")
plt.show()



### 主題建模

In [None]:
# Standard-library modules used by the topic-modelling cells.
import logging
import os

# Emit INFO-level log records as "<time> : <level> : <message>".
# Logging is configured before gensim is imported, as in the original order.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

from gensim import corpora, models, similarities

In [None]:
# Load the stop words: strip surrounding whitespace from every line and join
# the lines into one space-separated string.
# NOTE(review): no encoding is given for stopWords.txt, so the platform default
# is used. Confirm the file's encoding and pass it explicitly.
with open("stopWords.txt") as f:
    stop_word_content = " ".join(line.strip() for line in f)

# Build the dictionary for this corpus, assigning an integer id to every
# token. Each line of the segmented file is one document. The file is opened
# with `with` (so the handle is closed) and decoded as UTF-8, the encoding it
# is written with.
with open("pick_seg.dataset", encoding="utf-8") as seg_file:
    dictionary = corpora.Dictionary(document.split() for document in seg_file)

stoplist = set(stop_word_content.split())
# dictionary.token2id maps each token to its id, so this collects the ids of
# the stop words that actually occur in the corpus.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)  # remove the stop words
dictionary.compactify()  # remove gaps in the id sequence after words were removed
dictionary.save("RM_diary.dict")

In [None]:
# Print every token together with the integer id the dictionary assigned to it.
for token, token_id in dictionary.token2id.items():
    print(f"{token} id:{token_id}")

In [None]:
# Tokenise each segmented diary (one per line) and drop the stop words.
# The file is opened with `with` so the handle is closed, and decoded as
# UTF-8, the encoding the segmented file is written with.
with open("pick_seg.dataset", encoding="utf-8") as seg_file:
    texts = [[word for word in document.split() if word not in stoplist]
             for document in seg_file]

# Count how often each token occurs across all diaries.
# The loop variable is deliberately not called `text`, so it does not
# overwrite the `text` string used by the word-cloud cell.
from collections import defaultdict
frequency = defaultdict(int)
for doc_tokens in texts:
    for token in doc_tokens:
        frequency[token] += 1

# Remove tokens that occur only once in the whole corpus.
texts = [[token for token in doc_tokens if frequency[token] > 1]
         for doc_tokens in texts]
#---------------------------------------------------------------
# Convert every document to a bag-of-words vector using the dictionary,
# then serialize the corpus to disk in Matrix Market format.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in texts]
corpora.MmCorpus.serialize("diary_pick.mm", corpus)

In [None]:
# Reload the saved dictionary and bag-of-words corpus; they are the input of
# the TF-IDF model, which re-weights the term vectors by term importance.
# Both files are required. The original only checked the dictionary file, so
# a missing corpus file raised inside MmCorpus instead of reaching the
# message below.
if os.path.exists("RM_diary.dict") and os.path.exists("diary_pick.mm"):
    dictionary = corpora.Dictionary.load("RM_diary.dict")
    corpus = corpora.MmCorpus("diary_pick.mm")  # corpus read from the Matrix Market file
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)

# Apply the model to the corpus to get its TF-IDF vector representation.
corpus_tfidf = tfidf[corpus]

In [None]:
# Fit an LSI (latent semantic indexing) model with 10 topics on the TF-IDF corpus.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
)
# Project the TF-IDF corpus into the LSI topic space.
corpus_lsi = lsi[corpus_tfidf]

# Persist the fitted model and the projected corpus.
lsi.save("rm_diary.lsi")
corpora.MmCorpus.serialize("lsi_rm_diary.mm", corpus_lsi)

print("LSI topics:")
lsi.print_topics(3)

# Author's note: this shows each diary's weight on each topic; the number of
# topics is set to 10 here.

In [None]:
# Build the similarity index over all diaries in LSI space.
# The LSI model was fitted on TF-IDF vectors, so the indexed documents must be
# TF-IDF weighted as well. The original passed the raw bag-of-words `corpus`,
# which puts the index on a different scale from the model's training data.
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
index.save("rm_diary.index")

# Query vector. The original cell used `vec_lsi` without defining it anywhere
# in the notebook, so it raised NameError on a fresh kernel.
# NOTE(review): the first diary is used here as a placeholder query. Replace
# QUERY_DOC_ID, or build `vec_lsi` from the intended query text with
# dictionary.doc2bow -> tfidf -> lsi.
QUERY_DOC_ID = 0
vec_lsi = lsi[tfidf[corpus[QUERY_DOC_ID]]]

# Compute the similarities and print the five most similar diaries as
# (document number, similarity) pairs, highest similarity first.
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims[:5])