# import

In [19]:
import re

import numpy as np
import pandas as pd

from Ocab import Ocab, Regexp # https://github.com/boomin614/Ocab
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.matutils import corpus2dense

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from wordcloud import WordCloud

# 関数などの定義

In [20]:
# BigQuery connection settings. Both values are placeholders and are passed
# to pd.read_gbq in the data-preparation cell.
project_id = "your_GCP_project_id"
private_key = "path_to_your_private_key"

# Create the Ocab instance used for morphological analysis.
# replace_rule is a nested mapping keyed by Japanese part-of-speech labels
# (noun -> non-independent / proper noun -> person name, region) whose leaves
# are replacement strings.
# NOTE(review): presumably Ocab drops tokens mapped to "" and substitutes the
# literal "{地名}" placeholder for region names -- confirm against the Ocab docs.
replace_rule = {
    '名詞': {
        '非自立': "",
        '固有名詞': {
            '人名': "",
            '地域': "{地名}"
        }
    }
}
# opO looks like a MeCab option string (ChaSen output format plus a user
# dictionary at an absolute path); target lists the part-of-speech classes
# noun, adjective, adjectival verb and verb.
# NOTE(review): presumably only tokens of these classes are kept -- confirm.
ocab = Ocab(
    opO="-Ochasen -u /root/work/pstack/ppp/utils/mecab_user_dic.dic",
    target=["名詞", "形容詞", "形容動詞", "動詞"], replace_rule=replace_rule)

# Regexp helper from Ocab; tokenize_text calls its normalize() on every text.
c = Regexp()

def tokenize_text(text):
    """Turn one raw text record into a list of tokens.

    The input is coerced to ``str``; every newline becomes a "。" sentence
    break and one more "。" is appended. Runs of consecutive "。" are then
    collapsed into a single one. The result is passed through the
    module-level helpers ``c.normalize``, ``ocab.wakati`` and
    ``ocab.removeStoplist`` and finally split on whitespace.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        list: The whitespace-separated tokens of the processed text.
    """
    text = str(text)
    text = text.replace("\n", "。") + "。"
    # str.replace() matches its argument literally, so replacing "。+" never
    # collapsed repeated sentence breaks; a regex substitution is needed.
    text = re.sub("。+", "。", text)
    text = c.normalize(text)
    text = ocab.wakati(text)
    # Fall back to an empty text when wakati does not hand back a string.
    if not isinstance(text, str):
        text = ""
    text = ocab.removeStoplist(text, [])

    return text.split()

In [21]:
def get_word_cloud(pdf_path, lda, nrows, ncols, figsize, dpi=150):
    """Draw one word cloud per topic on a grid and save it as a one-page PDF.

    Panel ``k`` (in ``axes.flat`` order) shows the terms returned by
    ``lda.show_topic(k, 100)``, sized by their weights. The figure is only
    exported to ``pdf_path``; it is closed afterwards so nothing is drawn
    in the notebook.

    Args:
        pdf_path: Destination path of the PDF file.
        lda: Topic model exposing ``show_topic(topic_id, topn)``. If it also
            has a ``num_topics`` attribute, panels beyond that count are
            left blank instead of raising.
        nrows: Number of grid rows.
        ncols: Number of grid columns.
        figsize: ``(width, height)`` of the figure in inches.
        dpi: Figure resolution. Defaults to 150.

    Returns:
        None
    """
    wc = WordCloud(font_path='/usr/share/fonts/truetype/takao-gothic/TakaoGothic.ttf')
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so `.flat`
    # is always available.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, dpi=dpi,
                             squeeze=False)
    num_topics = getattr(lda, "num_topics", None)

    try:
        for k, ax in enumerate(axes.flat):
            if num_topics is not None and k >= num_topics:
                # More panels than topics: leave the surplus panel empty.
                ax.axis("off")
                continue
            topic_word_freq = dict(lda.show_topic(k, 100))
            wc_plt = wc.generate_from_frequencies(topic_word_freq)
            ax.imshow(wc_plt, interpolation='bilinear')
            ax.axis("off")
            ax.set_title("トピック{} ワードクラウド".format(k))

        fig.tight_layout()
        # The context manager closes the PDF even if saving fails, and the
        # figure is passed explicitly instead of relying on pyplot's
        # "current figure".
        with PdfPages(pdf_path) as pdf:
            pdf.savefig(fig)
    finally:
        # fig.clf() only empties the figure; plt.close() also unregisters it
        # from pyplot so it is neither leaked nor echoed by the notebook.
        plt.close(fig)

def get_km_distortion(k, X):
    """Fit k-means with ``k`` clusters on ``X`` and return its distortion.

    Args:
        k: Number of clusters.
        X: Data passed unchanged to ``KMeans.fit``.

    Returns:
        The fitted estimator's ``inertia_`` value.
    """
    # NOTE(review): KMeans is not imported in the visible import cell;
    # presumably `from sklearn.cluster import KMeans` -- confirm and add it.
    # `n_jobs` is no longer passed: scikit-learn deprecated it for KMeans in
    # 0.23 and removed it in 1.0, where it raises TypeError. Omitting it is
    # equivalent to the former n_jobs=1 on older versions.
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(X)
    return km.inertia_

# テキストデータ準備

In [43]:
# Run the query on BigQuery (standard SQL dialect) and dump the result to a
# local text file that the gensim cells read back line by line.
# "query_string" is a placeholder for the actual SQL.
query = "query_string"
res = pd.read_gbq(query, project_id=project_id, private_key=private_key,dialect='standard')
# NOTE(review): to_csv writes a header row by default, and the later cells
# treat every line of text_file.txt as one document -- confirm that the header
# line (and any multi-line quoted field) is meant to be tokenized as well.
res.to_csv("text_file.txt", index=False)

# gensim

## create dictionary

In [6]:
# Build the vocabulary by streaming the dump one line (= one document) at a
# time. The file is opened in a `with` block so the handle is closed as soon
# as the dictionary is built, and explicitly as UTF-8 (the encoding
# DataFrame.to_csv writes by default) instead of the locale default.
with open('text_file.txt', encoding='utf-8') as text_file:
    texts = (tokenize_text(line) for line in text_file)
    dic = corpora.Dictionary(texts)

# Hand-picked noise tokens to remove from the vocabulary.
noise_words = [
    'Д','o','w','m','ー','ω','xE','艸','д','いたす','v','とる','ちゃう','いう','言う','頂ける','会意','下さる','けす',
    '模試','笑笑','よい','ょ','ゎ','⁾⁾','える','致す','k','ら'
]
# Only words that actually occur in the dictionary have an id to drop.
noise_ids = [dic.token2id[word]
             for word in noise_words
             if word in dic.token2id]

dic.filter_tokens(bad_ids=noise_ids)
# Frequency pruning: no_below is a minimum document count, no_above a maximum
# fraction of documents.
dic.filter_extremes(no_below=30, no_above=0.30)
dic.save("./model_gensim/vocab.dict")

## メモリに優しいbowイテレータを作成

In [8]:
class Corpus2bow(object):
    """Memory-friendly, re-iterable bag-of-words corpus backed by a text file.

    Each iteration re-reads ``file_path`` line by line, tokenizes the line
    and yields ``dictionary.doc2bow(tokens)``, so the corpus is never held
    in memory as a whole.

    Args:
        file_path: Path of the UTF-8 text file holding one document per line.
        dictionary: Object with a ``doc2bow(tokens)`` method. Defaults to the
            module-level ``dic``, looked up at iteration time.
        tokenizer: Callable mapping one line to a list of tokens. Defaults to
            the module-level ``tokenize_text``, looked up at iteration time.
    """

    def __init__(self, file_path, dictionary=None, tokenizer=None):
        self.file_path = file_path
        self.dictionary = dictionary
        self.tokenizer = tokenizer

    def __iter__(self):
        # Resolve the fall-backs lazily so a `dic` rebuilt after this object
        # was created is still picked up, as with the original global lookup.
        dictionary = dic if self.dictionary is None else self.dictionary
        tokenizer = tokenize_text if self.tokenizer is None else self.tokenizer
        # `with` closes the handle after every pass; the corpus is iterated
        # repeatedly, and an unclosed handle per pass would leak.
        with open(self.file_path, encoding="utf-8") as text_file:
            for line in text_file:
                yield dictionary.doc2bow(tokenizer(line))

# Streaming corpus over the dumped text file. Only the path is stored here;
# the file is read each time the corpus is iterated.
corpus = Corpus2bow("text_file.txt")

## train lda

In [None]:
# Train LDA on the streaming corpus. num_topics is a free choice; set it to
# whatever number of topics you want to evaluate.
# random_state is fixed so that re-running this cell on the same data is
# repeatable instead of depending on a fresh random initialisation.
lda = LdaModel(corpus=corpus, num_topics=30, id2word=dic, random_state=0)
lda.save("./model_gensim/model.lda")

## ワードクラウドで評価
トピックの特性をワードクラウドで確認し、トピック数の妥当性を検討します。

In [9]:
%%time
# Export a 10 x 2 grid (up to 20 panels, i.e. topics 0-19) to the PDF.
# NOTE(review): the training cell in this notebook sets num_topics=30; if that
# model is the one in scope here, topics 20-29 are not drawn -- confirm the
# grid size.
get_word_cloud('./interpret/lda_wcloud.pdf', lda, nrows=10, ncols=2, figsize=(10,30), dpi=150)

CPU times: user 10.9 s, sys: 1.72 s, total: 12.6 s
Wall time: 11.3 s


<matplotlib.figure.Figure at 0x7f03b94151d0>