In [1]:
import numpy as np
import pandas as pd
import nltk

# 使用データ  
1つ目の課題と同様に、doc_dir内のdoc1とdoc2を使用。  
それぞれ50件ずつ抽出して、計100件の文書データを使用する。  
すなわち、文書番号0から49はdoc1、文書番号50から99はdoc2である。  
・ doc1 : 夏目漱石「私の個人主義」  
・ doc2 : 宮沢賢治「セロ弾きのゴーシュ」  

In [2]:
# Load both source texts (doc1.txt and doc2.txt) and split each into lines.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used — confirm it matches the encoding of the text files.
doc_origin_list = []
for i in range(1, 3):
    with open("./doc_dir/doc{}.txt".format(i)) as f:
        doc_origin_list.append(f.read().split("\n"))

# Collect the non-empty lines found among the first 100 lines of each file.
# Slicing (rather than indexing positions 0..99) avoids an IndexError when a
# file contains fewer than 100 lines.
raw_docs = []
for lines in doc_origin_list:
    raw_docs.extend(line for line in lines[:100] if line != "")

In [3]:
# Sanity check: number of documents collected across both files.
len(raw_docs)

100

# 前処理

In [4]:
import re
def cleaning_text(text):
    """Return ``text`` with line breaks, blanks and selected symbols removed.

    Three regular-expression substitutions are applied in order; each one
    deletes every character that matches its character class.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    removal_patterns = (
        r"[\r\n]",                   # line-break characters
        r"[\u3000 \t]",              # full-width space, ASCII space, tab
        "[。、―「」!?｜［］＃#)@:]",  # punctuation marks and other symbols
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

In [5]:
# Apply cleaning_text to every raw document, preserving document order.
doc_list = [cleaning_text(doc) for doc in raw_docs]

In [13]:
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome import tokenfilter, charfilter

# Analyzer pipeline: Unicode-normalize the text, tokenize it, then keep only
# tokens whose part of speech is '名詞' (noun).
char_filters = [charfilter.UnicodeNormalizeCharFilter()]
token_filters = [tokenfilter.POSKeepFilter(['名詞'])]
tokenizer = Tokenizer()
# Pass the components by keyword: newer janome releases declare Analyzer's
# arguments as keyword-only, and the keyword form also works on older releases.
analyzer = Analyzer(
    char_filters=char_filters,
    tokenizer=tokenizer,
    token_filters=token_filters,
)

def remove_one_length(token_list):
    """Remove every one-character string from ``token_list`` in place.

    The list is rebuilt through slice assignment instead of calling
    ``list.remove`` while iterating: removing during iteration shifts the
    remaining items and causes the element after each removal to be skipped.

    Args:
        token_list: List of strings; modified in place.

    Returns:
        None.
    """
    token_list[:] = [word for word in token_list if len(word) != 1]

# Tokenize each cleaned document and keep the surface form of every token
# that passes the analyzer's filters, dropping one-character surfaces.
# The length check is applied as each token is produced, so the token list is
# never rescanned or mutated while it is being built.
doc_token = []
for doc in doc_list:
    tmp_token = [
        token.surface
        for token in analyzer.analyze(doc)
        if len(token.surface) != 1
    ]
    doc_token.append(tmp_token)

In [21]:
# Preview the first 11 tokens of the first document.
doc_token[0][:11]

['時間', '批評', 'はず', 'ため', '懊悩', '参考', '注文', '本領', 'ネルソン', 'さん', '奔走']

# LDA

In [22]:
from gensim.corpora import Dictionary
from gensim.models import word2vec
from gensim.models.ldamodel import LdaModel

In [23]:
# Dictionary: word ID, word, and word occurrence count.
dic = Dictionary(doc_token)
print(dic)
# Corpus: each document as a list of (word ID, frequency) pairs.
corpus = [dic.doc2bow(tokens) for tokens in doc_token]
# Show the first five entries of the document at index 1.
print("corpus(前5つ) : " + repr(corpus[1][:5]))

Dictionary(1219 unique tokens: ['あと', 'あなた', 'いっぱい', 'うち', 'お話']...)
corpus(前5つ) : [(1, 1), (3, 1), (7, 1), (12, 1), (16, 2)]


# 複数のトピック数を用いて結果を出力
k= 2,5,10,15で実施

In [24]:
import pyLDAvis.gensim

In [25]:
# Fit an LDA model with k topics and save its pyLDAvis view as vis<k>.html.
k = 2
lda = LdaModel(
    corpus=corpus,
    id2word=dic,
    num_topics=k,
    alpha="auto",
    random_state=0,  # fixed seed so re-running the cell reproduces the same topics
)

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, sort_topics=False)
pyLDAvis.save_html(lda_display, 'vis{}.html'.format(k))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [26]:
# Fit an LDA model with k topics and save its pyLDAvis view as vis<k>.html.
k = 5
lda = LdaModel(
    corpus=corpus,
    id2word=dic,
    num_topics=k,
    alpha="auto",
    random_state=0,  # fixed seed so re-running the cell reproduces the same topics
)

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, sort_topics=False)
pyLDAvis.save_html(lda_display, 'vis{}.html'.format(k))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [27]:
# Fit an LDA model with k topics and save its pyLDAvis view as vis<k>.html.
k = 10
lda = LdaModel(
    corpus=corpus,
    id2word=dic,
    num_topics=k,
    alpha="auto",
    random_state=0,  # fixed seed so re-running the cell reproduces the same topics
)

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, sort_topics=False)
pyLDAvis.save_html(lda_display, 'vis{}.html'.format(k))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [28]:
# Fit an LDA model with k topics and save its pyLDAvis view as vis<k>.html.
k = 15
lda = LdaModel(
    corpus=corpus,
    id2word=dic,
    num_topics=k,
    alpha="auto",
    random_state=0,  # fixed seed so re-running the cell reproduces the same topics
)

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, sort_topics=False)
pyLDAvis.save_html(lda_display, 'vis{}.html'.format(k))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
