In [1]:
import codecs as cd
import gensim
from janome.tokenizer import Tokenizer
from gensim import corpora, models, similarities



In [2]:
filename = 'ZIT_PATENT.txt'
# Read all lines of the UTF-8 input file. The context manager closes the
# handle as soon as reading finishes, even if readlines() raises.
with cd.open(filename, 'r', 'utf-8') as file:
    lines = file.readlines()

# Janome tokenizer instance, created once.
t = Tokenizer()
# Empty list that will hold the per-line word lists.
wvs = []

In [3]:
# Start from an empty list so that re-running this cell does not append the
# same documents to `wvs` a second time.
wvs = []

for line in lines:
    # Ignore lines that are too short (fewer than 30 characters).
    if len(line) < 30:
        continue

    # Word vector for this line (one "page" per the original comment): the
    # base forms of every token whose part of speech starts with
    # noun (名詞) or verb (動詞).
    word_vector = [
        token.base_form
        for token in t.tokenize(line)
        if token.part_of_speech[:2] in ('名詞', '動詞')
    ]

    # Append this line's words to the collection.
    wvs.append(word_vector)

In [4]:
# Build the dictionary from the word lists, drop tokens that appear in fewer
# than 2 documents or in more than 30% of them, and save it as text.
dictionary = corpora.Dictionary(wvs)
dictionary.filter_extremes(no_below=2, no_above=0.3)
dictionary.save_as_text('dict.txt')

# Convert every word list to its bag-of-words form and serialize the
# resulting corpus in Matrix Market format.
corpus = list(map(dictionary.doc2bow, wvs))
corpora.MmCorpus.serialize('cop.mm', corpus)

In [5]:
# Load the dictionary and the corpus back from the files on disk.
dictionary = corpora.Dictionary.load_from_text('dict.txt')
corpus = corpora.MmCorpus('cop.mm')

In [6]:
topic_N = 40
# LDA training is randomized; a fixed seed makes the topics reproducible
# across kernel restarts.
random_seed = 0
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=topic_N,
    id2word=dictionary,
    random_state=random_seed,
)

# Print the string representation of every topic.
for i in range(topic_N):
    print('TOPIC:', i, '__', lda.print_topic(i))

TOPIC: 0 __ 0.045*"前記" + 0.013*"手段" + 0.013*"決済" + 0.013*"ステップ" + 0.011*"位置" + 0.011*"車両" + 0.011*"処理" + 0.010*"画像" + 0.010*"出力" + 0.009*"電子"
TOPIC: 1 __ 0.027*"端末" + 0.018*"制御" + 0.018*"５" + 0.017*"４" + 0.015*"通信" + 0.015*"手段" + 0.014*"判定" + 0.012*"機器" + 0.010*"携帯" + 0.010*"送信"
TOPIC: 2 __ 0.045*"前記" + 0.019*"車両" + 0.016*"メイク" + 0.014*"状態" + 0.013*"量" + 0.011*"制御" + 0.010*"通信" + 0.009*"作業" + 0.008*"特定" + 0.008*"機"
TOPIC: 3 __ 0.020*"通信" + 0.018*"前記" + 0.016*"表示" + 0.013*"車両" + 0.012*"手段" + 0.011*"使用" + 0.011*"制御" + 0.011*"通知" + 0.010*"電力" + 0.010*"者"
TOPIC: 4 __ 0.027*"複数" + 0.020*"値" + 0.018*"前記" + 0.016*"表示" + 0.012*"検出" + 0.012*"６" + 0.011*"５" + 0.010*"得る" + 0.009*"制御" + 0.008*"１つ"
TOPIC: 5 __ 0.018*"ユーザ" + 0.014*"表示" + 0.012*"イベント" + 0.012*"商品" + 0.010*"受信" + 0.010*"所定" + 0.009*"生成" + 0.009*"処理" + 0.009*"４" + 0.009*"複数"
TOPIC: 6 __ 0.019*"画像" + 0.015*"前記" + 0.015*"制御" + 0.014*"処理" + 0.013*"照明" + 0.012*"機器" + 0.012*"移動" + 0.010*"表示" + 0.010*"撮像" + 0.010*"４"
TOPIC: 7 __ 0.048*"ユーザ" 