In [1]:
import MeCab
import json
import numpy as np
import re

In [2]:
with open("../../data/eigacom_review.json", 'r') as f:
    review_all = json.load(f)

In [3]:
m = MeCab.Tagger("-d /usr/lib64/mecab/dic/mecab-ipadic-neologd")

In [4]:
def wakachi(text):
    LINE_SEPARATOR_PATTERN =  "[\n\r\u2028\u2029\u0085]"
    text = re.sub(LINE_SEPARATOR_PATTERN, ' ', text)
    splitted = ' '.join([
        x.split('\t')[0] for x in m.parse(text.strip()).splitlines()[:-1] 
            if x.split('\t')[1].split(',')[0] not in ['助詞', '助動詞', '接続詞', '動詞', '記号']
    ])
    return splitted

In [5]:
def set_stopwords():
    with open("../nlp/stopword_japanese_lda.txt", "r") as f:
        stopwords = [line.strip() for line in f]
        stopwords = [ss for ss in stopwords if not ss==u'']
       
    return stopwords

stop_words  = set_stopwords()

In [6]:
data_all = []

for key in review_all.keys():
#     print(key)
    reviews = review_all[key]["reviews"]
    data = [] # １つのデータ映画の全てのレビューを繋げる
    for r in reviews:
        text = r["review"].strip()
        
        tmp = []
        for word in wakachi(text).split():
             # ストップワード除去
            if word not in stop_words and not word.isdigit():
                data.append(word)

    data_all.append(data)

In [7]:
len(data_all)

211

In [8]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# SEED = 6
# np.random.seed(SEED)

# 分類するトピック数
topic_n = 10

# 辞書を作成
dictionary = Dictionary(data_all)

# パラメータを設定
# 1000単語以上には増やさない場合: keep_n=1000
dictionary.filter_extremes(no_below=100,# 出現文書数が100回未満の単語を削除
                           no_above=0.5,# 出現文書率が50％より大きい単語を削除
                           ) 
# 各文書をBag-of-Wordsにより文書ベクトルに変換
corpus_bow = [dictionary.doc2bow(text) for text in data_all]

# LDAの学習
lda = LdaModel(corpus=corpus_bow, num_topics=topic_n, id2word=dictionary)

# LDAの学習には時間がかかるので、学習したモデルは保存
# model_pref = 'model/lda'
# lda.save(model_pref)

In [9]:
for tpn in range(topic_n):
    print('トピック', tpn, ': ', lda.print_topic(tpn, topn = 10))

トピック 0 :  0.246*"残念" + 0.162*"無い" + 0.122*"台詞" + 0.117*"展開" + 0.111*"原作" + 0.106*"部分" + 0.085*"一番" + 0.052*"雰囲気"
トピック 1 :  0.413*"原作" + 0.125*"部分" + 0.103*"展開" + 0.099*"無い" + 0.089*"残念" + 0.086*"一番" + 0.047*"雰囲気" + 0.037*"台詞"
トピック 2 :  0.493*"原作" + 0.092*"無い" + 0.078*"展開" + 0.077*"残念" + 0.074*"台詞" + 0.068*"部分" + 0.066*"雰囲気" + 0.053*"一番"
トピック 3 :  0.247*"原作" + 0.176*"部分" + 0.152*"展開" + 0.101*"無い" + 0.099*"残念" + 0.093*"雰囲気" + 0.081*"一番" + 0.052*"台詞"
トピック 4 :  0.261*"部分" + 0.166*"展開" + 0.142*"残念" + 0.112*"無い" + 0.099*"一番" + 0.094*"雰囲気" + 0.089*"台詞" + 0.037*"原作"
トピック 5 :  0.263*"原作" + 0.205*"残念" + 0.116*"部分" + 0.101*"一番" + 0.094*"展開" + 0.092*"無い" + 0.072*"台詞" + 0.056*"雰囲気"
トピック 6 :  0.221*"原作" + 0.191*"無い" + 0.121*"展開" + 0.115*"一番" + 0.103*"雰囲気" + 0.103*"部分" + 0.077*"残念" + 0.069*"台詞"
トピック 7 :  0.570*"原作" + 0.112*"残念" + 0.075*"展開" + 0.068*"一番" + 0.065*"部分" + 0.041*"無い" + 0.039*"台詞" + 0.031*"雰囲気"
トピック 8 :  0.220*"一番" + 0.209*"展開" + 0.156*"部分" + 0.126*"残念" + 0.098*"無い" + 0.090*"台詞" + 0.070*"雰

In [10]:
category_all = []

for id in range(1, 211):
#     print(id)
    cate, weight = sorted(lda[corpus_bow[id]], key=lambda x: x[1], reverse=True)[0]
    category_all.append(cate)

In [11]:
with open("../../data/nominate_movie_meta_data.json", 'r') as fs:
    j = json.load(fs)

In [12]:
prize = []
prize_id = []
for i in j:
    for item in j[i]:
        prize.append(item["prize"])
        if item["prize"]:
            prize_id.append(item["id"])

In [13]:
category = []
for id in prize_id:
    print(id)
    cate, wight = sorted(lda[corpus_bow[id]], key=lambda x: x[1], reverse=True)[0]
    category.append(cate)

1
6
11
16
21
26
31
36
41
46
51
56
61
66
71
76
81
86
91
96
101
106
111
116
121
126
131
136
141
146
151
156
161
166
171
176
181
187
192
197
202
207


In [14]:
category_count = []
for i in range(topic_n):
    c = category.count(i)
    category_count.append(c)
    print(i, c)

0 14
1 2
2 4
3 2
4 10
5 1
6 4
7 2
8 2
9 1


In [15]:
category_all_count = []
for i in range(topic_n):
    c = category_all.count(i)
    category_all_count.append(c)
    print(i, c)

0 66
1 3
2 16
3 13
4 24
5 5
6 19
7 24
8 33
9 7


In [16]:
for i, p in enumerate(np.array(category_count)/np.array(category_all_count)):
    print(i, p)

0 0.21212121212121213
1 0.6666666666666666
2 0.25
3 0.15384615384615385
4 0.4166666666666667
5 0.2
6 0.21052631578947367
7 0.08333333333333333
8 0.06060606060606061
9 0.14285714285714285


In [17]:
for i, p in enumerate(np.array(category_count)/np.sum(category_count)):
    print(i, p)

0 0.3333333333333333
1 0.047619047619047616
2 0.09523809523809523
3 0.047619047619047616
4 0.23809523809523808
5 0.023809523809523808
6 0.09523809523809523
7 0.047619047619047616
8 0.047619047619047616
9 0.023809523809523808


In [18]:
for i, p in enumerate(np.array(category_all_count)/np.sum(category_all_count)):
    print(i, p)

0 0.3142857142857143
1 0.014285714285714285
2 0.0761904761904762
3 0.06190476190476191
4 0.11428571428571428
5 0.023809523809523808
6 0.09047619047619047
7 0.11428571428571428
8 0.15714285714285714
9 0.03333333333333333
