In [79]:
import MeCab
import json
import numpy as np
import pandas as pd
import re

In [80]:
with open("../../data/eigacom_review.json", 'r') as f:
    review_all = json.load(f)

In [81]:
m = MeCab.Tagger("-d /usr/lib64/mecab/dic/mecab-ipadic-neologd")

In [82]:
def wakachi(text):
    LINE_SEPARATOR_PATTERN =  "[\n\r\u2028\u2029\u0085]"
    text = re.sub(LINE_SEPARATOR_PATTERN, ' ', text)
    splitted = ' '.join([
        x.split('\t')[0] for x in m.parse(text.strip()).splitlines()[:-1] 
            if x.split('\t')[1].split(',')[0] not in ['助詞', '助動詞', '接続詞', '動詞', '記号']
    ])
    return splitted

In [83]:
def set_stopwords():
    with open("../nlp/stopword_lda.txt", "r") as f:
        stopwords = [line.strip() for line in f]
        stopwords = [ss for ss in stopwords if not ss==u'']
       
    return stopwords

stop_words  = set_stopwords()

In [84]:
data_all = []

for key in review_all.keys():
#     print(key)
    reviews = review_all[key]["reviews"]
    data = [] # １つのデータ映画の全てのレビューを繋げる
    for r in reviews:
        text = r["review"].strip()
        
        tmp = []
        for word in wakachi(text).split():
             # ストップワード除去
            if word not in stop_words and not word.isdigit():
                data.append(word)

    data_all.append(data)

In [85]:
len(data_all)

211

In [86]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# SEED = 6
# np.random.seed(SEED)

# 分類するトピック数
topic_n = 10

# 辞書を作成
dictionary = Dictionary(data_all)

# パラメータを設定
# 1000単語以上には増やさない場合: keep_n=1000
dictionary.filter_extremes(no_below=100,# 出現文書数が100回未満の単語を削除
                           no_above=0.5,# 出現文書率が50％より大きい単語を削除
                           ) 
# 各文書をBag-of-Wordsにより文書ベクトルに変換
corpus_bow = [dictionary.doc2bow(text) for text in data_all]

# LDAの学習
lda = LdaModel(corpus=corpus_bow, num_topics=topic_n, id2word=dictionary)

# LDAの学習には時間がかかるので、学習したモデルは保存
# model_pref = 'model/lda'
# lda.save(model_pref)

In [87]:
for tpn in range(topic_n):
    print('トピック', tpn, ': ', lda.print_topic(tpn, topn = 10))

トピック 0 :  0.340*"原作" + 0.169*"無い" + 0.161*"役者" + 0.096*"残念" + 0.081*"雰囲気" + 0.043*"俳優" + 0.042*"台詞" + 0.040*"展開" + 0.028*"一番"
トピック 1 :  0.195*"展開" + 0.165*"無い" + 0.163*"残念" + 0.136*"一番" + 0.105*"俳優" + 0.082*"台詞" + 0.079*"役者" + 0.058*"雰囲気" + 0.015*"原作"
トピック 2 :  0.453*"原作" + 0.093*"雰囲気" + 0.082*"無い" + 0.078*"俳優" + 0.076*"残念" + 0.061*"台詞" + 0.060*"一番" + 0.055*"展開" + 0.042*"役者"
トピック 3 :  0.208*"原作" + 0.161*"役者" + 0.115*"俳優" + 0.110*"展開" + 0.100*"無い" + 0.098*"一番" + 0.086*"残念" + 0.062*"台詞" + 0.060*"雰囲気"
トピック 4 :  0.219*"原作" + 0.176*"残念" + 0.140*"雰囲気" + 0.126*"展開" + 0.111*"俳優" + 0.097*"無い" + 0.047*"役者" + 0.042*"一番" + 0.041*"台詞"
トピック 5 :  0.246*"原作" + 0.182*"雰囲気" + 0.110*"無い" + 0.100*"役者" + 0.089*"台詞" + 0.074*"一番" + 0.070*"残念" + 0.065*"俳優" + 0.064*"展開"
トピック 6 :  0.414*"原作" + 0.153*"残念" + 0.085*"役者" + 0.085*"俳優" + 0.081*"展開" + 0.051*"一番" + 0.048*"無い" + 0.048*"台詞" + 0.035*"雰囲気"
トピック 7 :  0.530*"原作" + 0.099*"展開" + 0.077*"俳優" + 0.068*"残念" + 0.055*"役者" + 0.048*"雰囲気" + 0.048*"無い" + 0.042*"一番" + 0.0

In [94]:
frame = []

for id in range(1, 211):
    frame.append([id] + [0 for i in range(topic_n)])
    
for id in range(1, 211):
    for topic_num , prob  in lda[corpus_bow[id]]:
        frame[id-1][topic_num + 1] = prob

In [100]:
df = pd.DataFrame(frame, columns=['id', 'topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9'])
df = df.set_index('id')
df.head()

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.947041,0.0
4,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
5,0.690683,0.011113,0.011115,0.011115,0.011114,0.011114,0.011115,0.220401,0.011116,0.011115


In [None]:
df.to_pickle("./topics.pkl")

In [66]:
category_all = []

for id in range(1, 211):
#     print(id)
    cate, weight = sorted(lda[corpus_bow[id]], key=lambda x: x[1], reverse=True)[0]
    category_all.append(cate)

In [67]:
with open("../../data/nominate_movie_meta_data.json", 'r') as fs:
    j = json.load(fs)

In [54]:
prize = []
prize_id = []
for i in j:
    for item in j[i]:
        prize.append(item["prize"])
        if item["prize"]:
            prize_id.append(item["id"])

In [55]:
category = []
for id in prize_id:
    print(id)
    cate, wight = sorted(lda[corpus_bow[id]], key=lambda x: x[1], reverse=True)[0]
    category.append(cate)

1
6
11
16
21
26
31
36
41
46
51
56
61
66
71
76
81
86
91
96
101
106
111
116
121
126
131
136
141
146
151
156
161
166
171
176
181
187
192
197
202
207


In [56]:
category_count = []
for i in range(topic_n):
    c = category.count(i)
    category_count.append(c)
    print(i, c)

0 9
1 0
2 4
3 1
4 0
5 0
6 4
7 1
8 3
9 4
10 2
11 1
12 2
13 0
14 0
15 0
16 4
17 1
18 2
19 4


In [57]:
category_all_count = []
for i in range(topic_n):
    c = category_all.count(i)
    category_all_count.append(c)
    print(i, c)

0 45
1 4
2 16
3 10
4 7
5 1
6 15
7 2
8 12
9 20
10 6
11 3
12 6
13 0
14 1
15 0
16 16
17 6
18 6
19 34


In [58]:
for i, p in enumerate(np.array(category_count)/np.array(category_all_count)):
    print(i, p)

0 0.2
1 0.0
2 0.25
3 0.1
4 0.0
5 0.0
6 0.26666666666666666
7 0.5
8 0.25
9 0.2
10 0.3333333333333333
11 0.3333333333333333
12 0.3333333333333333
13 nan
14 0.0
15 nan
16 0.25
17 0.16666666666666666
18 0.3333333333333333
19 0.11764705882352941


  """Entry point for launching an IPython kernel.


In [59]:
for i, p in enumerate(np.array(category_count)/np.sum(category_count)):
    print(i, p)

0 0.21428571428571427
1 0.0
2 0.09523809523809523
3 0.023809523809523808
4 0.0
5 0.0
6 0.09523809523809523
7 0.023809523809523808
8 0.07142857142857142
9 0.09523809523809523
10 0.047619047619047616
11 0.023809523809523808
12 0.047619047619047616
13 0.0
14 0.0
15 0.0
16 0.09523809523809523
17 0.023809523809523808
18 0.047619047619047616
19 0.09523809523809523


In [60]:
for i, p in enumerate(np.array(category_all_count)/np.sum(category_all_count)):
    print(i, p)

0 0.21428571428571427
1 0.01904761904761905
2 0.0761904761904762
3 0.047619047619047616
4 0.03333333333333333
5 0.004761904761904762
6 0.07142857142857142
7 0.009523809523809525
8 0.05714285714285714
9 0.09523809523809523
10 0.02857142857142857
11 0.014285714285714285
12 0.02857142857142857
13 0.0
14 0.004761904761904762
15 0.0
16 0.0761904761904762
17 0.02857142857142857
18 0.02857142857142857
19 0.1619047619047619
