In [98]:
import MeCab
import json
import numpy as np
import pandas as pd
import re

In [99]:
with open("../../data/eigacom_review.json", 'r') as f:
    review_all = json.load(f)

In [100]:
m = MeCab.Tagger("-d /usr/lib64/mecab/dic/mecab-ipadic-neologd")

In [101]:
def wakachi(text):
    LINE_SEPARATOR_PATTERN =  "[\n\r\u2028\u2029\u0085]"
    text = re.sub(LINE_SEPARATOR_PATTERN, ' ', text)
    splitted = ' '.join([
        x.split('\t')[0] for x in m.parse(text.strip()).splitlines()[:-1] 
            if x.split('\t')[1].split(',')[0] not in ['助詞', '助動詞', '接続詞', '動詞', '記号']
    ])
    return splitted

In [102]:
def set_stopwords():
    with open("../nlp/stopword_lda.txt", "r") as f:
        stopwords = [line.strip() for line in f]
        stopwords = [ss for ss in stopwords if not ss==u'']
       
    return stopwords

stop_words  = set_stopwords()

In [103]:
data_all = []

for key in review_all.keys():
#     print(key)
    reviews = review_all[key]["reviews"]
    data = [] # １つのデータ映画の全てのレビューを繋げる
    for r in reviews:
        text = r["review"].strip()
        
        tmp = []
        for word in wakachi(text).split():
             # ストップワード除去
            if word not in stop_words and not word.isdigit():
                data.append(word)

    data_all.append(data)

In [104]:
len(data_all)

211

In [115]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# SEED = 6
# np.random.seed(SEED)

# 分類するトピック数
topic_n = 4

# 辞書を作成
dictionary = Dictionary(data_all)

# パラメータを設定
# 1000単語以上には増やさない場合: keep_n=1000
dictionary.filter_extremes(no_below=100,# 出現文書数が100回未満の単語を削除
                           no_above=0.5,# 出現文書率が50％より大きい単語を削除
                           ) 
# 各文書をBag-of-Wordsにより文書ベクトルに変換
corpus_bow = [dictionary.doc2bow(text) for text in data_all]

# LDAの学習
lda = LdaModel(corpus=corpus_bow, num_topics=topic_n, id2word=dictionary)

# LDAの学習には時間がかかるので、学習したモデルは保存
# model_pref = 'model/lda'
# lda.save(model_pref)

In [116]:
for tpn in range(topic_n):
    print('トピック', tpn, ': ', lda.print_topic(tpn, topn = 10))

トピック 0 :  0.469*"原作" + 0.092*"展開" + 0.090*"残念" + 0.086*"役者" + 0.071*"俳優" + 0.060*"無い" + 0.051*"一番" + 0.045*"台詞" + 0.037*"雰囲気"
トピック 1 :  0.194*"役者" + 0.189*"俳優" + 0.138*"展開" + 0.114*"原作" + 0.108*"一番" + 0.089*"無い" + 0.074*"残念" + 0.052*"雰囲気" + 0.042*"台詞"
トピック 2 :  0.171*"無い" + 0.132*"残念" + 0.127*"役者" + 0.116*"一番" + 0.107*"展開" + 0.095*"俳優" + 0.093*"原作" + 0.084*"台詞" + 0.075*"雰囲気"
トピック 3 :  0.217*"原作" + 0.180*"雰囲気" + 0.112*"展開" + 0.108*"台詞" + 0.093*"俳優" + 0.090*"一番" + 0.074*"残念" + 0.067*"役者" + 0.058*"無い"


In [117]:
frame = []

for id in range(1, 212):
    frame.append([id] + [0 for i in range(topic_n)])
    
for id in range(1, 212):
    for num , prob  in lda[corpus_bow[id-1]]:
        frame[id-1][num + 1] = prob

In [118]:
df_columns = ["id"]
for i in range(topic_n):
    df_columns.append("topic_{}".format(str(i)))

df = pd.DataFrame(frame, columns=df_columns)
df = df.set_index('id')
df

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.022205,0.241884,0.022664,0.713247
2,0.250000,0.250000,0.250000,0.250000
3,0.250000,0.250000,0.250000,0.250000
4,0.015955,0.952517,0.015996,0.015532
5,0.250000,0.250000,0.250000,0.250000
6,0.760546,0.181046,0.029300,0.029108
7,0.127010,0.127954,0.129530,0.615506
8,0.864166,0.045183,0.045607,0.045044
9,0.250000,0.250000,0.250000,0.250000
10,0.043968,0.864987,0.046218,0.044827


In [119]:
df.to_pickle("./topics{}.pkl".format(topic_n))

In [66]:
category_all = []

for id in range(1, 211):
#     print(id)
    cate, weight = sorted(lda[corpus_bow[id]], key=lambda x: x[1], reverse=True)[0]
    category_all.append(cate)

In [67]:
with open("../../data/nominate_movie_meta_data.json", 'r') as fs:
    j = json.load(fs)

In [54]:
prize = []
prize_id = []
for i in j:
    for item in j[i]:
        prize.append(item["prize"])
        if item["prize"]:
            prize_id.append(item["id"])

In [55]:
category = []
for id in prize_id:
    print(id)
    cate, wight = sorted(lda[corpus_bow[id]], key=lambda x: x[1], reverse=True)[0]
    category.append(cate)

1
6
11
16
21
26
31
36
41
46
51
56
61
66
71
76
81
86
91
96
101
106
111
116
121
126
131
136
141
146
151
156
161
166
171
176
181
187
192
197
202
207


In [56]:
category_count = []
for i in range(topic_n):
    c = category.count(i)
    category_count.append(c)
    print(i, c)

0 9
1 0
2 4
3 1
4 0
5 0
6 4
7 1
8 3
9 4
10 2
11 1
12 2
13 0
14 0
15 0
16 4
17 1
18 2
19 4


In [57]:
category_all_count = []
for i in range(topic_n):
    c = category_all.count(i)
    category_all_count.append(c)
    print(i, c)

0 45
1 4
2 16
3 10
4 7
5 1
6 15
7 2
8 12
9 20
10 6
11 3
12 6
13 0
14 1
15 0
16 16
17 6
18 6
19 34


In [58]:
for i, p in enumerate(np.array(category_count)/np.array(category_all_count)):
    print(i, p)

0 0.2
1 0.0
2 0.25
3 0.1
4 0.0
5 0.0
6 0.26666666666666666
7 0.5
8 0.25
9 0.2
10 0.3333333333333333
11 0.3333333333333333
12 0.3333333333333333
13 nan
14 0.0
15 nan
16 0.25
17 0.16666666666666666
18 0.3333333333333333
19 0.11764705882352941


  """Entry point for launching an IPython kernel.


In [59]:
for i, p in enumerate(np.array(category_count)/np.sum(category_count)):
    print(i, p)

0 0.21428571428571427
1 0.0
2 0.09523809523809523
3 0.023809523809523808
4 0.0
5 0.0
6 0.09523809523809523
7 0.023809523809523808
8 0.07142857142857142
9 0.09523809523809523
10 0.047619047619047616
11 0.023809523809523808
12 0.047619047619047616
13 0.0
14 0.0
15 0.0
16 0.09523809523809523
17 0.023809523809523808
18 0.047619047619047616
19 0.09523809523809523


In [60]:
for i, p in enumerate(np.array(category_all_count)/np.sum(category_all_count)):
    print(i, p)

0 0.21428571428571427
1 0.01904761904761905
2 0.0761904761904762
3 0.047619047619047616
4 0.03333333333333333
5 0.004761904761904762
6 0.07142857142857142
7 0.009523809523809525
8 0.05714285714285714
9 0.09523809523809523
10 0.02857142857142857
11 0.014285714285714285
12 0.02857142857142857
13 0.0
14 0.004761904761904762
15 0.0
16 0.0761904761904762
17 0.02857142857142857
18 0.02857142857142857
19 0.1619047619047619
