In [36]:
# coding: utf-8
"""
gensimを用いたLDA
purityの計算
"""

from gensim import corpora, models, similarities
import csv
import MeCab
import pickle
import collections
import numpy as np

# csvファイルの読み込み
def readcsv(path):
    f = open(path, "rb")
    dataReader = csv.reader(f)
    arr = [row for row in dataReader]
    return arr

def writecsv(arr, path):
    f = open(path, "ab")
    dataWriter = csv.writer(f)
    dataWriter.writerows(arr)
    f.close()

def readdump(path):
    f = open(path, "r")
    arr = pickle.load(f)
    f.close()
    return arr

# f_measureを計算する
def cal_f_measure(list_predict_measure):
    # 生成したクラスタ内のカウント
    dict_predict_cluster = collections.defaultdict(list)
    for row in list_predict_measure:
        dict_predict_cluster[row[0]].append(row[1])
        
    # もとあるクラス内のカウント
    dict_measure_cluster = collections.defaultdict(list)
    for row in list_predict_measure:
        dict_measure_cluster[row[1]].append(row[0])
    
    # local_purityの計算
    list_purity = []
    for row in dict_predict_cluster.items():
        major_class = sorted(collections.Counter(row[1]).items(), key=lambda x: x[1], reverse=True)[0][1]
        class_num = len(row[1])
        list_purity.append([major_class, class_num])
    purity = float(np.sum(zip(*list_purity)[0])) / np.sum(zip(*list_purity)[1])
    print "Purity: ", purity
    
    # inverse_purityの計算
    list_inverse_purity = []
    for row in dict_measure_cluster.items():
        score1 = len(row[1])
        class_number = sorted(collections.Counter(row[1]).items(), key=lambda x: x[1], reverse=True)[0][0]
        score2 = len(dict_predict_cluster[class_number])
        score3 = sorted(collections.Counter(row[1]).items(), key=lambda x: x[1], reverse=True)[0][1]
        score4 = float(score3) * score2 / score1
        list_inverse_purity.append(score4)
    inverse_purity = np.sum(list_inverse_purity) / len(list_predict_measure)
    print "Inverse Purity: ", inverse_purity
    
    print "F-value: ", 2 / (1 / purity + 1 / inverse_purity)

### コーパスの読み込み
* 1行ずつ形態素解析して、ラベルが付いたものと、口コミごとに形態素解析しているのものの2種類を読み込む

In [28]:
list_sep_words_class = readdump("./files/list_sep_words_label.dump")
list_bag_of_words, list_class = zip(*list_sep_words_class)
list_bag_of_words_1 = list(list_bag_of_words)
list_class = list(list_class)

list_bag_of_words_2 = readdump("./files/list_sep_words_per_human.dump")

### LDAにより、トピックの抽出
1. 辞書の作成
2. 辞書を用いてコーパスのマッピングをする
3. マッピングしたコーパスを元にLDA
4. LDAの結果を表示

In [29]:
# 辞書の作成
dictionary = corpora.Dictionary(list_bag_of_words_2)
# コーパスをマッピングし、LDA
corpus_1 = [dictionary.doc2bow(text) for text in list_bag_of_words_1]
corpus_2 = [dictionary.doc2bow(text) for text in list_bag_of_words_2]
lda = models.LdaModel(corpus=corpus_2, id2word=dictionary, num_topics=7)
# p(word|topic)を表示
for i, row in enumerate(lda.show_topics(0)):
    print row

### 文の予測
1. LDAの結果から各文のクラスを予測
2. 予測ラベルと結果ラベルのリストを作成

In [37]:
list_predict_measure = []
error_count = 0
for i in range(len(corpus_1)):
    predict_class = sorted(lda[corpus_1[i]], key=lambda x:x[1], reverse=True)[0][0]
    measure_class = list_class[i]
    try:
        list_predict_measure.append([predict_class, int(measure_class)])
    except:
        error_count += 1
print "エラーしたコーパス", error_count

エラーしたコーパス 7


### 定量評価 (purity)

In [38]:
cal_f_measure(list_predict_measure)

Purity:  0.237924865832
Inverse Purity:  0.272364438237
F-value:  0.253982483694
