In [36]:
# coding: utf-8
"""
gensimを用いたLDA
purityの計算
"""

from gensim import corpora, models, similarities
import csv
import MeCab
import pickle
import collections
import numpy as np

# csvファイルの読み込み
def readcsv(path):
    f = open(path, "rb")
    dataReader = csv.reader(f)
    arr = [row for row in dataReader]
    return arr

def writecsv(arr, path):
    f = open(path, "ab")
    dataWriter = csv.writer(f)
    dataWriter.writerows(arr)
    f.close()

def readdump(path):
    f = open(path, "r")
    arr = pickle.load(f)
    f.close()
    return arr

# f_measureを計算する
def cal_f_measure(list_predict_measure):
    # 生成したクラスタ内のカウント
    dict_predict_cluster = collections.defaultdict(list)
    for row in list_predict_measure:
        dict_predict_cluster[row[0]].append(row[1])
        
    # もとあるクラス内のカウント
    dict_measure_cluster = collections.defaultdict(list)
    for row in list_predict_measure:
        dict_measure_cluster[row[1]].append(row[0])
    
    # local_purityの計算
    list_purity = []
    for row in dict_predict_cluster.items():
        major_class = sorted(collections.Counter(row[1]).items(), key=lambda x: x[1], reverse=True)[0][1]
        class_num = len(row[1])
        list_purity.append([major_class, class_num])
    purity = float(np.sum(zip(*list_purity)[0])) / np.sum(zip(*list_purity)[1])
    print "Purity: ", purity
    
    # inverse_purityの計算
    list_inverse_purity = []
    for row in dict_measure_cluster.items():
        major_class = sorted(collections.Counter(row[1]).items(), key=lambda x: x[1], reverse=True)[0][1]
        class_num = len(row[1])
        list_inverse_purity.append([major_class, class_num])
    inverse_purity = float(np.sum(zip(*list_inverse_purity)[0])) / np.sum(zip(*list_inverse_purity)[1])
    print "Inverse Purity: ", inverse_purity
    
    print "F-value: ", 2 / (1 / purity + 1 / inverse_purity)

### コーパスの読み込み
* 1行ずつ形態素解析して、ラベルが付いたものと、口コミごとに形態素解析しているのものの2種類を読み込む

In [44]:
list_sep_words = readcsv("./files/edmunds/edmunds_corpus_for_LDA.csv")

### LDAにより、トピックの抽出
1. 辞書の作成
2. 辞書を用いてコーパスのマッピングをする
3. マッピングしたコーパスを元にLDA
4. LDAの結果を表示

In [45]:
# 辞書の作成
dictionary = corpora.Dictionary(list_sep_words)
# コーパスをマッピングし、LDA
corpus = [dictionary.doc2bow(text) for text in list_sep_words]
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=6, alpha='auto', eta='auto')

### 正解コーパスの読み込み

In [46]:
list_id_label_sep = readdump("./files/edmunds/list_id_label_sep.dump")

In [47]:
list_predict_measure = []
for row in list_id_label_sep:
    list_tmp = lda.get_document_topics(dictionary.doc2bow(row[2]))
    predict = sorted(list_tmp, key=lambda x: x[1], reverse=True)[0][0]
    list_predict_measure.append([predict, row[1]])

In [48]:
list_id_label_sep[0][2]

['side', 'mirror', 'switch', 'trunk', 'push', 'button', 'right']

### 定量評価 (purity)

In [49]:
cal_f_measure(list_predict_measure)

Purity:  0.306620209059
Inverse Purity:  1.0
F-value:  0.469333333333
