In [40]:
# coding: utf-8
"""
tf-idfによるトピックの抽出
k-meansによるクラスタリング
精度の検証
"""
from gensim import corpora, models, similarities
import csv
import MeCab
import numpy as np
import pickle
from sklearn import cluster
import collections

# csvファイルの読み込み
def readcsv(path):
    f = open(path, "rb")
    dataReader = csv.reader(f)
    arr = [row for row in dataReader]
    return arr

def writecsv(arr, path):
    f = open(path, "ab")
    dataWriter = csv.writer(f)
    dataWriter.writerows(arr)
    f.close()
    
def readdump(path):
    f = open(path, "r")
    arr = pickle.load(f)
    f.close()
    return arr
    
def parsing(sentence):
    mecab = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    res = mecab.parseToNode(sentence)
    list_words = []
    while res:
        features = res.feature.split(",")
        if (features[0] == "名詞" and features[1] in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]) or features[0] == "形容詞":
            if features[6] == "*":
                list_words.append(res.surface)
            else:
                list_words.append(features[6])
        res = res.next
    return list_words

# 正規化する関数
def normalize(vector):
    tmp = np.sqrt(np.sum(vector * vector))
    if tmp == 0:
        return vector
    else:
        return vector / tmp
    
# f_measureを計算する
def cal_f_measure(list_predict_measure):
    # 生成したクラスタ内のカウント
    dict_predict_cluster = collections.defaultdict(list)
    for row in list_predict_measure:
        dict_predict_cluster[row[0]].append(row[1])
        
    # もとあるクラス内のカウント
    dict_measure_cluster = collections.defaultdict(list)
    for row in list_predict_measure:
        dict_measure_cluster[row[1]].append(row[0])
    
    # local_purityの計算
    list_purity = []
    for row in dict_predict_cluster.items():
        major_class = sorted(collections.Counter(row[1]).items(), key=lambda x: x[1], reverse=True)[0][1]
        class_num = len(row[1])
        list_purity.append([major_class, class_num])
    purity = float(np.sum(zip(*list_purity)[0])) / np.sum(zip(*list_purity)[1])
    print "Purity: ", purity
    
    # inverse_purityの計算
    list_inverse_purity = []
    for row in dict_measure_cluster.items():
        major_class = sorted(collections.Counter(row[1]).items(), key=lambda x: x[1], reverse=True)[0][1]
        class_num = len(row[1])
        list_inverse_purity.append([major_class, class_num])
    inverse_purity = float(np.sum(zip(*list_inverse_purity)[0])) / np.sum(zip(*list_inverse_purity)[1])
    print "Inverse Purity: ", inverse_purity
    
    print "F-value: ", 2 / (1 / purity + 1 / inverse_purity)

形態素解析して、bag of words を作成

In [41]:
list_corpus = readdump("./files/list_sep_words_label.dump")
list_corpus, list_label = zip(*list_corpus)
list_corpus = list(list_corpus)
list_label = list(list_label)

In [42]:
collections.Counter(list_label)

Counter({'0': 798, '5': 420, '3': 414, '1': 405, '4': 400, '2': 379, '6': 345, '7': 193, '': 1, '760\xe5\x86\x86\xe3\x81\xae\xe6\x94\xaf\xe6\x89\x95\xe3\x81\xa7\xe3\x80\x81\xe5\xae\xb6\xe8\xb3\x83\xe3\x80\x81\xe7\xae\xa1\xe7\x90\x86\xe8\xb2\xbb\xe3\x80\x81\xe7\x94\x9f\xe6\xb4\xbb\xe3\x82\xb5\xe3\x83\xbc\xe3\x83\x93\xe3\x82\xb9\xe8\xb2\xbb\xe3\x80\x81\xe9\xa3\x9f\xe8\xb2\xbb\xe3\x81\x8c\xe5\x90\xab\xe3\x81\xbf\xe5\x85\xa5\xe3\x82\x8a\xe3\x81\xa7\xe3\x81\x99': 1, '880\xe4\xb8\x87\xe5\x86\x86\xe3\x81\xa83': 1, '500\xe5\x86\x86\xe3\x81\xae\xe5\xae\xb6\xe8\xb3\x83\xe3\x81\xa864': 1, '000\xe5\x86\x86\xe3\x81\xab\xe5\x88\xa5\xe9\x80\x9410': 1, '000\xe5\x86\x86\xe3\x82\x92\xe8\xb6\x85\xe3\x81\x88\xe3\x82\x8b\xe7\xa8\x8b\xe5\xba\xa6\xe3\x81\xab\xe3\x81\xaa\xe3\x82\x8b\xe3\x81\xae\xe3\x81\xa7\xe3\x80\x81\xe8\x80\x81\xe4\xba\xba\xe3\x83\x9b\xe3\x83\xbc\xe3\x83\xa0\xe3\x81\xae\xe6\x96\x99\xe9\x87\x91\xe3\x81\xa8\xe3\x81\x97\xe3\x81\xa6\xe3\x81\xaf\xe3\x82\x84\xe3\x82\x84\xe9\xab\x98\xe3\x82\x81\xe

辞書を作成して、コーパスにidを振る

In [43]:
dictionary = corpora.Dictionary(list_corpus)
list_corpus_id = [dictionary.doc2bow(text) for text in list_corpus]

tfidfモデルを作成して、計算、スパースマトリックスを変換する

In [44]:
tfidf = models.TfidfModel(list_corpus_id)
tfidf_corpus = [tfidf[list_corpus_id[i]] for i in range(len(list_corpus_id))]
tfidf_corpus_rev = np.zeros((len(list_corpus_id), len(dictionary.token2id)))
for i, row in enumerate(tfidf_corpus):
    for j in row:
        tfidf_corpus_rev[i][j[0]] = j[1]
tfidf_corpus_rev = [normalize(row) for row in tfidf_corpus_rev]

k-meansにより、tf-idfをクラスタリング

In [49]:
kmeans = cluster.KMeans(n_clusters=7)
kmeans.fit(tfidf_corpus_rev)
result = kmeans.labels_

In [50]:
collections.Counter(result)

Counter({2: 1979, 0: 487, 4: 250, 1: 218, 6: 213, 3: 149, 5: 65})

予測ラベル、正解ラベルのリストを作成する

In [51]:
print len(result)
print len(list_label)
list_predict_label = []
for i in range(len(result)):
    try:
        list_predict_label.append([result[i], int(list_label[i])])
    except:
        pass
print len(list_predict_label)

3361
3361
3354


クラスタ内の単語を頻度順に表示

In [52]:
cal_f_measure(list_predict_label)

Purity:  0.355694692904
Inverse Purity:  0.590339892665
F-value:  0.443917738386
