In [5]:
# coding: utf-8
"""
Affinity propagationでクラスタリングするためのコード
"""
from sklearn.cluster import AffinityPropagation
import numpy as np
import pickle
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score, completeness_score, homogeneity_completeness_v_measure, homogeneity_score, normalized_mutual_info_score
from sklearn.feature_extraction.text import TfidfVectorizer
from library.filer import Filer
from library.dnp import Evaluation
import glob
from scipy import sparse
import collections

In [12]:
def _term_count_triplets(list_doc, dict_word_id):
    """Collect per-document term counts as three parallel COO-style lists.

    Args:
        list_doc: iterable of documents, each an iterable of words.
        dict_word_id: mapping word -> column index; words missing from it
            are skipped.

    Returns:
        (list_row, list_col, list_count) with one entry per distinct
        (document index, word id) pair.
    """
    list_row, list_col, list_count = [], [], []
    for d, doc in enumerate(list_doc):
        counter = collections.Counter(
            dict_word_id[word] for word in doc if word in dict_word_id)
        for word_id, count in counter.items():
            list_row.append(d)
            list_col.append(word_id)
            list_count.append(count)
    return list_row, list_col, list_count


def _weighted_csr(list_row, list_col, list_count, list_idf, shape):
    """Scale each raw count by the idf of its column and pack into a CSR matrix."""
    rows = np.asarray(list_row, dtype=np.intp)
    cols = np.asarray(list_col, dtype=np.intp)
    data = np.asarray(list_count, dtype=np.float64) * list_idf[cols]
    return sparse.csr_matrix((data, (rows, cols)), shape=shape, dtype=np.float64)


def cal_tfidf(list_word, list_test):
    """Build tf-idf matrices for a training corpus and a test corpus.

    The vocabulary and the idf weights come from ``list_word`` only.
    tf is the raw count of a word in a document and
    idf(w) = ln(D / df(w)) + 1, where D is the number of training documents
    and df(w) the number of training documents containing ``w``.
    Test words that never occur in the training corpus are dropped.

    Columns follow the sorted vocabulary, so the layout is reproducible
    between runs (words must therefore be mutually comparable, e.g. strings).

    Args:
        list_word: training documents, each a sequence of words.
        list_test: test documents, each a sequence of words.

    Returns:
        (list_d_w, list_d_w_test): ``scipy.sparse.csr_matrix`` objects of
        shape (len(list_word), V) and (len(list_test), V), dtype float64,
        where V is the training vocabulary size.
    """
    list_vocab = sorted(set(word for row in list_word for word in row))
    V = len(list_vocab)
    D = len(list_word)
    D_test = len(list_test)
    dict_word_id = {word: i for i, word in enumerate(list_vocab)}

    list_row, list_col, list_data = _term_count_triplets(list_word, dict_word_id)
    list_row_test, list_col_test, list_data_test = _term_count_triplets(
        list_test, dict_word_id)

    # idf computation.  list_col holds one entry per (document, word) pair, so
    # counting it yields document frequencies.  The array is laid out by word
    # id explicitly: relying on the Counter's iteration order would attach the
    # wrong idf to a word whenever that order differs from 0..V-1.
    dict_word_num = collections.Counter(list_col)
    list_df = np.array([dict_word_num[i] for i in range(V)], dtype=np.float64)
    # Every vocabulary word occurs in at least one training document, so
    # list_df has no zeros and the division is safe.
    list_idf = np.log(float(D) / list_df) + 1

    list_d_w = _weighted_csr(list_row, list_col, list_data, list_idf, (D, V))
    list_d_w_test = _weighted_csr(list_row_test, list_col_test, list_data_test,
                                  list_idf, (D_test, V))

    return list_d_w, list_d_w_test

In [13]:
# Where results are written, and the directory prefix stripped from each
# corpus path before it is logged.
outputpath = './files/result/rakuten/experiment4/Affinity.txt'
removepath = 'files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/'

# Preprocessed corpus files, visited in sorted path order.
list_filepath = sorted(
    glob.glob('files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/*.txt'))

# Evaluation data: element 0 of each record is used as the word list and
# element 1 as the gold label.
list_testfile = Filer.readdump('./files/rakuten_corpus/rakuten_corpus_master/testfile/list_sepword_label.dump')
list_testword = [row[0] for row in list_testfile]
list_label = [row[1] for row in list_testfile]

In [14]:
# For each corpus file: fit Affinity Propagation on its tf-idf matrix, assign
# the test documents to the learned clusters, and write the scores to
# outputpath.
for path in list_filepath:
    # Call form: prints the same text on Python 2 and is valid on Python 3.
    print(path)
    path_rev = path.replace(removepath, '')
    # Header line identifying the corpus file the next result line belongs to.
    # NOTE(review): presumably Filer.writetxt appends to outputpath - confirm.
    Filer.writetxt([path_rev], outputpath)

    list_word = Filer.readtxt(path)
    list_d_w, list_d_w_test = cal_tfidf(list_word, list_testword)

    affinity = AffinityPropagation()
    affinity.fit(list_d_w)

    # Number of distinct labels assigned to the training documents.
    K = len(set(affinity.labels_))

    list_predict = affinity.predict(list_d_w_test)

    eva = Evaluation()
    dict_result = eva.cal_f_measure(list_predict, list_label)

    # NMI
    NMI = normalized_mutual_info_score(list_label, list_predict)
    # Homogeneity, completeness and V-measure
    H, C, VM = homogeneity_completeness_v_measure(list_label, list_predict)
    # ARI
    ARI = adjusted_rand_score(list_label, list_predict)
    # AMI
    AMI = adjusted_mutual_info_score(list_label, list_predict)

    # One space-separated result line:
    # purity invpurity fvalue NMI H C VM ARI AMI K
    list_score = [dict_result['purity'], dict_result['invpurity'], dict_result['fvalue'],
                  NMI, H, C, VM, ARI, AMI, K]
    Filer.writetxt([' '.join([str(score) for score in list_score])], outputpath)

files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_005105.txt
