In [126]:
# coding: utf-8
"""
tf-idfによるトピックの抽出
"""
from gensim import corpora, models, similarities
import csv
import MeCab
import numpy as np
from sklearn import cluster
import collections

# Read a CSV file into memory.
def readcsv(path):
    """Read the CSV file at ``path`` and return its rows as a list of lists.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A list with one entry per CSV row; each entry is the list of
        field strings produced by ``csv.reader``.
    """
    import sys  # local import: only needed to pick the right open() mode

    if sys.version_info[0] >= 3:
        # Python 3's csv module needs a text-mode file opened with newline="".
        # NOTE(review): UTF-8 is assumed because the notebook declares
        # "coding: utf-8" -- confirm against the actual data files.
        f = open(path, "r", newline="", encoding="utf-8")
    else:
        # Python 2's csv module needs the file in binary mode.
        f = open(path, "rb")
    # The context manager closes the handle even if parsing fails part-way.
    with f:
        return [row for row in csv.reader(f)]

def writecsv(arr, path):
    """Append the rows in ``arr`` to the CSV file at ``path``.

    The file is opened in append mode, so existing content is kept and the
    file is created if it does not exist yet.

    Args:
        arr: Iterable of rows, each row an iterable of field values.
        path: Location of the CSV file to append to.
    """
    import sys  # local import: only needed to pick the right open() mode

    if sys.version_info[0] >= 3:
        # Python 3's csv module needs a text-mode file opened with newline="".
        # NOTE(review): UTF-8 is assumed because the notebook declares
        # "coding: utf-8" -- confirm this matches the consumers of the file.
        f = open(path, "a", newline="", encoding="utf-8")
    else:
        # Python 2's csv module needs the file in binary mode.
        f = open(path, "ab")
    # The context manager closes (and flushes) the handle even if writerows raises.
    with f:
        csv.writer(f).writerows(arr)
    
def parsing(sentence):
    """Extract content words from ``sentence`` using MeCab.

    Walks the node list returned by ``parseToNode`` and keeps a token when its
    first feature field is the noun tag with one of the accepted second-field
    sub-categories, or when its first feature field is the adjective tag.
    For a kept token, feature field index 6 is used when it is present and
    not "*"; otherwise the token's surface string is used.

    The tagger is created on the first call and cached on the function object,
    so the dictionary is not reloaded for every sentence.

    Args:
        sentence: Text to analyse.

    Returns:
        List of extracted words in the order they appear in the node list.
    """
    tagger = getattr(parsing, "_tagger", None)
    if tagger is None:
        tagger = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
        parsing._tagger = tagger

    noun_subtypes = ("一般", "固有名詞", "サ変接続", "形容動詞語幹")
    list_words = []
    node = tagger.parseToNode(sentence)
    while node:
        features = node.feature.split(",")
        pos = features[0]
        # Guard the lookups so a short feature string cannot raise IndexError.
        pos_detail = features[1] if len(features) > 1 else ""
        if pos == "形容詞" or (pos == "名詞" and pos_detail in noun_subtypes):
            base_form = features[6] if len(features) > 6 else "*"
            list_words.append(node.surface if base_form == "*" else base_form)
        node = node.next
    return list_words

# Scale a vector to unit Euclidean length.
def normalize(vector):
    """Return ``vector`` divided by its Euclidean norm.

    A vector whose norm is zero is returned unchanged (the same object),
    which avoids a division by zero.
    """
    length = np.sqrt(np.sum(vector * vector))
    return vector if length == 0 else vector / length

形態素解析して、bag of words を作成

In [112]:
# Load the CSV; the text of each record is taken from its second column.
list_corpus = readcsv("./files/file_1.csv")

# Split every text on the Japanese full stop and flatten into one sentence list.
list_corpuses = []
for row in list_corpus:
    list_corpuses.extend(row[1].split("。"))

# Tokenise each sentence into its list of extracted words.
list_bag_of_words = list(map(parsing, list_corpuses))

保存済みの辞書を読み込み、コーパスの各単語にidを振る

In [113]:
# Load the previously saved gensim dictionary (token <-> id mapping).
dictionary = corpora.Dictionary.load_from_text("./files/dictionary1.txt")

# Convert each tokenised sentence into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, list_bag_of_words))

tf-idfモデルを作成して重みを計算し、スパースな表現を密行列に変換する

In [115]:
# Fit a TF-IDF model on the bag-of-words corpus and weight every document.
tfidf = models.TfidfModel(corpus)
tfidf_corpus = [tfidf[doc] for doc in corpus]

# Size the dense matrix from the data instead of hard-coding its shape, so the
# cell keeps working when the corpus or the dictionary changes.
# The width is "largest term id + 1" so every id doc2bow can emit has a column.
term_ids = dictionary.keys()
n_terms = max(term_ids) + 1 if term_ids else 0
tfidf_dense = np.zeros((len(tfidf_corpus), n_terms))

# Scatter the sparse (term_id, weight) pairs into the dense matrix.
for i, row in enumerate(tfidf_corpus):
    for term_id, weight in row:
        tfidf_dense[i, term_id] = weight

# Scale every document vector to unit length; all-zero rows stay as they are.
tfidf_corpus_rev = [normalize(row) for row in tfidf_dense]

階層クラスタリングにより、tf-idfをクラスタリング

In [117]:
# Agglomerative clustering with Ward linkage on Euclidean distances, 13 clusters.
hieral = cluster.AgglomerativeClustering(
    n_clusters=13,
    affinity='euclidean',
    linkage='ward',
)
# Fit on the TF-IDF vectors and take the cluster label assigned to each one.
result = hieral.fit(tfidf_corpus_rev).labels_

作成したクラスタの中でbag-of-wordsを作成する

In [119]:
# Pool the words of every sentence under the cluster label it was assigned.
dict_cluster_word = collections.defaultdict(list)
for idx, label in enumerate(result):
    words = list_bag_of_words[idx]
    # Only touch the mapping when there is something to add, so a label whose
    # sentences are all empty does not get an (empty) entry.
    if words:
        dict_cluster_word[label].extend(words)

クラスタ内の単語を頻度順に表示

In [123]:
# Cluster label to inspect. The clustering cell requests n_clusters=13, so the
# assigned labels run from 0 to 12; label 13 never occurs and would list nothing.
# NOTE(review): 12 (the last cluster) is assumed to be the intended one --
# change it to whichever label you want to look at.
num = 12

# Count the words in that cluster and order them from most to least frequent.
# .get() avoids inserting an empty entry into the defaultdict for a missing label.
list_word = sorted(collections.Counter(dict_cluster_word.get(num, [])).items(),
                   key=lambda x: x[1], reverse=True)
for word in list_word:
    # Function-call form prints the same text on Python 2 and Python 3.
    print(word[0])