In [1]:
import csv
import json
import subprocess

import numpy as np
import MeCab
from sklearn import feature_extraction

dicdir = subprocess.check_output(['mecab-config', '--dicdir']).decode().strip()
m = MeCab.Tagger(' -d {}/mecab-ipadic-neologd'.format(dicdir))

In [5]:
def load_json(path):
    with open(path, "r") as f:
        json_dict = json.load(f)
    return json_dict

In [6]:
def nodes(text):
    node = m.parseToNode(text)
    while node:
        # XXX: Monkey Patching
        node.features = node.feature.split(',')

        if node.features[0] != 'BOS/EOS':
            yield node

        node = node.next

In [7]:
def wakachi(text):
    def wanted(node):
        return node.features[0] not in (
            '助詞',
            '助動詞',
            '接続詞',
            '動詞',
            '記号',
        )

    for node in nodes(text):
        if wanted(node):
            yield node.surface

In [8]:
def set_stopwords():
    stop_word_file_path = "nlp/stopword_japanese.txt"
    with open(stop_word_file_path, "r") as f:
        stopwords = [line.strip() for line in f]
        stopwords = [ss for ss in stopwords if not ss == u'']
    return stopwords

In [9]:
def exclude_stopword(words, stopwords):
    for word in words:
        # XXX: Destructive
        word = word.lower()
        
        if word == "":
            continue

        if word in stopwords:
            continue

        yield word


In [11]:
def main():
    review_json_path = "../data/coco_reviews.json"
    json_dict = load_json(review_json_path)
    reviews = []
    stopwords = set_stopwords()

    for _, jd in sorted(json_dict.items(), key=lambda t: int(t[0])):
        all_reviews = []
        for review_info in jd["reviews"]:
            wkc = wakachi(review_info["review"])
            esw = exclude_stopword(wkc, stopwords)
            all_reviews.extend(esw)

        reviews.append(' '.join(all_reviews))

    # max_dfとmin_dfによって結果は大きく変わる
    vectorizer = feature_extraction.text.TfidfVectorizer(
        decode_error="ignore",
        strip_accents="unicode",
        lowercase=True,
        analyzer="word",
        max_df=0.5,
        min_df=0.3,
    )
    
    X = vectorizer.fit_transform(reviews).toarray()
    # index = X.argsort(axis=1)[:, ::-1]
    feature_names = np.array(vectorizer.get_feature_names())
    # feature_words = feature_names[index]
    
    # FIXME: functionize me
    with open("../data/nlp/tfidf.csv", "w") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(feature_names)
        writer.writerows(X)
    return feature_names, X

main()

(array(['こんな', 'その', 'そんな', 'ちょっと', 'とう', 'とても', 'なく', 'もう', 'もっと', 'やっはり',
        'よかっ', 'よく', 'キャスト', 'ストーリー', 'ラスト', '人間', '俳優', '内容', '印象', '原作',
        '台詞', '好き', '展開', '役者', '意味', '描写', '日本', '映像', '最後', '最高', '本当に',
        '気持ち', '演出', '演技', '物語', '特に', '素晴らしい', '良かっ', '良く', '雰囲気', '面白い',
        '面白かっ'], dtype='<U5'),
 array([[0.2454537 , 0.13853823, 0.08624507, ..., 0.08624507, 0.07596344,
         0.08624507],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.21019285, 0.05931821, 0.        , ..., 0.        , 0.        ,
         0.07385547],
        [0.09106097, 0.10279272, 0.06399219, ..., 0.06399219, 0.42272561,
         0.28796488],
        [0.        , 0.20989803, 0.03733404, ..., 0.03733404, 0.06576658,
         0.2986723 ]]))