In [34]:
import csv
import json
import subprocess

import numpy as np
import MeCab
from sklearn import feature_extraction

dicdir = subprocess.check_output(['mecab-config', '--dicdir']).decode().strip()
m = MeCab.Tagger(' -d {}/mecab-ipadic-neologd'.format(dicdir))

In [35]:
def load_json(path):
    with open(path, "r") as f:
        json_dict = json.load(f)
    return json_dict

In [36]:
def nodes(text):
    node = m.parseToNode(text)
    while node:
        # XXX: Monkey Patching
        node.features = node.feature.split(',')

        if node.features[0] != 'BOS/EOS':
            yield node

        node = node.next

In [37]:
def wakachi(text):
    def wanted(node):
        return node.features[0] not in (
            '助詞',
            '助動詞',
            '接続詞',
            '動詞',
            '記号',
        )

    for node in nodes(text):
        if wanted(node):
            yield node.surface

In [38]:
def set_stopwords():
    stop_word_file_path = "nlp/stopword_japanese.txt"
    with open(stop_word_file_path, "r") as f:
        stopwords = [line.strip() for line in f]
        stopwords = [ss for ss in stopwords if not ss == u'']
    return stopwords

In [39]:
def exclude_stopword(words, stopwords):
    for word in words:
        # XXX: Destructive
        word = word.lower()
        
        if word == "":
            continue

        if word in stopwords:
            continue

        yield word


In [40]:
def main():
    review_json_path = "../data/coco_reviews.json"
    json_dict = load_json(review_json_path)
    reviews = []
    stopwords = set_stopwords()

    for _, jd in sorted(json_dict.items(), key=lambda t: int(t[0])):
        all_reviews = []
        for review_info in jd["reviews"]:
            wkc = wakachi(review_info["review"])
            esw = exclude_stopword(wkc, stopwords)
            all_reviews.extend(esw)

        reviews.append(' '.join(all_reviews))

    # max_dfとmin_dfによって結果は大きく変わる
    vectorizer = feature_extraction.text.TfidfVectorizer(
        decode_error="ignore",
        strip_accents="unicode",
        lowercase=True,
        analyzer="word",
        max_df=0.5,
        min_df=0.3
    )
    
    X = vectorizer.fit_transform(reviews).toarray()
    index = X.argsort(axis=1)[:, ::-1]
    feature_names = np.array(vectorizer.get_feature_names())
    feature_words = feature_names[index]
    
    # FIXME: functionize me
    with open("../data/nlp/tfidf.csv", "w") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerows(feature_words)
    return feature_words

main()

array([['最後', 'もう', '演技', ..., '映像', '展開', 'キャスト'],
       ['面白かっ', 'よかっ', '印象', ..., '意味', '役者', 'こんな'],
       ['面白かっ', 'よかっ', '印象', ..., '意味', '役者', 'こんな'],
       ...,
       ['演出', '意味', '演技', ..., '人間', '俳優', '最高'],
       ['面白い', '原作', '描写', ..., 'とう', 'ラスト', '人間'],
       ['原作', 'キャスト', '面白かっ', ..., '最高', 'ちょっと', 'こんな']], dtype='<U5')