In [1]:
import csv
import json
import subprocess

import numpy as np
import MeCab
from sklearn import feature_extraction

dicdir = subprocess.check_output(['mecab-config', '--dicdir']).decode().strip()
m = MeCab.Tagger(' -d {}/mecab-ipadic-neologd'.format(dicdir))

In [5]:
def load_json(path):
    with open(path, "r") as f:
        json_dict = json.load(f)
    return json_dict

In [6]:
def nodes(text):
    node = m.parseToNode(text)
    while node:
        # XXX: Monkey Patching
        node.features = node.feature.split(',')

        if node.features[0] != 'BOS/EOS':
            yield node

        node = node.next

In [7]:
def wakachi(text):
    def wanted(node):
        return node.features[0] not in (
            '助詞',
            '助動詞',
            '接続詞',
            '動詞',
            '記号',
        )

    for node in nodes(text):
        if wanted(node):
            yield node.surface

In [8]:
def set_stopwords():
    stop_word_file_path = "nlp/stopword_japanese.txt"
    with open(stop_word_file_path, "r") as f:
        stopwords = [line.strip() for line in f]
        stopwords = [ss for ss in stopwords if not ss == u'']
    return stopwords

In [9]:
def exclude_stopword(words, stopwords):
    for word in words:
        # XXX: Destructive
        word = word.lower()
        
        if word == "":
            continue

        if word in stopwords:
            continue

        yield word


In [12]:
def main():
    review_json_path = "../data/coco_reviews.json"
    json_dict = load_json(review_json_path)
    reviews = []
    stopwords = set_stopwords()

    for _, jd in sorted(json_dict.items(), key=lambda t: int(t[0])):
        all_reviews = []
        for review_info in jd["reviews"]:
            wkc = wakachi(review_info["review"])
            esw = exclude_stopword(wkc, stopwords)
            all_reviews.extend(esw)

        reviews.append(' '.join(all_reviews))

    # max_dfとmin_dfによって結果は大きく変わる
    vectorizer = feature_extraction.text.TfidfVectorizer(
        decode_error="ignore",
        strip_accents="unicode",
        lowercase=True,
        analyzer="word",
        max_df=0.5,
        min_df=0.3,
    )
    
    X = vectorizer.fit_transform(reviews).toarray()
    # index = X.argsort(axis=1)[:, ::-1]
    feature_names = np.array(vectorizer.get_feature_names())
    # feature_words = feature_names[index]
    
    # FIXME: functionize me
    with open("../data/nlp/tfidf.csv", "w") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(feature_names)
        writer.writerows(X)
    return feature_names, X

feature_names, X = main()

In [23]:
import pandas as pd
df_X = pd.DataFrame(X)
df_X.columns = feature_names
df_X.index += 1

In [24]:
df_X

Unnamed: 0,こんな,その,そんな,ちょっと,とう,とても,なく,もう,もっと,やっはり,...,演出,演技,物語,特に,素晴らしい,良かっ,良く,雰囲気,面白い,面白かっ
1,0.245454,0.138538,0.086245,0.000000,0.000000,0.000000,0.000000,0.411729,0.000000,0.083422,...,0.000000,0.348285,0.149244,0.086245,0.000000,0.283371,0.000000,0.086245,0.075963,0.086245
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.279087,0.000000,0.147094,0.139543,0.145128,0.000000,0.130338,0.280888,0.000000,0.000000,...,0.130338,0.000000,0.000000,0.000000,0.000000,0.120825,0.145128,0.000000,0.000000,0.147094
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.266044,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.295286,0.000000,0.000000,0.000000,0.291755,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.193050,0.240361,0.000000,0.237149,0.000000,0.212980,0.229494,0.000000,0.000000,...,0.425961,0.388261,0.000000,0.000000,0.214269,0.197436,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.159020,0.724729,0.000000,0.000000,0.000000,0.147414,0.000000,0.000000,0.000000,0.000000
