In [57]:
import pandas as pd
import numpy as np
from datetime import datetime
import MeCab
import ipadic
import os

In [22]:
class Feature_extractor:
    """Build a tabular feature set from a review CSV.

    The CSV is loaded once in ``__init__``. Each feature method stores one
    column in ``self.features`` (a dict of column name -> values), and
    ``to_frame`` assembles those columns plus one-hot encodings into a
    single DataFrame.

    Token counts are expensive to compute, so they are cached as one
    integer per line in text files inside ``self.save_folder``.
    """

    def __init__(self, data_path):
        """Load the review table.

        Args:
            data_path: Path (or anything ``pandas.read_csv`` accepts) of the
                review CSV.
        """
        self.data = pd.read_csv(data_path)

        self.save_folder = "features"
        self.features = {}

    def date_feature(self):
        """Store, as ``"df"``, the whole days between each review and 2019-12-31 23:59.

        Not called by ``to_frame``; call it beforehand to include the column.
        Raises if a timestamp is missing or not in ``%Y-%m-%d %H:%M`` format.
        """
        dates = self.data["レビュー登録日時"].to_numpy()
        data_format = "%Y-%m-%d %H:%M"
        end_day = datetime.strptime("2019-12-31 23:59", data_format)
        self.features["df"] = [
            (end_day - datetime.strptime(date, data_format)).days for date in dates
        ]

    # NOTE(review): despite their names, the four ``is_*`` flags below are 1
    # when the value is MISSING and 0 when it is present (they use isnull()).
    # The values are kept as-is so existing downstream models are unaffected.

    def is_title(self):
        """Store ``"it"``: 1 where the review title is missing, else 0."""
        self.features["it"] = self.data["レビュータイトル"].isnull().to_numpy().astype(int)

    def is_mokuteki(self):
        """Store ``"im"``: 1 where the 目的 column is missing, else 0."""
        self.features["im"] = self.data["目的"].isnull().to_numpy().astype(int)

    def is_ti(self):
        """Store ``"itsu"``: 1 where the 使い道 column is missing, else 0."""
        self.features["itsu"] = self.data["使い道"].isnull().to_numpy().astype(int)

    def is_hin(self):
        """Store ``"ih"``: 1 where the 頻度 column is missing, else 0."""
        self.features["ih"] = self.data["頻度"].isnull().to_numpy().astype(int)

    @staticmethod
    def _progress(iterable):
        """Wrap ``iterable`` in a tqdm progress bar when tqdm is installed."""
        try:
            from tqdm import tqdm
        except ImportError:
            # A progress bar is cosmetic; fall back to plain iteration.
            return iterable
        return tqdm(iterable)

    def _cached_token_counts(self, column, file_name):
        """Return the MeCab token count of every row of ``column``.

        Counts are read from ``<save_folder>/<file_name>`` when that cache
        exists and has exactly one entry per data row; otherwise they are
        recomputed and the cache is (re)written. Missing texts count as 0.
        """
        cache_path = os.path.join(self.save_folder, file_name)

        if os.path.isfile(cache_path):
            with open(cache_path, "r") as file:
                counts = [int(line) for line in file if line.strip()]
            # A cache from another dataset or an interrupted run has the
            # wrong length; using it would misalign the feature columns.
            if len(counts) == len(self.data):
                return counts

        # Build the tagger only on a cache miss; it is not needed otherwise.
        tagger = MeCab.Tagger("-Owakati")
        counts = []
        for text in self._progress(self.data[column].to_numpy()):
            if pd.isna(text):
                counts.append(0)
            else:
                counts.append(len(tagger.parse(str(text)).split()))

        # Write only after every row succeeded so a failure cannot leave a
        # truncated cache behind.
        os.makedirs(self.save_folder, exist_ok=True)
        with open(cache_path, "w") as file:
            file.writelines(f"{count}\n" for count in counts)
        return counts

    def title_wc(self):
        """Store ``"tc"``: token count of each review title (0 when missing)."""
        self.tc = self._cached_token_counts("レビュータイトル", "title_count.txt")
        self.features["tc"] = self.tc

    def word_count(self):
        """Store ``"wc"``: token count of each review body (0 when missing)."""
        self.wc = self._cached_token_counts("レビュー内容", "word_count.txt")
        self.features["wc"] = self.wc

    def score(self):
        """Store ``"score"``: the raw 評価ポイント column."""
        self.features['score'] = self.data["評価ポイント"].to_numpy()

    def encode_mokuteki(self):
        """Return the one-hot (0/1) encoding of the 目的 column."""
        return pd.get_dummies(self.data["目的"]).astype(int)

    def encode_hindo(self):
        """Return the one-hot (0/1) encoding of the 頻度 column."""
        return pd.get_dummies(self.data["頻度"]).astype(int)

    def encode_tsu(self):
        """Return the one-hot (0/1) encoding of the 使い道 column."""
        return pd.get_dummies(self.data["使い道"]).astype(int)

    def encode_denpou(self):
        """Store ``"denpo"``: how many rows share each row's 店舗名 value."""
        self.features['denpo'] = self.data['店舗名'].map(self.data['店舗名'].value_counts()).to_numpy()

    def encode_goods(self):
        """Store ``"goods"``: how many rows share each row's 商品名 value."""
        self.features['goods'] = self.data['商品名'].map(self.data['商品名'].value_counts()).to_numpy()

    def encode_users(self):
        """Store ``"user"``: how many rows share each row's 投稿者ID value."""
        self.features['user'] = self.data['投稿者ID'].map(self.data['投稿者ID'].value_counts()).to_numpy()

    def encode_genre(self):
        """Store ``"商品ジャンルID"``: how many rows share each row's genre id."""
        self.features['商品ジャンルID'] = self.data['商品ジャンルID'].map(self.data['商品ジャンルID'].value_counts()).to_numpy()

    def get_y(self, cut_off: int):
        """Return a 0/1 array: 1 where 参考になった数 is strictly greater than ``cut_off``."""
        out = self.data["参考になった数"].to_numpy()
        return np.where(out > cut_off, 1, 0)

    def to_frame(self, include_y=False):
        """Compute all features and return them as one DataFrame.

        Args:
            include_y: When true, add a ``"y"`` column from ``get_y(cut_off=1)``.

        Returns:
            DataFrame holding the columns of ``self.features`` followed by the
            one-hot columns for 目的, 頻度 and 使い道 (each exactly once).
        """
        # date_feature() is deliberately not run here; call it first to add "df".
        self.word_count()       # body token count
        self.title_wc()         # title token count
        self.is_title()         # title-missing flag
        self.is_mokuteki()      # 目的-missing flag
        self.is_ti()            # 使い道-missing flag
        self.is_hin()           # 頻度-missing flag
        self.score()            # review score
        self.encode_goods()     # product-name frequency
        self.encode_users()     # reviewer frequency
        self.encode_genre()     # genre frequency
        self.encode_denpou()    # shop frequency
        encoded_hindo = self.encode_hindo()        # usage frequency, one-hot
        encoded_mokuteki = self.encode_mokuteki()  # purpose, one-hot
        encoded_tsu = self.encode_tsu()            # use case, one-hot

        if include_y:
            self.features['y'] = self.get_y(cut_off=1)
        else:
            # Drop a target left over from an earlier include_y=True call.
            self.features.pop('y', None)

        data = pd.DataFrame(self.features)
        data = pd.concat([data, encoded_mokuteki, encoded_hindo, encoded_tsu], axis=1)
        return data

In [23]:
# Load the review CSV and build the feature table, including the binary
# target column "y" (to_frame is called with include_y=True).
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook can be re-run elsewhere.
path = "/home/hc/[NII-IDR] 楽天市場データ/review/sample/sample_from_raw.csv"
extractor = Feature_extractor(path)
X = extractor.to_frame(include_y=True)

In [None]:
def hinshi_dis(review):
    """Placeholder, not implemented yet: ignores ``review`` and returns ``None``.

    NOTE(review): the name suggests a part-of-speech ("hinshi") distribution
    for a review; confirm the intended behavior before implementing.
    """
    return None

In [58]:
# MeCab output-format options producing one tab-separated line per token:
#   surface, reading, base form, part of speech (feature fields 0-3 joined
#   with "-"), conjugation type, conjugation form.
# -F formats ordinary tokens; -U formats unknown words, for which the
# surface form stands in for reading and base form and the last two
# fields are left empty.
CHASEN_ARGS = (
    r' -F "%m\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n"'
    r' -U "%m\t%m\t%m\t%F-[0,1,2,3]\t\t\n"'
)
# Despite its name, this tagger emits the tab-separated format above,
# not space-separated ("wakati") output.
wakati = MeCab.Tagger(ipadic.MECAB_ARGS + CHASEN_ARGS)

In [59]:
# Sanity check: run the tab-separated tagger on one review body and show
# the raw output string.
# `[0]` is a label lookup on the Series; the CSV is read without an index
# column, so label 0 is the first row.
text = extractor.data['レビュー内容'][0]
wakati.parse(text)

'プリンタ\tプリンタ\tプリンタ\t名詞-一般\t\t\nを\tヲ\tを\t助詞-格助詞-一般\t\t\n購入\tコウニュウ\t購入\t名詞-サ変接続\t\t\nし\tシ\tする\t動詞-自立\tサ変・スル\t連用形\nた\tタ\tた\t助動詞\t特殊・タ\t基本形\nの\tノ\tの\t名詞-非自立-一般\t\t\nが\tガ\tが\t助詞-格助詞-一般\t\t\n2\t2\t2\t名詞-数\t\t\n月\tツキ\t月\t名詞-一般\t\t\n。\t。\t。\t記号-句点\t\t\nエプソン\tエプソン\tエプソン\t名詞-固有名詞-組織\t\t\nの\tノ\tの\t助詞-連体化\t\t\nPX\tPX\tPX\t名詞-一般\t\t\n-\t-\t-\t記号-一般\t\t\nM\tM\tM\t名詞-一般\t\t\n5041\t5041\t5041\t名詞-数\t\t\nF\tF\tF\t名詞-一般\t\t\nは\tハ\tは\t助詞-係助詞\t\t\nとにかく\tトニカク\tとにかく\t副詞-一般\t\t\n紙\tカミ\t紙\t名詞-一般\t\t\n詰まり\tツマリ\t詰まり\t名詞-一般\t\t\nも\tモ\tも\t助詞-係助詞\t\t\nインク\tインク\tインク\t名詞-一般\t\t\nの\tノ\tの\t助詞-連体化\t\t\nかすれ\tカスレ\tかすれる\t動詞-自立\t一段\t連用形\nも\tモ\tも\t助詞-係助詞\t\t\nひどく\tヒドク\tひどい\t形容詞-自立\t形容詞・アウオ段\t連用テ接続\nて\tテ\tて\t助詞-接続助詞\t\t\n、\t、\t、\t記号-読点\t\t\n互換\tゴカン\t互換\t名詞-一般\t\t\n性\tセイ\t性\t名詞-接尾-一般\t\t\nインク\tインク\tインク\t名詞-一般\t\t\nも\tモ\tも\t助詞-係助詞\t\t\n悪かっ\tワルカッ\t悪い\t形容詞-自立\t形容詞・アウオ段\t連用タ接続\nた\tタ\tた\t助動詞\t特殊・タ\t基本形\nの\tノ\tの\t名詞-非自立-一般\t\t\nか\tカ\tか\t助詞-副助詞／並立助詞／終助詞\t\t\n、\t、\t、\t記号-読点\t\t\n最後\tサイゴ\t最後\t名詞-一般\t\t\nは\tハ\tは\t助詞-係助詞\t\t\nいくら\tイクラ\tい