In [2]:
!pip install mecab-python3
!pip install unidic-lite
!pip install ipadic

Collecting mecab-python3
  Downloading mecab_python3-1.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading mecab_python3-1.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (581 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.7/581.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mecab-python3
Successfully installed mecab-python3-1.0.9
Collecting unidic-lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658818 sha256=6b9e9014c600bf828a0c09dcff1c0da66e91aebcbee51f86802a425a48d106f2
  Stored in direct

In [4]:
import numpy as np
import pandas as pd
import MeCab
import ipadic
import csv
from tqdm import tqdm
import os
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [12]:
os.listdir("/content/drive/MyDrive/data/")

['unzip', 'sample']

In [16]:
sample_path = "/content/drive/MyDrive/data/sample"
unzip_path = "/content/drive/MyDrive/data/unzip"

### Generate Wakati reviview

In [None]:
wakati = MeCab.Tagger("-Owakati")
data_sub = pd.read_csv(sample_path)['レビュー内容']
text = data_sub.to_numpy()
with open("wakati_review.txt", "w") as fp:
    for line in tqdm(text):
        fp.write(wakati.parse(line))

### Randomly print a review

In [15]:
CHASEN_ARGS = r' -F "%m\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n"'
CHASEN_ARGS += r' -U "%m\t%m\t%m\t%F-[0,1,2,3]\t\t\n"'

wakati = MeCab.Tagger(ipadic.MECAB_ARGS + CHASEN_ARGS)


data = pd.read_csv(sample_path)
data_sub = data['レビュー内容'].to_numpy()
select = np.random.choice(data_sub)
results = wakati.parse(select)
print(results)



IsADirectoryError: [Errno 21] Is a directory: '/content/drive/MyDrive/data/sample'

### Sampler class


In [20]:
class Sampler:

    def __init__(self,path,data_saved, sample_size):
        self.path = path #path to raw
        self.data_saved = data_saved
        if not os.path.isdir(self.data_saved):
            os.mkdir(self.data_saved)
        self.encoded_out_dir = os.path.join(self.data_saved,"file_encoded.txt")
        self.sample_index_outdir = os.path.join(self.data_saved,"file_sample.txt")
        self.sample_csv_out_dir = os.path.join(self.data_saved,"sample_from_raw.csv")
        self.sample_size = sample_size

    def file_encode(self):
        """
        读取所有的评论,将index和参考になった数存下来保存到out_name文件下"""

        self.file_list = sorted(os.listdir(self.path)) #列出所以文件
        with open(self.encoded_out_dir, "w") as file: #file_encoded.txt
            for file_name in tqdm(self.file_list):
                data = pd.read_csv(os.path.join(self.path,file_name),sep='\t',quoting=csv.QUOTE_NONE)
                col = ['投稿者ID', '店舗名', '店舗ID', '商品名', '商品ID', '商品ページURL', '商品ジャンルID', '商品ジャンルIDパス', '使い道',
                        '目的', '頻度', '評価ポイント', 'レビュータイトル', 'レビュー内容', '参考になった数', 'レビュー登録日時']
                data.columns = col
                data.reset_index(inplace=True) #add index column
                data = data[["index","参考になった数"]].to_numpy()
                for line in range(data.shape[0]):
                    content = file_name + " " + " ".join(str(i) for i in data[line,:].tolist())
                    file.write(content + "\n")

    def sample_file(self):
        path_to_encoded = self.encoded_out_dir
        path_to_sample = self.sample_index_outdir
        if not os.path.exists(path_to_encoded):
            self.file_encode()
        else:
            print("encoded exist")
            with open(path_to_encoded,"r") as file:
                encoded = file.readlines()

            sample = np.random.choice(range(len(encoded)),size=self.sample_size, replace=False)
            encoded_sample = [encoded[i] for i in sample] #slicing wit list of index
        with open(path_to_sample,"w") as file:
            for line in encoded_sample:
                file.write(line)

    def entry_from_row(self):
        path_to_sample = self.sample_index_outdir
        if not os.path.exists(path_to_sample):
            self.sample_file()
        else:
            print("sample file exist")
            with open(path_to_sample, "r") as file:
                sample = {}
                for line in file:
                    line = line.replace("\n", "").split(" ")
                    line_index = sample.setdefault(line[0],[]) #key is file name, value is list of index
                    line_index.append(line[1])
            frames = []
            print("start to sampling")
            for key in tqdm(sample.keys()):
                path = os.path.join(self.path,key)
                df = pd.read_csv(path,sep='\t',quoting=csv.QUOTE_NONE)
                col = ['投稿者ID', '店舗名', '店舗ID', '商品名', '商品ID', '商品ページURL', '商品ジャンルID', '商品ジャンルIDパス', '使い道',
                        '目的', '頻度', '評価ポイント', 'レビュータイトル', 'レビュー内容', '参考になった数', 'レビュー登録日時']
                df.columns = col
                df.reset_index(inplace=True)
                idxs = [int(idx) for idx in sample[key]]
                frame = df.iloc[idxs,:].copy()
                frames.append(frame)
            result = pd.concat(frames)
            result.reset_index(inplace=True)
            result.to_csv(self.sample_csv_out_dir)

### Extract Hinshi

In [None]:
def extract_hinshi(text, hinshi:str):
    """
    品词, 抽出所有文本对应的品词 text 为pandas review dataframe to numpy。 返回text中所以对应的hinshi
    """
    wakati = MeCab.Tagger(ipadic.MECAB_ARGS + CHASEN_ARGS)
    kigo = set()
    CHASEN_ARGS = r' -F "%m\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n"'
    CHASEN_ARGS += r' -U "%m\t%m\t%m\t%F-[0,1,2,3]\t\t\n"'

    for line in text:
        for token in wakati_text:
            token = token.split("\t")
            if token[0] != "" and token[0] != "EOS":
                if hinshi in token[3]:
                    kigo.add(token[0])
    return kigo




### main

In [18]:
sampler = Sampler(path=unzip_path, data_saved= sample_path, sample_size=50000)
sampler