In [40]:
# coding: utf-8
"""
楽天レビューでトピックを抽出するための学習ファイルの作成
UM用に1文ずつ形態素解析を行っていく
"""
import collections
import csv
import pickle
import sys

import MeCab
import numpy as np

def parsing(sentence):
    mecab = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    res = mecab.parseToNode(sentence)
    list_words = []
    while res:
        features = res.feature.split(",")
        if (features[0] == "名詞" and features[1] in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]) or features[0] == "形容詞":
            if features[6] == "*":
                list_words.append(res.surface)
            else:
                list_words.append(features[6])
        res = res.next
    return list_words

# csvファイルの読み込み
def readcsv(path):
    f = open(path, "rU")
    dataReader = csv.reader(f)
    arr = [row for row in dataReader]
    return arr

def readtsv(path):
    f = open(path, "rb")
    dataReader = csv.reader(f, delimiter='\t')
    arr = [row for row in dataReader]
    return arr

def writecsv(arr, path):
    f = open(path, "ab")
    dataWriter = csv.writer(f)
    dataWriter.writerows(arr)
    f.close()

def writedump(arr, path):
    f = open(path, "w")
    pickle.dump(arr, f)
    f.close()

### コーパス作成
1. 空白行を削除

In [51]:
list_hiro = readcsv("./files/rakuten_corpus/annotation/ヒロ_after.csv")
list_tetsuo = readcsv("./files/rakuten_corpus/annotation/テツオ_after.csv")

# 同じユーザーの口コミを1文にまとめる
list_hiro_rev = []
for row in list_hiro:
    if row[5] != "":
        list_hiro_rev.append(row)
        
list_tetsuo_rev = []
for row in list_tetsuo:
    if row[5] != "":
        list_tetsuo_rev.append(row)

In [52]:
del list_hiro_rev[0]
del list_tetsuo_rev[0]

In [53]:
# Convert the filtered rows to a NumPy array so the following cells can slice
# whole columns.  assumes every row has the same number of cells -- TODO confirm
arr_hiro = np.array(list_hiro_rev)

In [54]:
# Split the sheet by column: 6-12 are the seven label cells, 0 is the number
# and 5 the sentence text (meanings taken from the variable names).
arr_hiro_label = arr_hiro[:,6:13]
arr_hiro_num = arr_hiro[:,0]
arr_hiro_sentence = arr_hiro[:,5]

In [55]:
arr_hiro_label = [[1 if num != "" else 0 for num in row] for row in arr_hiro_label]

In [56]:
list_label = [[], [], [], [], [], [], []]
for i in range(7):
    for num, row, sentence in zip(arr_hiro_num, arr_hiro_label, arr_hiro_sentence):
        if np.sum(row) == 1 and row[i] == 1: 
            list_label[i].append([num, sentence, i])

In [57]:
arr_tetsuo = np.array(list_tetsuo_rev)
arr_tetsuo_label = arr_tetsuo[:,6:13]
arr_tetsuo_num = arr_tetsuo[:,0]
arr_tetsuo_sentence = arr_tetsuo[:,5]
arr_tetsuo_label = [[1 if num != "" else 0 for num in row] for row in arr_tetsuo_label]

In [58]:
for i in range(7):
    for num, row, sentence in zip(arr_tetsuo_num, arr_tetsuo_label, arr_tetsuo_sentence):
        if np.sum(row) == 1 and row[i] == 1: 
            list_label[i].append([num, sentence, i])

In [61]:
list_total = []
for row in list_label:
    list_total.extend(row)

In [62]:
# Save the merged [number, sentence, label index] rows.  writecsv() appends,
# so running this cell again adds the rows to the file a second time.
writecsv(list_total, "./files/rakuten_corpus/annotation/hiro_tetsuo.csv")

In [63]:
# 形態素解析したリストを作成
list_words = [[row[0], parsing(row[1]), row[2]] for row in list_total]

In [64]:
# Pickle the analysed rows so later steps can load them without re-running MeCab.
writedump(list_words, "./files/rakuten_corpus/annotation/hiro_tetsuo_sep.dump")

### 空白でsepしてtxt形式で保存

In [7]:
# NOTE(review): `list_words_rev` is not assigned anywhere in the visible
# notebook, and this cell's execution count (7) is lower than those of the
# cells above -- it looks like it relies on state from an earlier session.
# Confirm where it is defined before running on a fresh kernel.
# writecsv() appends, so re-running adds the rows to the output file again.
writecsv(list_words_rev, "./files/rakuten_corpus/rakuten_corpus_for_UM.txt")