In [1]:
# coding: utf-8
"""
ヒロ，てつお，一平、よっさんのアノテーションを元に評価用のコーパスの作成
"""
import collections
import csv
import pickle
import sys

import MeCab
import numpy as np

# Arguments handed to MeCab.Tagger: selects the dictionary directory.
_MECAB_TAGGER_ARGS = "-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"
# Second feature field values for which a noun ("名詞") is kept.
_KEPT_NOUN_SUBTYPES = ("一般", "固有名詞", "サ変接続", "形容動詞語幹")
# Shared tagger instance; created on first use by _get_tagger().
_tagger = None


def _get_tagger():
    """Return the shared MeCab tagger, constructing it on first use."""
    global _tagger
    if _tagger is None:
        _tagger = MeCab.Tagger(_MECAB_TAGGER_ARGS)
    return _tagger


def parsing(sentence):
    """Tokenise ``sentence`` with MeCab and return its selected words.

    A token is kept when its first feature field is "名詞" (noun) and its
    second field is one of ``_KEPT_NOUN_SUBTYPES``, or when its first field is
    "形容詞" (adjective). For a kept token the seventh feature field is
    returned, unless that field is "*", in which case the surface form is
    returned instead.
    NOTE(review): the seventh field is presumably the base form in the IPADIC
    feature layout -- confirm against the dictionary in use.

    Args:
        sentence: Text to analyse.

    Returns:
        list: The kept words, in the order the tokens appear.
    """
    words = []
    # The tagger is reused across calls instead of being rebuilt per sentence.
    node = _get_tagger().parseToNode(sentence)
    while node:
        features = node.feature.split(",")
        pos = features[0]
        is_kept_noun = pos == "名詞" and features[1] in _KEPT_NOUN_SUBTYPES
        if is_kept_noun or pos == "形容詞":
            seventh_field = features[6]
            words.append(node.surface if seventh_field == "*" else seventh_field)
        node = node.next
    return words

def readcsv(path):
    """Read a comma-separated file and return all of its rows.

    Args:
        path: Path of the CSV file to read.

    Returns:
        list: One list of field strings per record, in file order.
    """
    # On Python 3 the csv module expects a text stream opened with newline="";
    # on Python 2 the universal-newline text mode "rU" is used ("U" is no
    # longer accepted by open() on Python 3.11+).
    # NOTE(review): on Python 3 the platform default encoding is used --
    # confirm it matches the annotation files.
    if sys.version_info[0] >= 3:
        f = open(path, "r", newline="")
    else:
        f = open(path, "rU")
    # The context manager closes the handle even if parsing raises.
    with f:
        return [row for row in csv.reader(f)]

def readtsv(path):
    """Read a tab-separated file and return all of its rows.

    Args:
        path: Path of the TSV file to read.

    Returns:
        list: One list of field strings per record, in file order.
    """
    # On Python 3 the csv module needs a text stream opened with newline="";
    # on Python 2 it needs the binary mode "rb".
    if sys.version_info[0] >= 3:
        f = open(path, "r", newline="")
    else:
        f = open(path, "rb")
    # The context manager closes the handle even if parsing raises.
    with f:
        return [row for row in csv.reader(f, delimiter='\t')]

def writecsv(arr, path):
    """Append rows to a CSV file, creating the file if it does not exist.

    Args:
        arr: Iterable of rows; each row is an iterable of field values.
        path: Path of the CSV file to append to.
    """
    # On Python 3 the csv module needs a text stream opened with newline="";
    # on Python 2 it needs the binary append mode "ab".
    if sys.version_info[0] >= 3:
        f = open(path, "a", newline="")
    else:
        f = open(path, "ab")
    # The context manager flushes and closes the handle even if writing raises.
    with f:
        csv.writer(f).writerows(arr)

def writedump(arr, path):
    """Pickle ``arr`` to ``path``, overwriting any existing file.

    Args:
        arr: Object to serialise.
        path: Destination file path.
    """
    # Pickle data is a byte stream, so the file is opened in binary mode;
    # the context manager closes it even if serialisation raises.
    with open(path, "wb") as f:
        pickle.dump(arr, f)

def readdump(path):
    """Load and return the object pickled in the file at ``path``.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only use this on dump
    # files from a trusted source.
    # Pickle data is a byte stream, so the file is opened in binary mode;
    # the context manager closes it even if loading raises.
    with open(path, "rb") as f:
        return pickle.load(f)

### コーパス作成

In [2]:
# Load the four annotated CSV files, one per annotator.
# The f1/f2 pair is later combined into list_former and the l1/l2 pair into
# list_latter.
list_f1 = readcsv("./files/rakuten_corpus/annotation/ヒロ_after.csv")
list_f2 = readcsv("./files/rakuten_corpus/annotation/一平_after.csv")
list_l1 = readcsv("./files/rakuten_corpus/annotation/テツオ_after.csv")
list_l2 = readcsv("./files/rakuten_corpus/annotation/よっさん_after.csv")

In [3]:
def _drop_blank_rows_and_header(rows, text_col=5):
    """Remove rows without text, then remove the first remaining row.

    Args:
        rows: Rows as returned by ``readcsv`` (lists of field strings).
        text_col: Index of the column that must be non-empty for a row to be
            kept.

    Returns:
        list: The surviving rows with the first one (the header) removed.
    """
    # Rows too short to have ``text_col`` (for example the empty list that
    # csv.reader yields for a blank line) are treated as blank, not indexed.
    kept = [row for row in rows if len(row) > text_col and row[text_col] != ""]
    # Drop the first surviving row, as the original "delete the first row"
    # step did.
    # NOTE(review): this assumes the header row has a non-empty ``text_col``
    # and therefore survives the filter -- confirm against the CSV files.
    return kept[1:]


# Remove blank rows and the header row from every annotator's table.
list_f1_rev = _drop_blank_rows_and_header(list_f1)
list_f2_rev = _drop_blank_rows_and_header(list_f2)
list_l1_rev = _drop_blank_rows_and_header(list_l1)
list_l2_rev = _drop_blank_rows_and_header(list_l2)

In [4]:
# Extract only the annotation part of each table: columns 6 through 12.
LABEL_COLUMNS = slice(6, 13)
arr_f1_label = np.array(list_f1_rev)[:, LABEL_COLUMNS]
arr_f2_label = np.array(list_f2_rev)[:, LABEL_COLUMNS]
arr_l1_label = np.array(list_l1_rev)[:, LABEL_COLUMNS]
arr_l2_label = np.array(list_l2_rev)[:, LABEL_COLUMNS]

In [5]:
def _binarize(label_rows):
    """Map every empty cell to 0 and every non-empty cell to 1."""
    return [[0 if cell == "" else 1 for cell in row] for row in label_rows]


# Convert the annotation cells to integers: blank -> 0, anything else -> 1.
# NOTE: this rebinds the same names, so the cell is not idempotent. Running it
# a second time sees integers (never equal to "") and turns every label into 1.
arr_f1_label = _binarize(arr_f1_label)
arr_f2_label = _binarize(arr_f2_label)
arr_l1_label = _binarize(arr_l1_label)
arr_l2_label = _binarize(arr_l2_label)

In [6]:
n = 7  # number of clusters (label columns)


def _agreed_single_label_rows(rows, labels_a, labels_b, n_classes):
    """Collect the sentences on which two annotators agree.

    A row is kept when each annotator marked exactly one class and both
    marked the same class.

    Args:
        rows: Table rows of the first annotator; column 0 is converted to int
            as the sentence number and column 5 is copied as the text.
        labels_a: 0/1 label rows of the first annotator, aligned with ``rows``.
        labels_b: 0/1 label rows of the second annotator, aligned with ``rows``.
        n_classes: Number of label columns to scan.

    Returns:
        list: ``[sentence_number, class_index, text]`` entries, grouped by
        class index and, within a class, in row order.
    """
    agreed = []
    for i in range(n_classes):
        for row, label_a, label_b in zip(rows, labels_a, labels_b):
            both_single = np.sum(label_a) == 1 and np.sum(label_b) == 1
            # Both annotators must have marked class i.
            if both_single and label_a[i] == 1 and label_b[i] == 1:
                agreed.append([int(row[0]), i, row[5]])
    return agreed


# Keep only the sentences that both annotators of a pair labelled identically.
list_former = _agreed_single_label_rows(list_f1_rev, arr_f1_label, arr_f2_label, n)
list_latter = _agreed_single_label_rows(list_l1_rev, arr_l1_label, arr_l2_label, n)

In [7]:
def _sentence_number(entry):
    """Sort key: the sentence number stored as the first element."""
    return entry[0]


# Order both lists by sentence number.
list_former = sorted(list_former, key=_sentence_number)
list_latter = sorted(list_latter, key=_sentence_number)

In [8]:
# Merge the former and latter halves into one list (former entries first).
list_all = list_former + list_latter

In [9]:
# Load the pickled list of words to remove.
remove_words_dump_path = "./files/rakuten_corpus/list_rakuten_remove_word.dump"
list_remove_words = readdump(remove_words_dump_path)

In [10]:
# Morphological analysis: replace each sentence text by its list of words.
list_all_sep = [
    [sentence_number, label, parsing(text)]
    for sentence_number, label, text in list_all
]

In [11]:
# Drop every word found in the remove-word list and keep only the rows that
# still contain at least one word.
list_all_rev = []
for sentence_number, label, words in list_all_sep:
    kept_words = [w for w in words if w not in list_remove_words]
    if kept_words:
        list_all_rev.append([sentence_number, label, kept_words])

In [13]:
# Persist the filtered corpus as a pickle dump.
corpus_dump_path = "./files/rakuten_corpus/annotation/all_sep_others.dump"
writedump(list_all_rev, corpus_dump_path)