In [101]:
# coding: utf-8
"""
英語コーパスを編集するためのコード
"""
from library.filer import Filer
import re
import glob
import random
from stanford_corenlp_pywrapper import CoreNLP
import collections
import itertools

1. Fileのパスを読み込み
2. Edmundsの2009年分のデータセットを全て読み込む(約150車種分)

In [3]:
# glob.glob() returns its matches in arbitrary (filesystem-dependent) order.
# Sort them so the reviews are always concatenated in the same order.
list_path = sorted(glob.glob("../../../大学院/データセット/OpinRankDatasetWithJudgments/cars/data/2009/*"))

# Concatenate the items Filer.readtxt returns for every file into one flat list.
# (Filer is a project helper not shown here; the next cell treats each item as a text line.)
list_all = []
for path in list_path:
    list_all.extend(Filer.readtxt(path))

1. 正規表現で`<TEXT>`の部分だけ読み込む
2. 先頭の`<TEXT>`、文末の`</TEXT>`を削除する
3. txt形式で保存する（約7000ユーザー分）

In [4]:
# Regexes for the opening tag at the start of a line and the closing tag anywhere in it
text_open = r'^<TEXT>'
text_close = r'</TEXT>'

# Keep only the lines that begin with the opening tag
list_all_rev = [line for line in list_all if re.match(text_open, line)]

# Strip both tags from every kept line
list_all_rev1 = [
    re.sub(text_close, "", re.sub(text_open, "", line))
    for line in list_all_rev
]

Filer.writetxt(list_all_rev1, "../../../大学院/データセット/OpinRankDatasetWithJudgments/cars_all_review_2009.txt")

1. リストをシャッフルする
2. 1000ユーザー分だけを別のファイルに保存する

In [16]:
# Shuffle a *copy* with a dedicated, seeded generator: re-running this cell on the
# same list_all_rev1 then writes the same 1000 reviews, and list_all_rev1 itself
# keeps its original order instead of being reshuffled on every run.
SHUFFLE_SEED = 2009  # arbitrary fixed value; change it to draw a different subset
list_shuffled = list(list_all_rev1)
random.Random(SHUFFLE_SEED).shuffle(list_shuffled)

# Slicing (rather than random.sample) tolerates a corpus with fewer than 1000 reviews.
Filer.writetxt(list_shuffled[0:1000], "../../../大学院/データセット/OpinRankDatasetWithJudgments/cars_all_review_2009_random1000.txt")

1. 1000ユーザー分のコーパスを読み込み
2. ピリオド＋空白（`. `）で文に分割
3. 改行コードを削除する
4. tsvファイルにして保存

In [19]:
list_user = Filer.readtxt("../../../大学院/データセット/OpinRankDatasetWithJudgments/cars_all_review_2009_random1000.txt")

# One list of ". "-separated pieces per review
list_user_rev = [review.split(". ") for review in list_user]

# Remove every CR+LF from each piece, then drop the pieces that end up empty
crlf = re.compile(r"\r\n")
list_user_rev1 = []
for pieces in list_user_rev:
    stripped = [crlf.sub("", piece) for piece in pieces]
    list_user_rev1.append([piece for piece in stripped if piece != ""])

Filer.writetsv(list_user_rev1, "../../../大学院/データセット/OpinRankDatasetWithJudgments/cars_all_review_2009_random1000_sep.tsv")

### edmunds 評価用コーパスを抜き出す

1. 4人分のアノテーションを読み込む
2. エラーデータがないか確認
3. カンマで分割
4. 改行コードを削除する
5. tsvファイルにして保存

In [42]:
# Load the annotation CSVs of the four annotators: per the file names, two cover
# the "former" part of the sentences and two cover the "latter" part.
list_csv_f1, list_csv_f2, list_csv_l1, list_csv_l2 = [
    Filer.readcsv(csv_path, option="rU")
    for csv_path in (
        "./files/edmunds/annotation/edmunds_annotation_former_flavia.csv",
        "./files/edmunds/annotation/edmunds_annotation_former_zhang.csv",
        "./files/edmunds/annotation/edmunds_annotation_latter_danilo.csv",
        "./files/edmunds/annotation/edmunds_annotation_latter_toshio.csv",
    )
]

# Verify that no annotation row is damaged (wrong number of columns)
def check(list_csv, num=12):
    """Report the rows of ``list_csv`` whose length differs from ``num``.

    Args:
        list_csv: sequence of rows (each row a sequence of cells).
        num: expected number of cells per row (default 12).

    Returns:
        The list of 0-based indices of the offending rows (empty when every
        row has ``num`` cells). One "Error: <index>" line is also printed per
        offending row.
    """
    bad_rows = [i for i, row in enumerate(list_csv) if len(row) != num]
    for i in bad_rows:
        # Single pre-formatted argument: prints the same text under Python 2 and 3.
        print("Error: %s" % i)
    return bad_rows

# Run the row-length check on each annotator's table
for annotation_rows in (list_csv_f1, list_csv_f2, list_csv_l1, list_csv_l2):
    check(annotation_rows)
        
# Convert the numeric columns to int and the annotation marks to 0/1 flags
def convert(list_csv):
    """Return a converted copy of the annotation rows.

    For every row, columns 0 and 1 are passed through ``int()`` (the
    surrounding code sorts on column 1 as the sentence number), column 2 is
    left as is, and columns 3..11 become 0 when the cell is empty and 1
    otherwise.

    The input rows are not modified. The conversion is idempotent: a flag
    that is already 0 stays 0, so converting already-converted rows returns
    equal rows instead of turning every flag into 1.

    Args:
        list_csv: list of rows with at least 12 cells each.

    Returns:
        A new list of new row lists.

    Raises:
        ValueError: if column 0 or 1 cannot be parsed as an integer.
        IndexError: if a row has fewer than 12 cells.
    """
    converted = []
    for row in list_csv:
        new_row = list(row)
        new_row[0] = int(new_row[0])
        new_row[1] = int(new_row[1])
        for j in range(3, 12):
            # "" is an unmarked cell in the raw CSV; 0 is an already-converted unmarked cell.
            new_row[j] = 0 if new_row[j] in ("", 0) else 1
        converted.append(new_row)
    return converted

# Convert all four annotation tables, rebinding each name to its converted rows
list_csv_f1, list_csv_f2, list_csv_l1, list_csv_l2 = [
    convert(annotation_rows)
    for annotation_rows in (list_csv_f1, list_csv_f2, list_csv_l1, list_csv_l2)
]

# Keep only the rows on which the two annotators agree
def match(list_csv1, list_csv2):
    """Collect the rows that both annotators gave the same single label.

    Rows are paired by position. A pair is considered only when the flags
    from column 3 onward sum to exactly 1 in *both* rows. For such a pair,
    each column 3..10 where both rows hold 1 yields
    ``row1[:3] + [column - 3]`` in the result.

    NOTE(review): the sum covers every column from 3 onward, but the label
    scan stops at column 10, so a pair agreeing on a later column is
    dropped. Confirm that this is intended.

    Args:
        list_csv1: converted rows of the first annotator.
        list_csv2: converted rows of the second annotator.

    Returns:
        List of ``[row1[0], row1[1], row1[2], label_index]`` lists.
    """
    agreed = []
    for first, second in zip(list_csv1, list_csv2):
        # Each annotator must have marked exactly one flag
        if sum(first[3:]) != 1 or sum(second[3:]) != 1:
            continue
        for col in range(3, 11):
            if first[col] == 1 and second[col] == 1:
                agreed.append(first[:3] + [col - 3])
    return agreed

# Agreement rows for each half of the annotation
list_csv_former = match(list_csv_f1, list_csv_f2)
list_csv_latter = match(list_csv_l1, list_csv_l2)

# Order each half by sentence number (column 1)
list_csv_former.sort(key=lambda x: x[1])
list_csv_latter.sort(key=lambda x: x[1])

# Former half first, then latter half, in a single list
list_master = list_csv_former + list_csv_latter

In [52]:
# Save the merged agreement rows. Each row is [column 0, sentence number, column 2, label index];
# a later cell reads this file back and unpacks column 2 as the sentence text.
# Filer.writetsv is a project helper not shown here; presumably one tab-separated line per row (TODO confirm).
Filer.writetsv(list_master, "./files/edmunds/list_sentence_annotation.tsv")

### Edmunds car reviewの語彙数と単語数を確認

In [7]:
list_sentence_sep = Filer.readtsv("./files/edmunds/list_sentence_sep.tsv")

# Everything from column 3 onward in each row is counted as a word
list_word = [word for fields in list_sentence_sep for word in fields[3:]]

print(len(list_sentence_sep))  # number of rows
print(len(list_word))          # number of word tokens
print(len(set(list_word)))     # number of distinct words

3933
18680
1346


### コーパス数を変えたデータセットを作るためのコード

In [5]:
# Collect the corpus files year by year. glob.glob() returns matches in arbitrary
# (filesystem-dependent) order, so each year's matches are sorted to make the
# order of list_all the same on every run.
list_path = []
for year_glob in (
    "files/edmunds/edmunds_corpus_master/2007/*",
    "files/edmunds/edmunds_corpus_master/2008/*",
    "files/edmunds/edmunds_corpus_master/2009/*",
):
    list_path.extend(sorted(glob.glob(year_glob)))

# Concatenate the items Filer.readtxt returns for every file into one flat list
list_all = []
for path in list_path:
    list_all.extend(Filer.readtxt(path))

In [19]:
# Opening tag anchored at the start of the line; closing tag anywhere in it
open_tag = r'^<TEXT>'
close_tag = r'</TEXT>'

list_all_rev = []   # lines that start with the opening tag, unchanged
list_all_rev1 = []  # the same lines with both tags and every CR+LF removed
for line in list_all:
    if not re.match(open_tag, line):
        continue
    list_all_rev.append(line)
    untagged = re.sub(close_tag, "", re.sub(open_tag, "", line))
    list_all_rev1.append(untagged.replace("\r\n", ""))

In [20]:
# Corpus sizes to generate: 1000..9000 in steps of 1000, then 10000..40000 in steps of 10000
list_num = [i * 1000 for i in range(1, 10)] + [i * 10000 for i in range(1, 5)]

# Use a dedicated, seeded generator so that re-running this cell on the same
# list_all_rev1 draws the same reviews for every size.
SAMPLE_SEED = 2009  # arbitrary fixed value; change it to draw different samples
sampler = random.Random(SAMPLE_SEED)

# Each size is sampled independently, so a smaller corpus is not necessarily a
# subset of a larger one. sample() raises ValueError if list_all_rev1 holds
# fewer reviews than the requested size.
dict_sentence = {}
for num in list_num:
    dict_sentence[num] = sampler.sample(list_all_rev1, num)

In [24]:
# One file per corpus size, named after the number of sampled reviews
user_file_template = "files/edmunds/edmunds_corpus_master/userfile/edmunds_user_%s.txt"
for size in list_num:
    Filer.writetxt(dict_sentence[size], user_file_template % size)

In [89]:
# Sentence boundary: ".", "!" or "?" followed by a space (the delimiter is consumed).
# Raw string literal: '\.' and '\?' in a normal string are invalid escape
# sequences, tolerated by Python 2 but warned about by Python 3.
pattern = re.compile(r'\. |! |\? ')

# For every corpus size, split each sampled review into sentences
dict_sentence_rev = {}
for num in list_num:
    dict_sentence_rev[num] = []
    for review in dict_sentence[num]:
        for sentence in pattern.split(review):
            # Drop empty and single-character fragments
            if len(sentence) > 1:
                dict_sentence_rev[num].append(sentence)

In [90]:
# One sentence file per corpus size, named after the number of sampled reviews
sentence_file_template = "files/edmunds/edmunds_corpus_master/sentencefile/edmunds_sentence_%s.txt"
for size in list_num:
    Filer.writetxt(dict_sentence_rev[size], sentence_file_template % size)

### テストデータセットの作成

In [103]:
# Annotated sentences saved earlier in this notebook
list_testfile = Filer.readtsv("./files/edmunds/list_sentence_annotation.tsv")

# Start the CoreNLP wrapper with tokenizer, sentence splitter, POS tagger and lemmatizer.
# NOTE(review): the jar location is an absolute, machine-specific path.
corenlp_config = {'annotators': 'tokenize,ssplit,pos, lemma'}
corenlp_jar_globs = ["/home/ikegami/lib/stanford-corenlp-full-2015-04-20/*"]
proc = CoreNLP(configdict=corenlp_config, corenlp_jars=corenlp_jar_globs)

INFO:CoreNLP_PyWrapper:Starting java subprocess, and waiting for signal it's ready, with command: exec java -Xmx4g -XX:ParallelGCThreads=1 -cp '/home/ikegami/ikegami/lib/python2.7/site-packages/stanford_corenlp_pywrapper/lib/*:/home/ikegami/lib/stanford-corenlp-full-2015-04-20/*'      corenlp.SocketServer --outpipe /tmp/corenlp_pywrap_pipe_pypid=23656_time=1466060720.93  --configdict '{"annotators": "tokenize,ssplit,pos, lemma"}'
INFO:CoreNLP_PyWrapper:Successful ping. The server has started.
INFO:CoreNLP_PyWrapper:Subprocess is ready.


In [105]:
# POS-tag prefixes to keep: NN* (nouns), VB* (verbs), JJ* (adjectives).
# Defined once, outside the loop.
content_pos_prefixes = ("NN", "VB", "JJ")

# Build [lemma list, label] pairs from the annotated sentences
list_sepword = []
for _, _, sentence, label in list_testfile:
    try:
        dict_tmp = proc.parse_doc(sentence)
        list_sepword_tmp = []
        # The ssplit annotator may cut one annotated item into several sentences
        # (or none, for empty text). Walk all of them rather than only the first,
        # so no tokens are silently dropped and an empty parse cannot raise IndexError.
        for parsed in dict_tmp[u'sentences']:
            for lemma, pos in zip(parsed[u'lemmas'], parsed[u'pos']):
                if pos.startswith(content_pos_prefixes):
                    list_sepword_tmp.append(lemma.encode('utf-8'))
        list_sepword.append([list_sepword_tmp, label])
    except UnicodeError:
        # The item is skipped; name it so the gap in the dataset can be traced.
        print("error: %r" % (sentence,))

In [114]:
# Persist the [lemma list, label] pairs held in list_sepword.
# Filer.writedump is a project helper not shown here; presumably a pickle-style dump (TODO confirm).
Filer.writedump(list_sepword, './files/edmunds/edmunds_corpus_master/testfile/list_sepword_label.dump')

### テストデータセットの作成(削除語なし)

In [None]:
# Annotated sentences saved earlier in this notebook
annotation_tsv = "./files/edmunds/list_sentence_annotation.tsv"
list_testfile = Filer.readtsv(annotation_tsv)

# Start the CoreNLP wrapper with tokenizer, sentence splitter, POS tagger and lemmatizer.
# NOTE(review): same setup as the earlier test-dataset cell. If that cell's `proc` is
# still alive, this starts another Java subprocess; confirm whether that is intended.
# NOTE(review): the jar location is an absolute, machine-specific path.
proc = CoreNLP(
    configdict={'annotators': 'tokenize,ssplit,pos, lemma'},
    corenlp_jars=["/home/ikegami/lib/stanford-corenlp-full-2015-04-20/*"],
)

In [None]:
# Build [lemma list, label] pairs keeping every lemma except "," and "."
excluded_lemmas = (',', '.')

list_sepword = []
for _, _, sentence, label in list_testfile:
    try:
        dict_tmp = proc.parse_doc(sentence)
        list_sepword_tmp = []
        # The ssplit annotator may cut one annotated item into several sentences
        # (or none, for empty text). Walk all of them rather than only the first,
        # so no tokens are silently dropped and an empty parse cannot raise IndexError.
        for parsed in dict_tmp[u'sentences']:
            for lemma in parsed[u'lemmas']:
                word = lemma.encode('utf-8')  # encode once, reuse for test and append
                if word not in excluded_lemmas:
                    list_sepword_tmp.append(word)
        list_sepword.append([list_sepword_tmp, label])
    except UnicodeError:
        # The item is skipped; name it so the gap in the dataset can be traced.
        print("error: %r" % (sentence,))

In [None]:
# Persist the unfiltered [lemma list, label] pairs held in list_sepword (the "type3" variant).
# Filer.writedump is a project helper not shown here; presumably a pickle-style dump (TODO confirm).
Filer.writedump(list_sepword, './files/edmunds/edmunds_corpus_master/testfile/list_sepword_label_type3.dump')