In [45]:
# coding: utf-8
"""
頻度により除去語を指定しないコーパスを使う
1文ずつ形態素解析したものと、口コミごとに形態素解析したものの2種類を作成
有向グラフ用のエッジリストも作成
"""
import MeCab
import csv
import collections
import pickle
import itertools
from library.filer import Filer
import glob
import random
import re
    
# Arguments handed to MeCab; selects the NEologd dictionary installation.
_MECAB_DICT_ARGS = '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'
# Lazily created tagger shared by parsing() and parsing_rev().
_MECAB_TAGGER = None


def _get_tagger():
    """Return the shared MeCab tagger, creating it on first use.

    Building a tagger per sentence reloads the dictionary every time, which
    dominates the run time when millions of sentences are analysed.
    """
    global _MECAB_TAGGER
    if _MECAB_TAGGER is None:
        _MECAB_TAGGER = MeCab.Tagger(_MECAB_DICT_ARGS)
    return _MECAB_TAGGER


def _collect_words(sentence, keep):
    """Walk the MeCab node list for `sentence` and collect the selected words.

    `keep` receives the comma-split feature list of a node and returns True
    when that node should contribute a word. For a kept node, feature field 6
    is used unless it is "*", in which case the surface string is used
    (field 6 is presumably the base form in the IPADIC layout -- TODO confirm
    for the dictionary in use). Empty strings are never collected.
    """
    words = []
    node = _get_tagger().parseToNode(sentence)
    while node:
        features = node.feature.split(",")
        if keep(features):
            base = features[6]
            word = node.surface if base == "*" else base
            if word != '':
                words.append(word)
        node = node.next
    return words


def _is_content_word(features):
    """True for adjectives and for nouns of the four selected sub-types."""
    if features[0] == "形容詞":
        return True
    return features[0] == "名詞" and features[1] in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]


def _is_not_symbol(features):
    """True for every node whose part of speech is not a symbol (記号)."""
    return features[0] != "記号"


def parsing(sentence):
    """Return the nouns (selected sub-types) and adjectives found in `sentence`.

    Args:
        sentence: text to analyse with MeCab.

    Returns:
        list of words in order of appearance.
    """
    return _collect_words(sentence, _is_content_word)


def parsing_rev(sentence):
    """Return every word of `sentence` except nodes tagged as symbols.

    Args:
        sentence: text to analyse with MeCab.

    Returns:
        list of words in order of appearance.
    """
    return _collect_words(sentence, _is_not_symbol)

# Readers for delimited text files.
def _read_delimited(path, delimiter):
    """Read a delimited text file and return all rows as a list of lists.

    The file is closed as soon as the rows are read. The open mode follows
    what the csv module expects on the running interpreter: binary on
    Python 2, text with newline="" on Python 3 (decoded as UTF-8 there --
    assumes the corpus files are UTF-8, TODO confirm).
    """
    import sys
    if sys.version_info[0] < 3:
        f = open(path, "rb")
    else:
        f = open(path, "r", newline="", encoding="utf-8")
    with f:
        return [row for row in csv.reader(f, delimiter=delimiter)]


def readcsv(path):
    """Return the rows of the comma-separated file at `path`."""
    return _read_delimited(path, ',')


def readtsv(path):
    """Return the rows of the tab-separated file at `path`."""
    return _read_delimited(path, '\t')

def writecsv(arr, path):
    """Append the rows in `arr` to the CSV file at `path`.

    The file is opened in append mode, so calling this twice with the same
    path adds the rows twice. The handle is closed even when writing fails.

    Args:
        arr: iterable of rows, each row an iterable of fields.
        path: destination file; created when missing.
    """
    import sys
    # The csv module wants a binary handle on Python 2 and a text handle
    # opened with newline="" on Python 3.
    if sys.version_info[0] < 3:
        f = open(path, "ab")
    else:
        f = open(path, "a", newline="", encoding="utf-8")
    with f:
        csv.writer(f).writerows(arr)

def writedump(arr, path):
    """Pickle `arr` into the file at `path`, replacing any existing file.

    The file is opened in binary mode, which pickle requires on Python 3 and
    which yields the same bytes as text mode on POSIX under Python 2. The
    handle is closed even when pickling fails.
    """
    with open(path, "wb") as f:
        pickle.dump(arr, f)

### コーパス作成
1. 同じユーザーの口コミを1文にまとめる
2. ユーザーごとに形態素解析したリストを作成
3. 高頻度、低頻度語を削除するために、除去語リストを作成する
4. 除去

In [None]:
list_sentences = readtsv("./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv")

# Concatenate consecutive rows into one string per review; a row whose
# column 5 is empty marks the boundary between two reviews.
list_sentences_rev = []
sentence_tmp = ""
for row in list_sentences:
    if row[5] != "":
        sentence_tmp += row[5]
    else:
        list_sentences_rev.append(sentence_tmp)
        sentence_tmp = ""
# Flush the review that was still open when the file ended.
list_sentences_rev.append(sentence_tmp)

# Morphological analysis: one word list per review.
list_words = [parsing(row) for row in list_sentences_rev]

# Count how often each word occurs over the whole corpus.
word_freq = collections.Counter()
for row in list_words:
    word_freq.update(row)

# Token count and vocabulary size before any word is removed.
print(sum(word_freq.values()))
print(len(word_freq))

list_words_collection = word_freq.items()

# Words seen fewer than 3 times or more than 300 times are removed.
list_words_remove = [word for word, freq in list_words_collection if freq < 3 or freq > 300]
# Set copy for O(1) membership tests; scanning the list per token is quadratic.
set_words_remove = set(list_words_remove)

# Drop the removed words from every review.
list_words_rev = [
    [word for word in row if word not in set_words_remove]
    for row in list_words
]

### コーパスから有向エッジリストを作成し、保存

In [None]:
# Directed edge list: one [word, next_word] pair for every adjacent word pair
# inside each review of list_words.
list_edgelist = [
    [head, tail]
    for row in list_words
    for head, tail in zip(row, row[1:])
]

# Save the edge list (writecsv appends to the target file).
writecsv(list_edgelist, "./files/rakuten_corpus_edgelist_full.csv")

### UM用のコーパスを作成

In [None]:
# Load the annotated corpus TSV as a list of rows.
list_sentences = readtsv("./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv")

In [None]:
# Keep the text in column 5 of every row where that column is not empty.
list_sentences_rev = [row[5] for row in list_sentences if row[5] != '']

In [None]:
# Morphological analysis: one word list per sentence.
list_words = list(map(parsing, list_sentences_rev))

In [None]:
# Write one CSV row of extracted words per sentence.
# NOTE: writecsv opens the file in append mode, so re-running this cell adds the rows again.
writecsv(list_words, "./files/rakuten_corpus/rakuten_corpus_full_for_UM.csv")

### 共起グラフでのエッジリスト作成

In [None]:
# Load the annotated corpus TSV as a list of rows.
list_sentences = readtsv("./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv")

In [None]:
# Keep the text in column 5 of every row where that column is not empty.
list_sentences_rev = [row[5] for row in list_sentences if row[5] != '']

In [None]:
# Morphological analysis: one word list per sentence.
list_words = list(map(parsing, list_sentences_rev))

In [None]:
# Count how often each word occurs over all sentences.
word_freq = collections.Counter()
for row in list_words:
    word_freq.update(row)
list_words_collection = word_freq.items()

# Words seen fewer than 3 times or more than 300 times are removed.
list_words_remove = [word for word, freq in list_words_collection if freq < 3 or freq > 300]
# Set copy for O(1) membership tests; scanning the list per token is quadratic.
set_words_remove = set(list_words_remove)

# Drop the removed words from every sentence.
list_words_rev = [
    [word for word in row if word not in set_words_remove]
    for row in list_words
]

In [None]:
# Co-occurrence edge list: every unordered pair of word positions within a sentence.
list_edges = [
    list(pair)
    for words in list_words_rev
    for pair in itertools.combinations(words, 2)
]

In [None]:
# Append the co-occurrence edges to the CSV file (writecsv opens it in append mode).
writecsv(list_edges, "./files/rakuten_corpus_edgelist_co.csv")

### 共起グラフ+windowでのエッジリスト作成

In [None]:
# Window size (number of neighbouring sentences on each side).
window = 1
list_sentences = readtsv("./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv")

# Group sentences by review: a row whose column 5 is empty closes the
# current review.
list_sentences_rev = []
list_tmp = []
for row in list_sentences:
    if row[5] != '':
        list_tmp.append(row[5])
    else:
        list_sentences_rev.append(list_tmp)
        list_tmp = []
# Flush the review still open at end of file (the loop never breaks, so the
# original for/else clause always ran).
list_sentences_rev.append(list_tmp)

# list_words[review][sentence] -> list of words
list_words = [[parsing(sentence) for sentence in row] for row in list_sentences_rev]

# Count how often each word occurs over the whole corpus.
word_freq = collections.Counter()
for row in list_words:
    for sentence in row:
        word_freq.update(sentence)
list_words_collection = word_freq.items()

# Words seen fewer than 3 times or more than 300 times are removed.
list_words_remove = [word for word, freq in list_words_collection if freq < 3 or freq > 300]
# Set copy for O(1) membership tests; scanning the list per token is quadratic.
set_words_remove = set(list_words_remove)

# Drop the removed words, keeping the review/sentence nesting.
list_words_rev = [
    [[word for word in sentence if word not in set_words_remove] for sentence in row]
    for row in list_words
]

In [None]:
# Flatten each review from a list of sentences into a single list of words.
list_words_rev1 = [list(itertools.chain.from_iterable(row)) for row in list_words_rev]

In [None]:
# Sliding window over the words of each review: every pair of positions
# inside a window of `window` consecutive words becomes an edge.
window = 3
list_edges = []
for row in list_words_rev1:
    # +1 so the final window row[len-window:len] is included; without it the
    # last word of every review never takes part in an edge. Reviews shorter
    # than the window contribute no edges.
    for i in range(len(row) - window + 1):
        list_edges.extend(itertools.combinations(row[i:i+window], 2))

In [None]:
# Append the word-window edges to the CSV file (writecsv opens it in append mode).
writecsv(list_edges, "./files/rakuten_corpus/rakuten_corpus_edgelist_window3.csv")

In [None]:
# Sentence-level window. Set explicitly: the word-window cell above rebinds
# `window` to 3, while this cell produces the "window1" edge list.
window = 1

list_words_rev2 = []
# Rebuild the bags of words: each sentence is merged with the sentences up to
# `window` positions before and after it within the same review.
for row in list_words_rev:
    len_num = len(row)
    for i, sentence in enumerate(row):
        list_tmp = []
        # Clamp the neighbourhood to the review boundaries.
        for num in range(max(i - window, 0), min(i + window + 1, len_num)):
            list_tmp.extend(row[num])
        list_words_rev2.append(list_tmp)

# Co-occurrence edge list: every unordered pair of positions within a bag.
list_edges = [
    list(pair)
    for bag in list_words_rev2
    for pair in itertools.combinations(bag, 2)
]

In [None]:
# Append the sentence-window edges to the CSV file (writecsv opens it in append mode).
writecsv(list_edges, "./files/rakuten_corpus/rakuten_corpus_edgelist_window1.csv")

### 共起グラフ+window（文間の重み付き）でのエッジリスト作成

In [None]:
# Window size (number of neighbouring sentences on each side).
window = 1
# Weight given to a word pair that spans two neighbouring sentences.
weight = 0.2
list_sentences = readtsv("./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv")

# Group sentences by review: a row whose column 5 is empty closes the
# current review.
list_sentences_rev = []
list_tmp = []
for row in list_sentences:
    if row[5] != '':
        list_tmp.append(row[5])
    else:
        list_sentences_rev.append(list_tmp)
        list_tmp = []
# Flush the review still open at end of file (the loop never breaks, so the
# original for/else clause always ran).
list_sentences_rev.append(list_tmp)

# list_words[review][sentence] -> list of words
list_words = [[parsing(sentence) for sentence in row] for row in list_sentences_rev]

# Count how often each word occurs over the whole corpus.
word_freq = collections.Counter()
for row in list_words:
    for sentence in row:
        word_freq.update(sentence)
list_words_collection = word_freq.items()

# Words seen fewer than 3 times or more than 300 times are removed.
list_words_remove = [word for word, freq in list_words_collection if freq < 3 or freq > 300]
# Set copy for O(1) membership tests; scanning the list per token is quadratic.
set_words_remove = set(list_words_remove)

# Drop the removed words, keeping the review/sentence nesting.
list_words_rev = [
    [[word for word in sentence if word not in set_words_remove] for sentence in row]
    for row in list_words
]

# Weighted edges: pairs inside one sentence get weight 1, pairs between a
# sentence and a neighbour inside the window get `weight`.
# NOTE(review): neighbouring sentences i and i+1 are visited from both sides,
# so after the undirected merge below each cross-sentence pair accumulates
# 2 * weight -- confirm this is intended.
list_edges = []
for row in list_words_rev:
    len_num = len(row)
    for i, sentence in enumerate(row):
        for num in range(i - window, i + window + 1):
            if num < 0 or num >= len_num:
                continue
            if num == i:
                list_edges.extend(
                    [a, b, 1] for a, b in itertools.combinations(row[num], 2))
            else:
                list_edges.extend(
                    [a, b, weight] for a, b in itertools.product(row[num], row[i]))

# Merge edges regardless of direction by summing their weights.
dict_edges = {}
for row in list_edges:
    tuple_tmp = tuple(sorted([row[0], row[1]]))
    dict_edges[tuple_tmp] = dict_edges.get(tuple_tmp, 0) + row[2]

list_master = [[key[0], key[1], value] for key, value in dict_edges.items()]

In [None]:
# Append the weighted undirected edges to the CSV file (writecsv opens it in append mode).
writecsv(list_master, "./files/rakuten_corpus/rakuten_corpus_edgelist_window_1_0.2.csv")

### 楽天トラベルのテキストファイルを読み込んで、コーパスサイズごとに分ける

In [None]:
# Read every raw review file. The paths are sorted because glob returns them
# in arbitrary order, which would otherwise change the sampling input.
list_filename = sorted(glob.glob("./files/rakuten_corpus/rakuten_corpus_master/txtfile/*.txt"))
list_master = []
for path in list_filename:
    list_master.extend(Filer.readtxt(path))

# Column 2 of each tab-separated line is kept, and everything from the
# '【ご利用の宿泊プラン】' marker onwards is cut off.
list_master_rev = [row.split('\t')[2] for row in list_master]
list_master_rev2 = [row.split('【ご利用の宿泊プラン】')[0] for row in list_master_rev]

# Sample sizes: 1000..9000, 10000..90000, 100000..900000, 1000000..3000000.
list_num = [i * 1000 for i in range(1, 10)]
list_num.extend([i * 10000 for i in range(1, 10)])
list_num.extend([i * 100000 for i in range(1, 10)])
list_num.extend([i * 1000000 for i in range(1, 4)])

# Local, seeded generator so the drawn samples are reproducible and the
# global random state is left untouched. random.sample raises ValueError
# when a size exceeds the number of available reviews.
rng = random.Random(0)
list_review = [rng.sample(list_master_rev2, num) for num in list_num]

In [None]:
# Write each sample to its own file, named after the number of reviews it holds.
review_path = 'files/rakuten_corpus/rakuten_corpus_master/sentencefile/rakuten_review_%s.txt'
for reviews in list_review:
    Filer.writetxt(reviews, review_path % len(reviews))

### ファイルごとに形態素解析（removeなし）

In [None]:
dict_master = {}
# Match only the sampled review files. A later cell writes
# rakuten_sentence_*.txt into the same directory, and a bare *.txt pattern
# would pick those up as well when the notebook is re-run.
list_filename = glob.glob("./files/rakuten_corpus/rakuten_corpus_master/sentencefile/rakuten_review_*.txt")

In [None]:
# Index the content of each file by the number of lines it holds.
dict_master.update(
    (len(lines), lines) for lines in map(Filer.readtxt, list_filename)
)

In [None]:
# Corpus sizes: 1000..9000, 10000..90000, 100000..900000, 1000000..3000000.
list_num = [
    digit * scale
    for scale, stop in ((1000, 10), (10000, 10), (100000, 10), (1000000, 4))
    for digit in range(1, stop)
]

In [None]:
# Sentence terminators: full-width and ASCII '!' / '?', plus the ideographic full stop.
pattern = re.compile(r'。|！|？|!|\?')
dict_master_rev = {}
for num in list_num:
    # Split every review at the terminators and keep the fragments that are
    # longer than one character and are not a lone (full-width) space.
    dict_master_rev[num] = [
        sentence
        for user in dict_master[num]
        for sentence in pattern.split(user)
        if len(sentence.decode('utf-8')) > 1 and sentence not in ('　', ' ')
    ]

In [None]:
# Write the sentences of each corpus size to their own file.
sentence_path = 'files/rakuten_corpus/rakuten_corpus_master/sentencefile/rakuten_sentence_%s.txt'
for num in list_num:
    Filer.writetxt(dict_master_rev[num], sentence_path % num)

In [None]:
# For each corpus size: analyse every sentence, keep the sentences that yield
# at least one word as a space-separated line, and write them to a file.
dict_words = {}
for num in list_num:
    parsed = (parsing(sentence) for sentence in dict_master_rev[num])
    dict_words[num] = [" ".join(words) for words in parsed if len(words) != 0]
    Filer.writetxt(dict_words[num], "files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_%s.txt" % num)

### 学習コーパス（type1）を作る
* 頻度２以下の単語を削除
* 単語数が１以下の文を削除
* ただしユーザ数が100000以下のファイルのみ（計算が終わらないから）

In [14]:
# Word-separated input files for 1000..9000 (step 1000) and 10000..100000
# (step 10000) reviews.
list_filename = [
    "./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_%d.txt" % size
    for size in list(range(1000, 10000, 1000)) + list(range(10000, 100001, 10000))
]
# Annotated test data loaded through the project's Filer helper.
list_testfile = Filer.readdump('files/rakuten_corpus/annotation/all_sep_full.dump')

In [15]:
for path in list_filename:
    print(path)
    # One sentence per line, words separated by single spaces.
    list_word = [row.split(' ') for row in Filer.readtxt(path)]
    # Add the morphemes of the test file (element 2 of each test row;
    # presumably a list of words -- TODO confirm against the dump).
    for row in list_testfile:
        list_word.append(row[2])

    # Corpus-wide frequency of every word.
    dict_word_freq = collections.Counter()
    for row in list_word:
        dict_word_freq.update(row)

    # Words occurring at most twice are removed.
    list_remove = [word for word, freq in dict_word_freq.items() if freq <= 2]
    # Set copy for O(1) membership tests; scanning the list per token is quadratic.
    set_remove = set(list_remove)

    list_word = [[word for word in row if word not in set_remove] for row in list_word]
    # Keep only sentences that still contain at least two words.
    list_word = [" ".join(row) for row in list_word if len(row) >= 2]
    # The output file is named after the number of remaining sentences.
    Filer.writetxt(list_word, 'files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type1/forUM/preprocessed_%s.txt' % len(list_word))

./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_1000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_2000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_3000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_4000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_5000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_6000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_7000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_8000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_9000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_10000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_20000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_30000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/raku

### 学習コーパス（type1）のPRTM用を作る

In [24]:
# Collect the preprocessed type1 corpus files.
list_filepath = glob.glob('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type1/forUM/*.txt')

In [25]:
# For every preprocessed file, write all within-sentence word pairs as
# "word1 word2" lines.
for path in list_filepath:
    list_sepword = [line.split(' ') for line in Filer.readtxt(path)]
    list_edgelist = [
        ' '.join(pair)
        for words in list_sepword
        for pair in itertools.combinations(words, 2)
    ]
    # Strip the directory and the 'preprocessed' stem, leaving '_<n>.txt'.
    path_rev = path.replace('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type1/forUM/preprocessed', '')
    Filer.writetxt(list_edgelist, './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type1/forPRTM/rakuten_edgelist' + path_rev)

### 学習コーパス（type2）を作る
* 頻度２以下の単語を削除
* 出現頻度6%以上の単語を削除
* 単語数が１以下の文を削除
* ただしユーザ数が100000以下のファイルのみ（計算が終わらないから）

In [26]:
# Word-separated input files for 1000..9000 (step 1000) and 10000..100000
# (step 10000) reviews.
list_filename = [
    "./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_%d.txt" % size
    for size in list(range(1000, 10000, 1000)) + list(range(10000, 100001, 10000))
]
# Annotated test data loaded through the project's Filer helper.
list_testfile = Filer.readdump('files/rakuten_corpus/annotation/all_sep_full.dump')

In [None]:
for path in list_filename:
    print(path)
    # One sentence per line, words separated by single spaces.
    list_word = [row.split(' ') for row in Filer.readtxt(path)]
    # Add the morphemes of the test file (element 2 of each test row;
    # presumably a list of words -- TODO confirm against the dump).
    for row in list_testfile:
        list_word.append(row[2])

    # Corpus-wide frequency and per-sentence (document) frequency of every word.
    dict_word_freq = collections.Counter()
    dict_document_word = collections.Counter()
    for row in list_word:
        dict_word_freq.update(row)
        dict_document_word.update(set(row))
    # Convert document counts into the fraction of sentences containing the word.
    dict_document_word = {word: float(freq)/len(list_word) for word, freq in dict_document_word.items()}

    # Remove words occurring at most twice ...
    list_remove = [word for word, freq in dict_word_freq.items() if freq <= 2]
    # ... and words that appear in 6% or more of the sentences.
    list_remove.extend(word for word, freq in dict_document_word.items() if freq >= 0.06)
    # Set copy for O(1) membership tests; scanning the list per token is quadratic.
    set_remove = set(list_remove)

    list_word = [[word for word in row if word not in set_remove] for row in list_word]
    # Keep only sentences that still contain at least two words.
    list_word = [" ".join(row) for row in list_word if len(row) >= 2]
    # The output file is named after the number of remaining sentences.
    Filer.writetxt(list_word, 'files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_%s.txt' % len(list_word))

./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_1000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_2000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_3000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_4000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_5000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_6000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_7000.txt
./files/rakuten_corpus/rakuten_corpus_master/sepwordfile/rakuten_sep_8000.txt

### 学習コーパス（type2）のPRTM用を作る

In [36]:
# Collect the preprocessed type2 corpus files.
list_filepath = glob.glob('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/*.txt')

In [38]:
# Display the matched file paths.
list_filepath

['./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_34992.txt',
 './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_336795.txt',
 './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_302308.txt',
 './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_31571.txt',
 './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_28738.txt',
 './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_134672.txt',
 './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_24506.txt',
 './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_11384.txt',
 './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed_14724.txt',
 './files/rakute

In [39]:
# For every preprocessed file, write all within-sentence word pairs as
# "word1 word2" lines.
for path in list_filepath:
    list_sepword = [line.split(' ') for line in Filer.readtxt(path)]
    list_edgelist = [
        ' '.join(pair)
        for words in list_sepword
        for pair in itertools.combinations(words, 2)
    ]
    # Strip the directory and the 'rakuten_preprocessed' stem, leaving '_<n>.txt'.
    path_rev = path.replace('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed', '')
    Filer.writetxt(list_edgelist, './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forPRTM/rakuten_edgelist' + path_rev)

### PRTM用のコーパスを作成する(type2, bigram)

In [41]:
# Collect the preprocessed type2 corpus files.
list_filepath = glob.glob('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/*.txt')

In [42]:
# For every preprocessed file, write each pair of adjacent words as a
# "word1 word2" line.
for path in list_filepath:
    list_sepword = [line.split(' ') for line in Filer.readtxt(path)]
    list_edgelist = [
        ' '.join(pair)
        for words in list_sepword
        for pair in zip(words, words[1:])
    ]
    # Strip the directory and the 'rakuten_preprocessed' stem, leaving '_<n>.txt'.
    path_rev = path.replace('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forUM/rakuten_preprocessed', '')
    Filer.writetxt(list_edgelist, './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type2/forPRTM_bigram/rakuten_bigram' + path_rev)

### テスト用コーパスを作る

In [16]:
# Load the annotated test data through the project's Filer helper.
list_testfile = Filer.readdump('files/rakuten_corpus/annotation/all_sep_full.dump')

In [22]:
# Keep elements 2 and 1 of each annotated row, in that order (presumably
# word list and label, going by the output file name -- TODO confirm).
# Built from a fresh read of the dump so that re-running this cell does not
# re-slice rows that were already reduced to two elements.
list_testfile = [
    [row[2], row[1]]
    for row in Filer.readdump('files/rakuten_corpus/annotation/all_sep_full.dump')
]
Filer.writedump(list_testfile, './files/rakuten_corpus/rakuten_corpus_master/testfile/list_sepword_label.dump')

### 学習用コーパス(type3)を作成する

In [43]:
# Sentence files for 1000..9000 (step 1000) and 10000..100000 (step 10000) reviews.
list_filename = [
    "./files/rakuten_corpus/rakuten_corpus_master/sentencefile/rakuten_sentence_%d.txt" % size
    for size in list(range(1000, 10000, 1000)) + list(range(10000, 100001, 10000))
]

In [50]:
# Iterate over the sentence files listed in list_filename (defined in the
# cell above). The loop previously ran over list_filepath, which at this
# point still holds the type2 preprocessed files from an earlier glob.
for path in list_filename:
    list_sentence = Filer.readtxt(path)
    # Analyse each sentence, keeping every non-symbol word.
    list_word = [parsing_rev(sentence) for sentence in list_sentence]
    # Keep only sentences with at least two words, as space-separated lines.
    list_word = [" ".join(row) for row in list_word if len(row) >= 2]
    # The output file is named after the number of remaining sentences.
    Filer.writetxt(list_word, "files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type3/forUM/rakuten_preprocessed_%s.txt" % len(list_word))

### テストコーパス作り(type3)

In [77]:
list_sentencefile = Filer.readtsv('./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv')
list_anno = Filer.readdump('./files/rakuten_corpus/annotation/all_sep.dump')

# Map the integer id in column 0 to the text in column 5, skipping rows
# without an id.
dict_id_sentence = {}
for row in list_sentencefile:
    if row[0] != '':
        dict_id_sentence[int(row[0])] = row[5]

# First two elements of every annotation row: sentence id and label(s).
list_id_labels = [[anno[0], anno[1]] for anno in list_anno]

# Replace each id by its sentence text, then by its analysed word list.
list_sentence_label = [[dict_id_sentence[sid], label] for sid, label in list_id_labels]
list_sepword_label = [[parsing_rev(text), label] for text, label in list_sentence_label]

Filer.writedump(list_sepword_label, './files/rakuten_corpus/rakuten_corpus_master/testfile/list_sepword_label_type3.dump')

### PRTM用のコーパス作り(type3, bigram)

In [109]:
# Collect the preprocessed type3 corpus files.
list_filepath = glob.glob('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type3/forUM/*.txt')

In [110]:
# For every preprocessed file, write each pair of adjacent words as a
# "word1 word2" line.
for path in list_filepath:
    list_sepword = [line.split(' ') for line in Filer.readtxt(path)]
    list_edgelist = [
        ' '.join(pair)
        for words in list_sepword
        for pair in zip(words, words[1:])
    ]
    # Strip the directory and the 'rakuten_preprocessed' stem, leaving '_<n>.txt'.
    path_rev = path.replace('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type3/forUM/rakuten_preprocessed', '')
    Filer.writetxt(list_edgelist, './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type3/forPRTM_bigram/rakuten_bigram' + path_rev)

### PRTM用のコーパス作り(type3, cor)

In [None]:
# Collect the preprocessed type3 corpus files.
list_filepath = glob.glob('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type3/forUM/*.txt')

In [None]:
# For every preprocessed file, write all within-sentence word pairs as
# "word1 word2" lines.
for path in list_filepath:
    list_sepword = [line.split(' ') for line in Filer.readtxt(path)]
    list_edgelist = [
        ' '.join(pair)
        for words in list_sepword
        for pair in itertools.combinations(words, 2)
    ]
    # Strip the directory and the 'rakuten_preprocessed' stem, leaving '_<n>.txt'.
    path_rev = path.replace('./files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type3/forUM/rakuten_preprocessed', '')
    Filer.writetxt(list_edgelist, './files/rakuten_corpus/rakuten_corpus_master/preprocessedfile/type3/forPRTM_cor/rakuten_edgelist' + path_rev)