In [24]:
# coding: utf-8
"""
頻度により除去語を指定しないコーパスを使う
1文ずつ形態素解析したものと、口コミごとに形態素解析したものの2種類を作成
有向グラフ用のエッジリストも作成
"""
import MeCab
import csv
import collections
import pickle
import itertools

def parsing(sentence):
    """Run MeCab over `sentence` and return the selected content words.

    A token is kept when it is a noun (名詞) whose sub-category is one of
    一般 / 固有名詞 / サ変接続 / 形容動詞語幹, or when it is an adjective (形容詞).
    For a kept token, feature field index 6 is used when it is present and
    not "*"; otherwise the token's surface form is used.

    Args:
        sentence: Text passed as-is to ``MeCab.Tagger.parseToNode``.

    Returns:
        list: The kept words, in the order they occur in `sentence`.
    """
    # Reuse one Tagger across calls instead of constructing it (and re-reading
    # the dictionary path) for every sentence.
    tagger = getattr(parsing, "_tagger", None)
    if tagger is None:
        tagger = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
        parsing._tagger = tagger

    noun_subtypes = ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]
    list_words = []
    node = tagger.parseToNode(sentence)
    while node:
        features = node.feature.split(",")
        is_target_noun = (
            features[0] == "名詞"
            and len(features) > 1
            and features[1] in noun_subtypes
        )
        if is_target_noun or features[0] == "形容詞":
            # Guard the index: fall back to the surface form when the feature
            # string has fewer than 7 fields or the field is the "*" placeholder.
            if len(features) > 6 and features[6] != "*":
                list_words.append(features[6])
            else:
                list_words.append(node.surface)
        node = node.next
    return list_words

# Read a CSV file
def readcsv(path):
    """Read the comma-separated file at `path` and return every row.

    Args:
        path: Path of the CSV file to read.

    Returns:
        list: One list of field strings per row, in file order.
    """
    import sys

    if sys.version_info[0] < 3:
        # Python 2: the csv module expects a file opened in binary mode.
        handle = open(path, "rb")
    else:
        # Python 3: the csv module expects a text-mode file opened with
        # newline="" (decoded with the platform default encoding).
        handle = open(path, "r", newline="")
    # The context manager closes the file even if parsing raises.
    with handle:
        return [row for row in csv.reader(handle)]

def readtsv(path):
    """Read the tab-separated file at `path` and return every row.

    Args:
        path: Path of the TSV file to read.

    Returns:
        list: One list of field strings per row, in file order.
    """
    import sys

    if sys.version_info[0] < 3:
        # Python 2: the csv module expects a file opened in binary mode.
        handle = open(path, "rb")
    else:
        # Python 3: the csv module expects a text-mode file opened with
        # newline="" (decoded with the platform default encoding).
        handle = open(path, "r", newline="")
    # The context manager closes the file even if parsing raises.
    with handle:
        return [row for row in csv.reader(handle, delimiter='\t')]

def writecsv(arr, path):
    """Append the rows in `arr` to the CSV file at `path`.

    The file is opened in append mode, so existing content is kept and
    calling this twice with the same rows writes them twice.

    Args:
        arr: Iterable of rows; each row is an iterable of field values.
        path: Destination file path (created if it does not exist).
    """
    import sys

    if sys.version_info[0] < 3:
        # Python 2: the csv module expects a file opened in binary mode.
        handle = open(path, "ab")
    else:
        # Python 3: the csv module expects a text-mode file opened with
        # newline="" so it controls the line endings itself.
        handle = open(path, "a", newline="")
    # The context manager closes the file even if writing a row raises.
    with handle:
        csv.writer(handle).writerows(arr)

def writedump(arr, path):
    """Pickle `arr` to the file at `path`, replacing any existing file.

    Args:
        arr: Any picklable object.
        path: Destination file path.
    """
    # Pickle data is bytes: write in binary mode so the stream is not subject
    # to text-mode newline translation and also works on Python 3.
    with open(path, "wb") as f:
        pickle.dump(arr, f)

### コーパス作成
1. 同じユーザーの口コミを1文にまとめる
2. ユーザーごとに形態素解析したリストを作成
3. 高頻度、低頻度語を削除するために、除去語リストを作成する
4. 除去

In [25]:
# Load the annotated corpus (tab-separated).
list_sentences = readtsv("./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv")

# Concatenate column 5 of consecutive rows into one string; a row whose
# column 5 is empty closes the current group.
# presumably each group is one user's review - verify against the corpus layout
list_sentences_rev = []
sentence_tmp = ""
for row in list_sentences:
    if row[5] != "":
        sentence_tmp += row[5]
    else:
        list_sentences_rev.append(sentence_tmp)
        sentence_tmp = ""
# Flush the group that was still open when the file ended.
list_sentences_rev.append(sentence_tmp)

# Morphological analysis: one word list per merged text.
list_words = [parsing(row) for row in list_sentences_rev]

# Flatten every word list so occurrences can be counted.
list_words_collection = []
for row in list_words:
    list_words_collection.extend(row)

# Before any removal: total number of tokens, then number of distinct words.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(list_words_collection))
print(len(set(list_words_collection)))

# (word, count) pairs.
list_words_collection = collections.Counter(list_words_collection).items()

# Words occurring fewer than 3 times or more than 300 times are removed.
list_words_remove = []
for row in list_words_collection:
    if row[1] < 3 or row[1] > 300:
        list_words_remove.append(row[0])

# Set copy for constant-time membership tests in the filter below; the list
# itself is kept under its original name.
set_words_remove = set(list_words_remove)

# Drop the removal words from every word list, keeping order and duplicates.
list_words_rev = []
for row in list_words:
    list_words_tmp = [word for word in row if word not in set_words_remove]
    list_words_rev.append(list_words_tmp)

19401
3780


### コーパスから有向エッジリストを作成し、保存

In [3]:
# Directed edge list: one [word, following_word] pair for each pair of
# adjacent tokens inside every entry of list_words.
list_edgelist = []
for tokens in list_words:
    for head, tail in zip(tokens, tokens[1:]):
        list_edgelist.append([head, tail])

# Write the edge list out as CSV.
writecsv(
    list_edgelist,
    "./files/rakuten_corpus_edgelist_full.csv",
)

### UM用のコーパスを作成

In [6]:
# Load the annotated corpus (tab-separated) as a list of rows.
list_sentences = readtsv(
    "./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv"
)

In [23]:
# Keep column 5 of every row where it is non-empty (drops the blank rows).
list_sentences_rev = [row[5] for row in list_sentences if row[5] != '']

In [25]:
# Morphological analysis: one word list per sentence.
list_words = list(map(parsing, list_sentences_rev))

In [26]:
# Write the per-sentence word lists to the UM corpus CSV file.
writecsv(
    list_words,
    "./files/rakuten_corpus/rakuten_corpus_full_for_UM.csv",
)

### 共起グラフでのエッジリスト作成

In [2]:
# Load the annotated corpus (tab-separated) as a list of rows.
list_sentences = readtsv(
    "./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv"
)

In [3]:
# Keep column 5 of every row where it is non-empty (drops the blank rows).
list_sentences_rev = [row[5] for row in list_sentences if row[5] != '']

In [5]:
# Morphological analysis: one word list per sentence.
list_words = list(map(parsing, list_sentences_rev))

In [6]:
# Count how often each word occurs across all word lists.
list_words_collection = []
for row in list_words:
    list_words_collection.extend(row)
# (word, count) pairs.
list_words_collection = collections.Counter(list_words_collection).items()

# Words occurring fewer than 3 times or more than 300 times are removed.
list_words_remove = []
for row in list_words_collection:
    if row[1] < 3 or row[1] > 300:
        list_words_remove.append(row[0])

# Set copy for constant-time membership tests in the filter below; the list
# itself is kept under its original name.
set_words_remove = set(list_words_remove)

# Drop the removal words from every word list, keeping order and duplicates.
list_words_rev = []
for row in list_words:
    list_words_tmp = [word for word in row if word not in set_words_remove]
    list_words_rev.append(list_words_tmp)

In [14]:
# Co-occurrence edge list: every 2-combination of tokens taken from the same
# filtered word list becomes one [word_a, word_b] edge.
list_edges = []
for tokens in list_words_rev:
    list_edges.extend(list(pair) for pair in itertools.combinations(tokens, 2))

In [17]:
# Write the co-occurrence edge list out as CSV.
writecsv(
    list_edges,
    "./files/rakuten_corpus_edgelist_co.csv",
)

### 共起グラフ+windowでのエッジリスト作成

In [3]:
# Window size
window = 1
list_sentences = readtsv("./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv")

# Build a list of sentence groups: consecutive rows with a non-empty column 5
# form one group, and a row with an empty column 5 closes the group.
list_sentences_rev = []
list_tmp = []
for row in list_sentences:
    if row[5] != '':
        list_tmp.append(row[5])
    else:
        list_sentences_rev.append(list_tmp)
        list_tmp = []
# Flush the group still open at end of file. The loop has no `break`, so this
# always runs; a plain statement says so more clearly than `for ... else`.
list_sentences_rev.append(list_tmp)

# Morphological analysis: group -> sentence -> word list.
list_words = [[parsing(sentence) for sentence in row] for row in list_sentences_rev]

# Count how often each word occurs across all sentences.
list_words_collection = []
for row in list_words:
    for sentence in row:
        list_words_collection.extend(sentence)
# (word, count) pairs.
list_words_collection = collections.Counter(list_words_collection).items()

# Words occurring fewer than 3 times or more than 300 times are removed.
list_words_remove = []
for row in list_words_collection:
    if row[1] < 3 or row[1] > 300:
        list_words_remove.append(row[0])

# Set copy for constant-time membership tests in the filter below; the list
# itself is kept under its original name.
set_words_remove = set(list_words_remove)

# Drop the removal words from every sentence, keeping the nested structure.
list_words_rev = []
for row in list_words:
    list_words_tmp1 = []
    for sentence in row:
        list_words_tmp2 = [word for word in sentence if word not in set_words_remove]
        list_words_tmp1.append(list_words_tmp2)
    list_words_rev.append(list_words_tmp1)

In [10]:
# Flatten each group's per-sentence word lists into one word list per group.
list_words_rev1 = [
    list(itertools.chain.from_iterable(group))
    for group in list_words_rev
]

In [22]:
# Sliding-window co-occurrence over each flattened word list: every
# 2-combination inside a run of `window` consecutive tokens becomes an edge.
# Overlapping windows emit the same pair more than once; duplicates are kept.
window = 3
list_edges = []
for row in list_words_rev1:
    # A list of n tokens has n - window + 1 full windows. The previous bound
    # of n - window skipped the last one, so trailing tokens were never paired
    # and lists of exactly `window` tokens produced no edges. Keeping at least
    # one start position lets shorter lists contribute their pairs as well.
    for i in range(max(len(row) - window + 1, 1)):
        list_edges.extend(itertools.combinations(row[i:i+window], 2))

In [23]:
# Write the sliding-window edge list out as CSV.
writecsv(
    list_edges,
    "./files/rakuten_corpus/rakuten_corpus_edgelist_window3.csv",
)

In [9]:
# The result of this cell is written to the "window1" edge list, so pin the
# window size here instead of inheriting whatever an earlier cell left in
# `window` (the sliding-window cell above sets it to 3).
window = 1

list_words_rev2 = []
# Rebuild the word bags according to the window size: for each sentence,
# concatenate the words of the sentences from `window` before it to `window`
# after it (clipped to the group), in sentence order.
for row in list_words_rev:
    len_num = len(row)
    for i in range(len_num):
        first = max(i - window, 0)
        last = min(i + window + 1, len_num)
        list_tmp = []
        for sentence in row[first:last]:
            list_tmp.extend(sentence)
        list_words_rev2.append(list_tmp)

# Co-occurrence edge list: every 2-combination of words inside a bag.
list_edges = []
for row1 in list_words_rev2:
    list_edges.extend(list(row2) for row2 in itertools.combinations(row1, 2))

183015


In [52]:
# Write the sentence-window edge list out as CSV.
writecsv(
    list_edges,
    "./files/rakuten_corpus/rakuten_corpus_edgelist_window1.csv",
)

### 共起グラフ+window(文間エッジに重み付け)でのエッジリスト作成

In [16]:
# Window size: how many neighbouring sentences on each side are paired with
# the current sentence.
window = 1
# Weight added for a word pair that spans two different sentences
# (pairs inside one sentence add 1).
weight = 0.2
list_sentences = readtsv("./files/rakuten_corpus/annotation01_tsukuba_corpus_20140930.tsv")

# Build a list of sentence groups: consecutive rows with a non-empty column 5
# form one group, and a row with an empty column 5 closes the group.
list_sentences_rev = []
list_tmp = []
for row in list_sentences:
    if row[5] != '':
        list_tmp.append(row[5])
    else:
        list_sentences_rev.append(list_tmp)
        list_tmp = []
# Flush the group still open at end of file. The loop has no `break`, so this
# always runs; a plain statement says so more clearly than `for ... else`.
list_sentences_rev.append(list_tmp)

# Morphological analysis: group -> sentence -> word list.
list_words = [[parsing(sentence) for sentence in row] for row in list_sentences_rev]

# Count how often each word occurs across all sentences.
list_words_collection = []
for row in list_words:
    for sentence in row:
        list_words_collection.extend(sentence)
# (word, count) pairs.
list_words_collection = collections.Counter(list_words_collection).items()

# Words occurring fewer than 3 times or more than 300 times are removed.
list_words_remove = []
for row in list_words_collection:
    if row[1] < 3 or row[1] > 300:
        list_words_remove.append(row[0])

# Set copy for constant-time membership tests in the filter below; the list
# itself is kept under its original name.
set_words_remove = set(list_words_remove)

# Drop the removal words from every sentence, keeping the nested structure.
list_words_rev = []
for row in list_words:
    list_words_tmp1 = []
    for sentence in row:
        list_words_tmp2 = [word for word in sentence if word not in set_words_remove]
        list_words_tmp1.append(list_words_tmp2)
    list_words_rev.append(list_words_tmp1)

# Emit [word_a, word_b, weight] triples sentence by sentence.
list_edges = []
for row in list_words_rev:
    len_num = len(row)
    for i in range(len_num):
        for num in range(i - window, i + window + 1):
            if num < 0 or num >= len_num:
                # Neighbour index falls outside the group.
                continue
            if num == i:
                # Pairs inside the sentence itself: weight 1.
                list_tmp = list(itertools.combinations(row[num], 2))
                list_edges.extend([[tmp[0], tmp[1], 1] for tmp in list_tmp])
            else:
                # Pairs between a neighbouring sentence and this one.
                # NOTE(review): each neighbouring pair of sentences is visited
                # from both sides, so after the undirected merge below a
                # cross-sentence pair accumulates `weight` twice per
                # co-occurrence - confirm this is intended.
                list_tmp = list(itertools.product(row[num], row[i]))
                list_edges.extend([[tmp[0], tmp[1], weight] for tmp in list_tmp])

# Merge into undirected edges: the key is the sorted word pair and the value
# is the sum of all weights emitted for that pair.
dict_edges = {}
for row in list_edges:
    tuple_tmp = tuple(sorted([row[0], row[1]]))
    dict_edges[tuple_tmp] = dict_edges.get(tuple_tmp, 0) + row[2]

# One [word_a, word_b, total_weight] row per undirected edge.
list_master = [[key[0], key[1], value] for key, value in dict_edges.items()]

In [17]:
# Write the weighted, undirected edge list out as CSV.
writecsv(
    list_master,
    "./files/rakuten_corpus/rakuten_corpus_edgelist_window_1_0.2.csv",
)