In [23]:
# coding: utf-8
"""
・cabochaによる構文取得
・頻度を数え上げる
・bag of wordsを作成する
"""
import CaboCha
import xml.etree.ElementTree as ET
from collections import Counter
import csv
import matplotlib.pyplot as plt
%matplotlib inline

# Read a CSV file into memory.
def readcsv(path):
    """Read the CSV file at ``path`` and return its rows.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is the list of
        field strings produced by ``csv.reader``. An empty file yields ``[]``.

    Raises:
        IOError/OSError: if the file cannot be opened.
    """
    import sys

    # The csv module wants a binary-mode file on Python 2 but a text-mode
    # file opened with newline="" on Python 3 (platform default encoding).
    if sys.version_info[0] >= 3:
        f = open(path, newline="")
    else:
        f = open(path, "rb")

    # The context manager closes the handle even if parsing raises; the
    # previous implementation never closed it.
    with f:
        return [row for row in csv.reader(f)]

def parsing(sentence):
    """Parse ``sentence`` with CaboCha and extract linked head-word pairs.

    The sentence is parsed, the result is read back as XML, and for every
    chunk that links to another chunk the head tokens of both chunks are
    paired up. Only noun+adjective, noun+verb and noun+noun pairs whose
    nouns have one of the accepted sub-categories are kept.

    Args:
        sentence: Text of a single sentence, passed straight to
            ``CaboCha.Parser().parse``.

    Returns:
        A tuple ``(list_master_rev, list_bag_of_words)``:

        * ``list_master_rev`` -- list of ``[pair, span]`` entries, where
          ``pair`` is ``"<word>,<word>"`` and ``span`` is the sorted list of
          the two head token ids. Pairs whose span strictly contains the
          span of another pair have been removed.
        * ``list_bag_of_words`` -- the 7th feature field of every token whose
          1st feature field is noun, verb or adjective. Unlike the words in
          ``pair``, these values are not ``.encode("utf-8")``-ed.
    """
    # Instantiate the CaboCha parser
    c = CaboCha.Parser()
    # Receive the dependency-parse result as XML
    xml = c.parse(sentence).toString(CaboCha.FORMAT_XML)
    root = ET.fromstring(xml)
    list_chunk = root.findall(".//chunk")
    
    # Keep the chunks in a dict so they are easy to process later.
    # NOTE(review): keys are enumerate positions, but lookups below use the
    # chunk's "link" attribute -- assumes chunk ids run 0..n-1 in document
    # order; confirm.
    dict_chunk = {}
    # Build a bag of words as well, for a later rank-sum test
    list_bag_of_words = []
    for i, chunk in enumerate(list_chunk):
        # Per-chunk record: chunk attributes plus parallel per-token lists
        # (token id, feature field 0, feature field 1, feature field 6).
        dict_chunk_tok = {"id": int(chunk.attrib["id"]),
                          "link": int(chunk.attrib["link"]),
                          "head": int(chunk.attrib["head"]),
                          "tok_id": [int(tok.attrib["id"]) for tok in chunk.findall(".//tok")],
                          "tok_pos1": [tok.attrib["feature"].split(",")[0].encode("utf-8") for tok in chunk.findall(".//tok")],
                          "tok_pos2": [tok.attrib["feature"].split(",")[1].encode("utf-8") for tok in chunk.findall(".//tok")],
                          "tok_word": [tok.attrib["feature"].split(",")[6].encode("utf-8") for tok in chunk.findall(".//tok")]}
        dict_chunk[i] = dict_chunk_tok
        # Collect feature field 6 of nouns, verbs and adjectives only.
        list_bag_of_words.extend([tok.attrib["feature"].split(",")[6] for tok in chunk.findall(".//tok") if tok.attrib["feature"].split(",")[0].encode("utf-8") in ["名詞", "動詞", "形容詞"]])
    
    # The list that is built up and (after filtering) returned
    list_master = []
    for i in range(len(dict_chunk)):
        # Link to the chunk holding the second word
        link = int(dict_chunk[i]["link"])
        # If there is no link, skip the rest of the processing
        if link == -1:
            continue
        
        # First word: the head token of the current chunk
        head1 = dict_chunk[i]["head"]
        tok_index1 = dict_chunk[i]["tok_id"].index(head1)
        word1 = dict_chunk[i]["tok_word"][tok_index1]
        main_pos1 = dict_chunk[i]["tok_pos1"][tok_index1]
        sub_pos1 = dict_chunk[i]["tok_pos2"][tok_index1]
        
        # Second word: the head token of the linked chunk
        head2 = dict_chunk[link]["head"]
        tok_index2 = dict_chunk[link]["tok_id"].index(head2)
        word2 = dict_chunk[link]["tok_word"][tok_index2]
        main_pos2 = dict_chunk[link]["tok_pos1"][tok_index2]
        sub_pos2 = dict_chunk[link]["tok_pos2"][tok_index2]
        
        # If the selected word is a suffix, switch to the token just before it
        # (only one step back is taken).
        try:
            if sub_pos1 == "接尾":
                head1 = head1 - 1
                tok_index1 = dict_chunk[i]["tok_id"].index(head1)
                word1 = dict_chunk[i]["tok_word"][tok_index1]
                main_pos1 = dict_chunk[i]["tok_pos1"][tok_index1]
                sub_pos1 = dict_chunk[i]["tok_pos2"][tok_index1]
            if sub_pos2 == "接尾":
                head2 = head2 - 1
                tok_index2 = dict_chunk[link]["tok_id"].index(head2)
                word2 = dict_chunk[link]["tok_word"][tok_index2]
                main_pos2 = dict_chunk[link]["tok_pos1"][tok_index2]
                sub_pos2 = dict_chunk[link]["tok_pos2"][tok_index2]
        except ValueError:
            # Occasionally the suffix is the first token of its chunk, so the
            # previous token id is not in that chunk; such pairs are printed
            # and left out of the list
            print word1, word2
            continue
        
        # Keep only noun + verb, noun + adjective and noun + noun pairs
        if set([main_pos1, main_pos2]) == set(["名詞", "形容詞"]):
            # The noun always goes first in the "word,word" string.
            if main_pos1 == "名詞" and sub_pos1 in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]:
                list_master.append([word1 + "," + word2, sorted([head1, head2])])
            elif main_pos2 == "名詞" and sub_pos2 in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]:
                list_master.append([word2 + "," + word1, sorted([head2, head1])])
        
        elif set([main_pos1, main_pos2]) == set(["名詞", "動詞"]):
            if main_pos1 == "名詞" and sub_pos1 in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]:
                list_master.append([word1 + "," + word2, sorted([head1, head2])])
            elif main_pos2 == "名詞" and sub_pos2 in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]:
                list_master.append([word2 + "," + word1, sorted([head2, head1])])
        
        # set(["名詞", "名詞"]) collapses to a one-element set, so this
        # matches only when both words are nouns.
        elif set([main_pos1, main_pos2]) == set(["名詞", "名詞"]):
            if sub_pos1 in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"] and sub_pos2 in ["一般", "固有名詞", "サ変接続", "形容動詞語幹"]:
                list_master.append([word1 + "," + word2, sorted([head1, head2])])
    
    # Remove the pairs that straddle another pair: a pair is dropped when
    # some other pair's token-id range is a strictly smaller subset of its own
    list_master_rev = []
    for row_i in list_master:
        list_i = range(row_i[1][0], row_i[1][1]+1)
        for row_j in list_master:
            list_j = range(row_j[1][0], row_j[1][1]+1)
            if set(list_j).issubset(list_i) and len(list_j) < len(list_i):
                break
        else:
            # No narrower pair was found inside this span, so keep it.
            list_master_rev.append(row_i)
    
    return list_master_rev, list_bag_of_words

In [2]:
import CaboCha

# Alternative construction that passes an (empty) option string:
# c = CaboCha.Parser("");
c = CaboCha.Parser()

# Sample sentence used to inspect what the parser returns.
sentence = "二階の居室だったのですが、部屋の日当たりは良好"

# Print the dependency structure as a text tree.
print c.parseToString(sentence)

tree =  c.parse(sentence)

# Print the same parse as XML: <chunk> elements (id, link, head, ...) that
# contain <tok> elements carrying a comma-separated "feature" attribute.
print tree.toString(CaboCha.FORMAT_XML)

              二階の-D      
  居室だったのですが、-----D
                  部屋の-D |
                日当たりは-D
                        良好
EOS

<sentence>
 <chunk id="0" link="1" rel="D" score="4.751018" head="1" func="2">
  <tok id="0" feature="名詞,数,*,*,*,*,二,ニ,ニ">二</tok>
  <tok id="1" feature="名詞,接尾,助数詞,*,*,*,階,カイ,カイ">階</tok>
  <tok id="2" feature="助詞,連体化,*,*,*,*,の,ノ,ノ">の</tok>
 </chunk>
 <chunk id="1" link="4" rel="D" score="-2.090361" head="6" func="8">
  <tok id="3" feature="名詞,一般,*,*,*,*,居室,キョシツ,キョシツ">居室</tok>
  <tok id="4" feature="助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ">だっ</tok>
  <tok id="5" feature="助動詞,*,*,*,特殊・タ,基本形,た,タ,タ">た</tok>
  <tok id="6" feature="名詞,非自立,一般,*,*,*,の,ノ,ノ">の</tok>
  <tok id="7" feature="助動詞,*,*,*,特殊・デス,基本形,です,デス,デス">です</tok>
  <tok id="8" feature="助詞,接続助詞,*,*,*,*,が,ガ,ガ">が</tok>
  <tok id="9" feature="記号,読点,*,*,*,*,、,、,、">、</tok>
 </chunk>
 <chunk id="2" link="3" rel="D" score="1.654702" head="10" func="11">
  <tok id="10" feature="名詞,一般,*,*,*,*,部屋,ヘヤ,ヘヤ">部屋</tok>
  <tok id="11" fe

In [11]:
"""
助詞、助動詞をchunkから削除
"""

def debug_parsing(sentence):
    """Parse ``sentence`` with CaboCha and return its chunks without function words.

    Tokens whose first feature field is particle, auxiliary verb or symbol
    are left out of each chunk's token lists.

    Args:
        sentence: Text of a single sentence, passed straight to
            ``CaboCha.Parser().parse``.

    Returns:
        A dict mapping the chunk position (0-based, document order) to a dict
        with the keys:

        * ``"id"``, ``"link"``, ``"head"`` -- the chunk attributes as ``int``.
          ``"head"`` is copied unfiltered, so it may refer to a token that
          the filter removed.
        * ``"tok_id"`` -- ids of the kept tokens as ``int`` (the same type as
          ``"head"``, so ``tok_id.index(head)`` can match).
        * ``"tok_pos1"``, ``"tok_pos2"``, ``"tok_word"`` -- feature fields
          0, 1 and 6 of the kept tokens, UTF-8 encoded.
    """
    # Values of feature field 0 that are filtered out of every chunk.
    excluded_pos = ["助詞", "助動詞", "記号"]

    # Instantiate the CaboCha parser and receive the parse result as XML
    c = CaboCha.Parser()
    xml = c.parse(sentence).toString(CaboCha.FORMAT_XML)
    root = ET.fromstring(xml)

    # Keep the chunks in a dict so they are easy to process later
    dict_chunk = {}
    for i, chunk in enumerate(root.findall(".//chunk")):
        dict_chunk_tok = {"id": int(chunk.attrib["id"]),
                          "link": int(chunk.attrib["link"]),
                          "head": int(chunk.attrib["head"]),
                          "tok_id": [],
                          "tok_pos1": [],
                          "tok_pos2": [],
                          "tok_word": []}
        for tok in chunk.findall(".//tok"):
            features = tok.attrib["feature"].encode("utf-8").split(",")
            if features[0] in excluded_pos:
                continue
            # Store the id as int: it was previously kept as a string, which
            # could never compare equal to the integer "head" value.
            dict_chunk_tok["tok_id"].append(int(tok.attrib["id"]))
            dict_chunk_tok["tok_pos1"].append(features[0])
            dict_chunk_tok["tok_pos2"].append(features[1])
            dict_chunk_tok["tok_word"].append(features[6])

        dict_chunk[i] = dict_chunk_tok

    return dict_chunk

In [13]:
# Load the texts from the CSV file (second column of every row) and split
# each one into sentences on the Japanese full stop "。"
list_sentences = [row[1] for row in readcsv("./files/file_1.csv")]
list_sentences_rev = []
for row in list_sentences:
    # Pieces are appended as-is, including any empty string that follows a
    # trailing "。".
    list_sentence = row.split("。")
    list_sentences_rev += list_sentence

In [22]:
# Parse one sample sentence (index 8 of the split sentences) and dump the
# tokens that remain in each chunk after filtering.
dict_sentence = debug_parsing(list_sentences_rev[8])
for i in range(len(dict_sentence)):
    dict_tok = dict_sentence[i]
    print "=====chunk:" + str(i) + "====="
    # One line per kept token: feature field 6, field 0 and field 1.
    for j in range(len(dict_tok["tok_id"])):
        print dict_tok["tok_word"][j], dict_tok["tok_pos1"][j], dict_tok["tok_pos2"][j]

=====chunk: 0=====
日々 名詞 副詞可能
退屈 名詞 サ変接続
=====chunk: 1=====
なる 動詞 自立
よう 名詞 非自立
=====chunk: 2=====
趣向 名詞 一般
=====chunk: 3=====
凝らす 動詞 自立
れる 動詞 接尾
=====chunk: 4=====
* 名詞 一般
=====chunk: 5=====
ある 動詞 自立
