<a href="https://colab.research.google.com/github/hashk1/nlp-100-knock-2020-rev2/blob/main/05-%E4%BF%82%E3%82%8A%E5%8F%97%E3%81%91%E8%A7%A3%E6%9E%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第5章: 係り受け解析

In [None]:
# ライブラリ読み込み
import pydot
from IPython.display import Image
import re
import os
# Append /usr/local/lib to LD_LIBRARY_PATH (presumably so that the libraries
# installed by the `make install` steps below are found by the shell commands
# run from this notebook -- confirm).
# Use .get() because the variable may be unset, in which case `+=` would raise
# KeyError; also avoid producing a leading empty path entry.
_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
if _ld_library_path:
    os.environ["LD_LIBRARY_PATH"] = _ld_library_path + ":/usr/local/lib"
else:
    os.environ["LD_LIBRARY_PATH"] = "/usr/local/lib"

In [None]:
# Install MeCab (with its development headers and the UTF-8 IPA dictionary)
# and the IPA Gothic font
! apt install -y mecab libmecab-dev mecab-ipadic-utf8 fonts-ipafont-gothic

In [None]:
# Install CRF++: download the 0.58 tarball, unpack it, then configure, build
# and install it from inside the source directory
! wget -c -O CRF++-0.58.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7QVR6VXJ5dWExSTQ"
! tar zxvf CRF++-0.58.tar.gz
%cd CRF++-0.58
! ./configure && make && make install
%cd ..

In [None]:
# Install CaboCha.
# The first curl call only stores the cookies; the second one reads the
# confirmation token back from the cookie file and downloads the archive.
# CaboCha is then configured for UTF8 against the installed MeCab, built,
# checked and installed.
! curl -sc /tmp/cookie "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7SDd1Q1dUQkZQaUU" > /dev/null && curl -Lb /tmp/cookie "https://drive.google.com/uc?export=download&confirm=$(awk '/_warning_/ {print $NF}' /tmp/cookie)&id=0B4y35FiV1wh7SDd1Q1dUQkZQaUU" -o cabocha-0.69.tar.bz2
! tar jxvf cabocha-0.69.tar.bz2
%cd cabocha-0.69
! ./configure --with-mecab-config=$(which mecab-config) --with-charset=UTF8 && make && make check && make install
%cd ..

In [None]:
# Download the data set (-c resumes a partially downloaded file)
! wget -c https://nlp100.github.io/data/ai.ja.zip

In [None]:
# Dependency parsing: stream the zipped text through CaboCha (output format
# option -f1) and save the result as ai.ja.txt.parsed
! unzip -p ai.ja.zip | cabocha -f1 > ai.ja.txt.parsed

### 40. 係り受け解析結果の読み込み（形態素）

In [None]:
class Morph:
    """A single morpheme of the parsed text.

    Attributes (all copied from the mapping given to the constructor):
        surface: value stored under the "surface" key.
        base: value stored under the "base" key.
        pos: value stored under the "pos" key.
        pos1: value stored under the "pos1" key.
    """

    # Keys read from the input mapping, in the order they are assigned.
    _FIELDS = ("surface", "base", "pos", "pos1")

    def __init__(self, d):
        """Copy the four morpheme fields out of the mapping ``d``.

        Raises:
            KeyError: if ``d`` lacks any of the required keys.
        """
        for field in self._FIELDS:
            setattr(self, field, d[field])

In [None]:
# Problem 40: read the CaboCha output and represent every sentence as a list
# of Morph objects. `d` ends up as a list of sentences.
d = []
# The parser was built with --with-charset=UTF8, so read the file as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open("ai.ja.txt.parsed", encoding="utf-8") as f:
    # Each sentence is terminated by an "EOS" line.
    for lines in f.read().split("EOS\n"):
        morphs = []
        for line in lines.rstrip().split("\n"):
            # Skip blank lines and the chunk header lines starting with "*".
            if line == "" or line[0] == "*":
                continue
            # A morpheme line is "<surface>\t<comma-separated attributes>".
            surface, attr = line.split("\t")
            if surface == "":
                continue
            attr = attr.strip().split(",")
            morphs.append(
                Morph({
                    "surface": surface,
                    "base": attr[6],
                    "pos": attr[0],
                    "pos1": attr[1]
                })
            )
        # Sentences without any morpheme are dropped.
        if morphs:
            d.append(morphs)

# Display the morphemes of the sentence at index 2.
i = 2
[vars(m) for m in d[i]]

### 41. 係り受け解析結果の読み込み（文節・係り受け）

In [None]:
class Chunk:
    """A chunk (bunsetsu): a group of morphemes plus its dependency links.

    Attributes:
        morphs: the morphemes passed to the constructor (stored as given).
        dst: the destination value passed to the constructor.
        srcs: a list that starts empty; each instance gets its own list.
    """

    def __init__(self, morphs, dst):
        """Store ``morphs`` and ``dst`` and start with no source chunks."""
        self.morphs, self.dst, self.srcs = morphs, dst, []

In [None]:
# Problem 41: read the CaboCha output again, this time grouping morphemes into
# Chunk objects. `d` ends up as a list of sentences, each a list of Chunks.
d = []
# The parser was built with --with-charset=UTF8, so read the file as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open("ai.ja.txt.parsed", encoding="utf-8") as f:
    # Each sentence is terminated by an "EOS" line.
    for lines in f.read().split("EOS\n"):
        morphs = []
        chunks = []
        dst = None
        for line in lines.rstrip().split("\n"):
            if line == "":
                continue
            if line[0] == "*":
                # A header line starts a new chunk: flush the morphemes
                # collected for the previous chunk first.
                if morphs:
                    chunks.append(Chunk(morphs, dst))
                    morphs = []
                # The third space-separated field, with "D" removed, is used
                # as the index of the chunk this one depends on.
                dst = int(line.split(" ")[2].replace("D", ""))
                continue
            # A morpheme line is "<surface>\t<comma-separated attributes>".
            surface, attr = line.split("\t")
            if surface == "":
                continue
            attr = attr.strip().split(",")
            morphs.append(
                Morph({
                    "surface": surface,
                    "base": attr[6],
                    "pos": attr[0],
                    "pos1": attr[1]
                })
            )

        # Flush the last chunk of the sentence.
        if morphs:
            chunks.append(Chunk(morphs, dst))

        # Record, on every destination chunk, the indices of its sources.
        for i, chunk in enumerate(chunks):
            if chunk.dst is None or chunk.dst < 0:
                continue
            chunks[chunk.dst].srcs.append(i)
        if chunks:
            d.append(chunks)

### 42. 係り元と係り先の文節の表示

In [None]:
# Problem 42: for every chunk that has a destination, print the text of the
# source chunk and of its destination chunk separated by a tab. Morphemes
# whose part of speech is "記号" (symbol) are left out of the text.
for sentence in d:
    for src_chunk in sentence:
        dst_index = src_chunk.dst
        if dst_index is not None and dst_index >= 0:
            src_text = "".join(
                m.surface for m in src_chunk.morphs if m.pos != "記号"
            )
            dst_text = "".join(
                m.surface for m in sentence[dst_index].morphs if m.pos != "記号"
            )
            print(src_text, dst_text, sep="\t")

### 43. 名詞を含む文節が動詞を含む文節に係るものを抽出

In [None]:
# Problem 43: same output as problem 42, restricted to pairs where the source
# chunk contains a noun ("名詞") and the destination chunk contains a verb
# ("動詞").
for sentence in d:
    for src_chunk in sentence:
        if src_chunk.dst is None or src_chunk.dst < 0:
            continue
        dst_chunk = sentence[src_chunk.dst]
        has_noun = any(m.pos == "名詞" for m in src_chunk.morphs)
        has_verb = any(m.pos == "動詞" for m in dst_chunk.morphs)
        if not (has_noun and has_verb):
            continue
        src_text = "".join(m.surface for m in src_chunk.morphs if m.pos != "記号")
        dst_text = "".join(m.surface for m in dst_chunk.morphs if m.pos != "記号")
        print(src_text, dst_text, sep="\t")

### 44. 係り受け木の可視化

In [None]:
# Problem 44: draw the dependency tree of one sentence with pydot.
n_target = 5  # index of the sentence to visualise

# Collect one [source text, destination text] edge per dependency, with
# symbol ("記号") morphemes removed from the text.
pairs = []
chunks = d[n_target]
for chunk in chunks:
    if chunk.dst is None or chunk.dst < 0:
        continue
    left = "".join([m.surface for m in chunk.morphs if m.pos != "記号"])
    right = "".join([m.surface for m in chunks[chunk.dst].morphs if m.pos != "記号"])
    pairs.append([left, right])

# A dependency points from the source chunk to its destination, so build a
# directed graph; graph_from_edges() produces an undirected one by default.
graph = pydot.graph_from_edges(pairs, directed=True)

# Fix garbled Japanese labels by selecting a Japanese font for every node.
# NOTE(review): graph_from_edges() appears to add only edges, so looping over
# graph.get_nodes() would find no nodes to update; a graph-wide node default
# is used instead -- confirm against the installed pydot version.
graph.set_node_defaults(fontname="IPAGothic")

Image(graph.create_png())

### 45. 動詞の格パターンの抽出

In [None]:
# Problem 45: for every sentence write one line per verb:
#   "<verb base form>\t<space-separated particles of the chunks depending on it>"
# The output contains Japanese text, so the encoding is fixed to UTF-8 instead
# of depending on the platform default.
with open("pattern.txt", "w", encoding="utf-8") as f:
    for chunks in d:
        # verb base form -> particles collected within this sentence
        d1 = {}
        for chunk in chunks:
            if chunk.dst is None or chunk.dst < 0:
                continue
            # Every verb ("動詞") in the destination chunk is a predicate ...
            verbs = [m.base for m in chunks[chunk.dst].morphs if m.pos == "動詞"]
            # ... and every particle ("助詞") in the source chunk is a case.
            particles = [m.surface for m in chunk.morphs if m.pos == "助詞"]
            for verb in verbs:
                # Extend in place instead of rebuilding the list each time.
                d1.setdefault(verb, []).extend(particles)
        for k, v in d1.items():
            f.write("{}\t{}\n".format(k, " ".join(v)))

In [None]:
# Show the pattern lines that contain the verb 作り出す
! cat pattern.txt | grep "作り出す"

In [None]:
# 行う: take the particle column of the matching lines, put one particle per
# line, and count the occurrences, most frequent first
! cat pattern.txt| grep "行う" | cut -d $'\t' -f 2| xargs -n 1 | sort | uniq -c | sort -k1,1nr

In [None]:
# なる: take the particle column of the matching lines, put one particle per
# line, and count the occurrences, most frequent first
! cat pattern.txt| grep "なる" | cut -d $'\t' -f 2| xargs -n 1 | sort | uniq -c | sort -k1,1nr

In [None]:
# 与える: take the particle column of the matching lines, put one particle per
# line, and count the occurrences, most frequent first
! cat pattern.txt| grep "与える" | cut -d $'\t' -f 2| xargs -n 1 | sort | uniq -c | sort -k1,1nr

### 46. 動詞の格フレーム情報の抽出

In [None]:
# Problem 46: like problem 45, but each line also carries the text of the
# chunks the particles came from:
#   "<verb base form>\t<particles, space-separated>\t<chunk texts>"
# The chunk texts are joined with a full-width space.
# The output contains Japanese text, so the encoding is fixed to UTF-8 instead
# of depending on the platform default.
with open("pattern2.txt", "w", encoding="utf-8") as f:
    for chunks in d:
        d1 = {}  # verb base form -> particles
        d2 = {}  # verb base form -> texts of the chunks holding those particles
        for chunk in chunks:
            if chunk.dst is None or chunk.dst < 0:
                continue
            verbs = [m.base for m in chunks[chunk.dst].morphs if m.pos == "動詞"]
            particles = [m.surface for m in chunk.morphs if m.pos == "助詞"]
            # Only chunks that contain a particle contribute their text
            # (symbol morphemes removed).
            words = []
            if particles:
                words.append("".join([m.surface for m in chunk.morphs if m.pos != "記号"]))
            for verb in verbs:
                d1.setdefault(verb, []).extend(particles)
                d2.setdefault(verb, []).extend(words)
        for k, v in d1.items():
            # Verbs without any particle are not written.
            if v:
                f.write("{}\t{}\t{}\n".format(k, " ".join(v), "　".join(d2[k])))

In [None]:
# Show the case-frame lines that contain the verb 作り出す
! cat pattern2.txt | grep "作り出す"

### 47. 機能動詞構文のマイニング

In [None]:
# Problem 47: find chunks containing a "サ変接続" morpheme immediately followed
# by "を" that depend on a chunk containing a verb. The predicate is
# "<noun>を<verb base form>"; its cases are the particles of the OTHER chunks
# depending on the same verb chunk. Output line format:
#   "<predicate>\t<particles, space-separated>\t<chunk texts>"
# The chunk texts are joined with a full-width space.
# The output contains Japanese text, so the encoding is fixed to UTF-8 instead
# of depending on the platform default.
with open("pattern3.txt", "w", encoding="utf-8") as f:
    for chunks in d:
        d1 = {}  # predicate -> particles
        d2 = {}  # predicate -> texts of the chunks holding those particles
        for n, chunk in enumerate(chunks):
            if chunk.dst is None or chunk.dst < 0:
                continue

            # "<noun>を" for every adjacent morpheme pair matching the pattern.
            lefts = [
                "{}{}".format(prev.surface, cur.surface)
                for prev, cur in zip(chunk.morphs, chunk.morphs[1:])
                if prev.pos1 == "サ変接続" and cur.surface == "を"
            ]
            rights = [m.base for m in chunks[chunk.dst].morphs if m.pos == "動詞"]
            # Without both halves no predicate can be formed; skip the rest.
            if not lefts or not rights:
                continue

            particles = []
            words = []
            for src in chunks[chunk.dst].srcs:
                # The chunk providing "<noun>を" is part of the predicate
                # itself, not one of its cases.
                if src == n:
                    continue
                ps = [m.surface for m in chunks[src].morphs if m.pos == "助詞"]
                particles += ps
                if ps:
                    words.append("".join([m.surface for m in chunks[src].morphs if m.pos != "記号"]))

            for left in lefts:
                for right in rights:
                    verb = "{}{}".format(left, right)
                    d1.setdefault(verb, []).extend(particles)
                    d2.setdefault(verb, []).extend(words)

        for k, v in d1.items():
            # Predicates without any particle are not written.
            if v:
                f.write("{}\t{}\t{}\n".format(k, " ".join(v), "　".join(d2[k])))

In [None]:
# Show the lines that contain the predicate 学習を行う
! cat pattern3.txt | grep "学習を行う"

### 48. 名詞から根へのパスの抽出

In [None]:
# Problem 48: for every chunk containing a noun ("名詞"), follow the dependency
# links and write the visited chunk texts joined by " -> ", one path per line.
# The output contains Japanese text, so the encoding is fixed to UTF-8 instead
# of depending on the platform default.
with open("path.txt", "w", encoding="utf-8") as f:
    for chunks in d:
        for i, chunk in enumerate(chunks):
            if not any(m.pos == "名詞" for m in chunk.morphs):
                continue
            words = []
            dst = i
            # A negative destination ends the walk. The None guard mirrors the
            # check used by the other cells.
            while dst is not None and dst >= 0:
                words.append("".join([m.surface for m in chunks[dst].morphs if m.pos != "記号"]))
                # A chunk whose last morpheme is "。" also ends the walk,
                # whatever its own destination is.
                if chunks[dst].morphs[-1].surface == "。":
                    dst = -1
                else:
                    dst = chunks[dst].dst

            # A path consisting of the start chunk alone is not written.
            if len(words) > 1:
                f.write(" -> ".join(words) + "\n")

In [None]:
# Show the paths that contain 作り出した
! cat path.txt | grep "作り出した"

### 49. 名詞間の係り受けパスの抽出

In [None]:
# Problem 49: write the dependency path between every pair of noun chunks
# (i < j) of a sentence. The nouns of chunk i are replaced by "X" and those of
# chunk j by "Y".
#   * If chunk j lies on the path starting at chunk i:
#       "X... -> ... -> Y..."
#   * Otherwise both paths are written up to the chunk where they end:
#       "X... -> ... | Y... -> ... | <text of that chunk>"


def _plain(chunk):
    """Return the chunk text with symbol ("記号") morphemes removed."""
    return "".join([m.surface for m in chunk.morphs if m.pos != "記号"])


def _masked(chunk, label):
    """Return the chunk text with its nouns replaced by a single ``label``.

    Every noun ("名詞") morpheme becomes ``label``; everything from the first
    to the last ``label`` is then collapsed into one occurrence.
    """
    text = "".join(
        [label if m.pos == "名詞" else m.surface for m in chunk.morphs if m.pos != "記号"]
    )
    return re.sub("{0}.*{0}".format(label), label, text)


# The output contains Japanese text, so the encoding is fixed to UTF-8 instead
# of depending on the platform default.
with open("path2.txt", "w", encoding="utf-8") as f:
    for chunks in d:
        # Index paths (as in problem 48) starting at every noun chunk.
        paths = []
        for i, chunk in enumerate(chunks):
            if not any(m.pos == "名詞" for m in chunk.morphs):
                continue
            path = []
            dst = i
            while dst is not None and dst >= 0:
                path.append(dst)
                # A chunk whose last morpheme is "。" ends the walk.
                if chunks[dst].morphs[-1].surface == "。":
                    dst = -1
                else:
                    dst = chunks[dst].dst
            if len(path) > 1:
                paths.append(path)

        if not paths:
            continue
        # The largest chunk index of each path. These are processed as
        # consecutive index windows [n0, n1), so they must be visited in
        # ascending order; iterating a plain set does not guarantee that.
        tails = sorted({max(a) for a in paths})

        n0 = 0
        for tail in tails:
            n1 = tail + 1
            # Noun chunks inside the current window.
            path1 = [
                i for i in range(n0, n1)
                if any(m.pos == "名詞" for m in chunks[i].morphs)
            ]

            pairs = [(i, j) for i in path1 for j in path1 if i < j]
            for i, j in pairs:
                for path in paths:
                    if path[0] == i and j in path[1:-1]:
                        # Chunk j lies on the path from chunk i.
                        path2 = [_masked(chunks[i], "X")]
                        path2 += [_plain(chunks[e]) for e in path[1:-1] if e < j]
                        path2 += [_masked(chunks[j], "Y")]
                        f.write(" -> ".join(path2) + "\n")
                        break
                    elif path[0] == i and path[-1] == tail:
                        # First half of the "X | Y | end" form. There is no
                        # break here: the line is finished by the branch
                        # below when the path starting at j is reached.
                        path2 = [_masked(chunks[i], "X")]
                        path2 += [_plain(chunks[e]) for e in path[1:-1]]
                        f.write(" -> ".join(path2) + " | ")
                    elif path[0] == j and path[-1] == tail:
                        path2 = [_masked(chunks[j], "Y")]
                        path2 += [_plain(chunks[e]) for e in path[1:-1]]
                        f.write(" -> ".join(path2) + " | ")
                        f.write(_plain(chunks[tail]) + "\n")
                        break

            n0 = n1

In [None]:
# Show, with line numbers, the first five lines that contain 作り出した
! cat path2.txt | grep -n "作り出した" | head -n 5

In [None]:
# Print lines 2579 to 2593 of path2.txt
! cat path2.txt | sed -n '2579,2593p'