# 第4章: 形態素解析

In [None]:
# Load libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Font family applied to all figure text.
# NOTE(review): presumably chosen so Japanese tick labels render — confirm the font is installed on the machine running this notebook.
mpl.rcParams['font.family'] = 'AppleGothic'

In [None]:
# Download the data (saved as neko.txt in the current directory)
! wget https://nlp100.github.io/data/neko.txt

In [None]:
# Morphological analysis: run MeCab on neko.txt and save its output to neko.txt.mecab
! mecab neko.txt > neko.txt.mecab

### 30. 形態素解析結果の読み込み

In [None]:
# Parse the MeCab output into `d`: a list of sentences, where each sentence is a
# list of morpheme dicts with the keys "surface", "base", "pos" and "pos1".
# Sentences are delimited by "EOS" lines; sentences without any morpheme are dropped.
d = []
# Explicit encoding so that parsing does not depend on the platform's default locale.
with open("neko.txt.mecab", encoding="utf-8") as f:
    morphs = []
    for raw_line in f:
        line = raw_line.rstrip("\n")
        if line == "EOS":
            # End of sentence: keep it only if it contains at least one morpheme.
            if morphs:
                d.append(morphs)
            morphs = []
            continue
        # Each morpheme line is "<surface>\t<comma-separated features>".
        # partition() never raises, so blank or malformed lines are simply skipped.
        surface, tab, features = line.partition("\t")
        if surface == "" or tab == "":
            continue
        attr = features.strip().split(",")
        # Feature columns 0, 1 and 6 are stored as pos, pos1 and base respectively.
        morphs.append({
            "surface": surface,
            "base": attr[6],
            "pos": attr[0],
            "pos1": attr[1]
        })
    # Keep morphemes that follow the last "EOS" line, if any.
    if morphs:
        d.append(morphs)

# Show only the first few sentences; displaying the whole corpus floods the output.
d[:3]

### 31. 動詞

In [None]:
# Surface form of every verb ("動詞"), in the order the verbs appear in the corpus.
verb_surfaces = [
    morph["surface"]
    for sentence in d
    for morph in sentence
    if morph["pos"] == "動詞"
]

verb_surfaces

### 32. 動詞の基本形

In [None]:
# Base (dictionary) form of every verb ("動詞"), in the order the verbs appear in the corpus.
verb_bases = [
    morph["base"]
    for sentence in d
    for morph in sentence
    if morph["pos"] == "動詞"
]

verb_bases

### 33. 「AのB」

In [None]:
# Collect every "A の B" phrase: two nouns joined by the surface form "の".
noun_of_nouns = []
for sentence in d:
    # Slide a window of three consecutive morphemes over the sentence;
    # sentences shorter than three morphemes yield no window.
    for left, middle, right in zip(sentence, sentence[1:], sentence[2:]):
        is_noun_pair = left["pos"] == "名詞" and right["pos"] == "名詞"
        if is_noun_pair and middle["surface"] == "の":
            phrase = "{}{}{}".format(left["surface"], middle["surface"], right["surface"])
            noun_of_nouns.append(phrase)

noun_of_nouns

### 34. 名詞の連接

In [None]:
# Collect every maximal run of two or more consecutive nouns, concatenated into one string.
noun_articulations = []
for sentence in d:
    run = []
    # The trailing None acts as a non-noun sentinel so that a run reaching the
    # end of the sentence is flushed as well.
    for morph in sentence + [None]:
        if morph is not None and morph["pos"] == "名詞":
            run.append(morph["surface"])
            continue
        if len(run) > 1:
            noun_articulations.append("".join(run))
        # Always reset on a non-noun: a single noun must not leak into the next run.
        run = []

noun_articulations

### 35. 単語の出現頻度

In [None]:
# Tally how many times each surface form occurs across all sentences.
surface_counts = {}
for sentence in d:
    for morph in sentence:
        word = morph["surface"]
        if word in surface_counts:
            surface_counts[word] += 1
        else:
            surface_counts[word] = 1

# (surface, count) pairs, most frequent first; the sort is stable, so words with
# equal counts stay in first-seen order.
wordcounts = sorted(surface_counts.items(), key=lambda pair: pair[1], reverse=True)
wordcounts

### 36. 頻度上位10語

In [None]:
# Bar chart of the ten most frequent words (wordcounts holds (surface, count)
# pairs sorted by descending count).
top_wordcounts = wordcounts[:10]
top_words = [word for word, _ in top_wordcounts]
top_counts = [count for _, count in top_wordcounts]

fig, ax = plt.subplots()
ax.bar(top_words, top_counts)
# Label the figure so it can be read on its own.
ax.set(xlabel="Word", ylabel="Occurrences", title="Top 10 most frequent words")
plt.show()

### 37. 「猫」と共起頻度の高い上位10語

In [None]:
# Count how often each word appears in sentences that contain "猫",
# then plot the ten words with the highest counts.
co_occurrences = {}
for sentence in d:
    # Only sentences containing the target word contribute to the counts.
    if not any(morph["surface"] == "猫" for morph in sentence):
        continue
    for morph in sentence:
        co_occurrences[morph["surface"]] = co_occurrences.get(morph["surface"], 0) + 1
# Drop the target word itself; pop() with a default avoids a KeyError when no
# sentence contains it.
co_occurrences.pop("猫", None)

# Convert to (surface, count) pairs, most frequent first.
co_occurrences = sorted(co_occurrences.items(), key = lambda x:-x[1])

top_co_occurrences = co_occurrences[:10]
co_words = [word for word, _ in top_co_occurrences]
co_counts = [count for _, count in top_co_occurrences]

fig, ax = plt.subplots()
ax.bar(co_words, co_counts)
# Label the figure so it can be read on its own.
ax.set(xlabel="Word", ylabel="Co-occurrences", title="Top 10 words co-occurring with 猫")
plt.show()

### 38. ヒストグラム

In [None]:
# Histogram of word frequencies: horizontal axis is the occurrence count,
# vertical axis is the number of distinct words having that count.
# x: occurrence count of every distinct word (same order as wordcounts).
x = [count for _, count in wordcounts]
# y: 1-based position of each word in wordcounts. The histogram does not use it;
# it stays defined in case later cells read x and y.
y = range(1, len(wordcounts) + 1)

fig, ax = plt.subplots()
# log=True puts the count axis on a log scale.
# NOTE(review): assumes the distribution is heavy-tailed, as word frequencies usually are — switch to linear if not.
ax.hist(x, bins=100, log=True)
ax.set(xlabel="Occurrences of a word", ylabel="Number of distinct words", title="Histogram of word frequencies")
plt.show()

### 39. Zipfの法則

In [None]:
# Zipf's law: log-log plot of frequency rank (horizontal) against occurrence count (vertical).
# Both series are derived from wordcounts here so that the cell does not depend
# on variables left over from an earlier cell.
zipf_frequencies = [count for _, count in wordcounts]
zipf_ranks = range(1, len(zipf_frequencies) + 1)

fig, ax = plt.subplots()
ax.plot(zipf_ranks, zipf_frequencies)
ax.set_xscale("log")
ax.set_yscale("log")
ax.set(xlabel="Frequency rank", ylabel="Occurrences", title="Zipf's law (log-log)")
plt.show()