In [None]:
import re

%matplotlib inline
import matplotlib
import seaborn as sns; sns.set()

# Path to the MeCab analysis result of neko.txt that every cell below reads.
datapath = "data/neko.txt.mecab"

## 30. 形態素解析結果の読み込み
形態素解析結果（neko.txt.mecab）を読み込むプログラムを実装せよ．ただし，各形態素は表層形（surface），基本形（base），品詞（pos），品詞細分類1（pos1）をキーとするマッピング型に格納し，1文を形態素（マッピング型）のリストとして表現せよ．第4章の残りの問題では，ここで作ったプログラムを活用せよ．

### memo
フォーマット: `表層形\t品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用形,活用型,原形,読み,発音`

In [None]:
# Keys of the mapping that represents one morpheme.
surface, base, pos, pos1 = "表層形", "基本形", "品詞", "品詞細分類"
# One line of output: surface, a tab, then comma-separated features in which
# the base form is the 7th field (pos, pos1 and four skipped fields precede it).
prg = re.compile(r"(?P<sur>.+?)\t(?P<pos>[^,]+),(?P<pos1>[^,]+),([^,]+,){4}(?P<base>[^,]+).*")


def parse_morph_line(line):
    """Parse one line of the analysis result into a morpheme mapping.

    Args:
        line: A single line of the file, with or without its line ending.

    Returns:
        A dict keyed by ``surface``, ``base``, ``pos`` and ``pos1``, or
        ``None`` when the line does not match ``prg`` (for example a line
        that has no tab-separated feature list).
    """
    # Strip the line ending first: ``[^,]+`` also matches "\n", so when the
    # base form is the last field on the line the newline would otherwise be
    # captured as part of the base form.
    res = prg.match(line.rstrip("\r\n"))
    if res is None:
        return None
    return {
        surface: res.group("sur"),
        base: res.group("base"),
        pos: res.group("pos"),
        pos1: res.group("pos1"),
    }


# generator
def _neko_morph_(path=None):
    """Yield one morpheme mapping per parsable line of the analysis result.

    Args:
        path: File to read. When omitted, the module-level ``datapath`` is
            used, which is what the zero-argument callers rely on.

    Yields:
        The dict produced by ``parse_morph_line`` for each line that matches;
        lines that do not match are skipped.
    """
    with open(datapath if path is None else path) as f:
        for line in f:
            morph = parse_morph_line(line)
            if morph is not None:
                yield morph

## 31. 動詞
動詞の表層形をすべて抽出せよ．

In [None]:
# Walk the morpheme stream once and keep the surface form of every verb.
neko_morph = _neko_morph_()
verbs = []
for morph in neko_morph:
    if morph[pos] == "動詞":
        verbs.append(morph[surface])
print(verbs)

## 32. 動詞の原形
動詞の原形をすべて抽出せよ．

In [None]:
# Same filter as the verb extraction, but collect the base form instead of
# the surface form.
neko_morph = _neko_morph_()
verb_morphs = (morph for morph in neko_morph if morph[pos] == "動詞")
original_verbs = list(map(lambda morph: morph[base], verb_morphs))
print(original_verbs)

## 33. サ変名詞
サ変接続の名詞をすべて抽出せよ．

In [None]:
# Keep the surface form of every morpheme whose first sub-category is
# "サ変接続".
neko_morph = _neko_morph_()
sahen_noun = []
for morph in neko_morph:
    if morph[pos1] != "サ変接続":
        continue
    sahen_noun.append(morph[surface])
print(sahen_noun)

## 34. 「AのB」
2つの名詞が「の」で連結されている名詞句を抽出せよ．

### MEMO
名詞->連体化の「の」->名詞　を探す

In [None]:
# dict_keys
surface, base, pos, pos1 = "表層形", "基本形", "品詞", "品詞細分類"
prg = re.compile(r"(?P<sur>.+?)\t(?P<pos>[^,]+),(?P<pos1>[^,]+),([^,]+,){4}(?P<base>[^,]+).*")

# Encode the file as one tag character per line so the phrase pattern can be
# searched with a regex; tags[i] always describes morph_list[i].
#   n = noun, t = pos1 is "連体化", x = anything else
tags = []
morph_list = []
with open(datapath) as f:
    for line in f:
        res = prg.match(line)
        if not res:
            # A line the pattern does not match (no tab-separated features,
            # e.g. a bare sentence terminator) still gets a slot, so that a
            # phrase can never be assembled across it.
            morph_list.append("")
            tags.append("x")
            continue
        morph_list.append(res.group("sur"))
        if res.group("pos") == "名詞":
            tags.append("n")
        elif res.group("pos1") == "連体化":
            tags.append("t")
        else:
            tags.append("x")
compressed = "".join(tags)

# re.finditer only returns non-overlapping matches, so a plain "ntn" search
# would report "AのB" in "AのBのC" but skip "BのC". A zero-width lookahead
# visits every starting position instead.
for match in re.finditer(r"(?=ntn)", compressed):
    start = match.start()
    print(''.join(morph_list[start:start + 3]))
        

## 35. 名詞の連接
名詞の連接（連続して出現する名詞）を最長一致で抽出せよ．

In [None]:
# dict_keys
surface, base, pos, pos1 = "表層形", "基本形", "品詞", "品詞細分類"
prg = re.compile(r"(?P<sur>.+?)\t(?P<pos>[^,]+),(?P<pos1>[^,]+),([^,]+,){4}(?P<base>[^,]+).*")

# Encode the file as one tag character per line; tags[i] always describes
# morph_list[i].
#   n = noun, t = pos1 is "連体化", x = anything else
tags = []
morph_list = []
with open(datapath) as f:
    for line in f:
        res = prg.match(line)
        if not res:
            # A line the pattern does not match (no tab-separated features,
            # e.g. a bare sentence terminator) still gets a non-noun slot.
            # Dropping it would glue a noun that ends one sentence onto a
            # noun that starts the next and report them as a single run.
            morph_list.append("")
            tags.append("x")
            continue
        morph_list.append(res.group("sur"))
        if res.group("pos") == "名詞":
            tags.append("n")
        elif res.group("pos1") == "連体化":
            tags.append("t")
        else:
            tags.append("x")
compressed = "".join(tags)

# "nn+" is greedy, so each match is a maximal run of two or more nouns.
longest, longest_phrase = 0, ""
for match in re.finditer(r"nn+", compressed):
    start, end = match.start(), match.end()
    phrase = '_'.join(morph_list[start:end])
    print(phrase)
    # "<=" keeps the last run among those that tie for the greatest length.
    if longest <= end - start:
        longest = end - start
        longest_phrase = phrase

print("longest: ", longest_phrase)

## 36. 単語の出現頻度
文章中に出現する単語とその出現頻度を求め，出現頻度の高い順に並べよ．

### MEMO
UNIX:  
`cut -f 1 data/neko.txt.mecab | sort | uniq -c | sort -rn -k 1 > word_rank`

In [None]:
import collections

# Count how often each surface form (the text before the first tab) occurs.
words = []
with open(datapath, 'r') as f:
    for line in f:
        # Split on the tab only. A bare str.split() treats every Unicode
        # whitespace character as a separator, so a morpheme whose surface is
        # itself whitespace (e.g. a full-width space) would be replaced by
        # its feature string, and an empty line would raise IndexError.
        sur, tab, _features = line.rstrip("\n").partition("\t")
        if tab:  # lines without a tab (such as "EOS") carry no morpheme
            words.append(sur)

c = collections.Counter(words)

# most_common() yields (word, count) pairs from most to least frequent.
print('\n'.join([' '.join([elm, str(n)]) for elm, n in c.most_common()]))

## 37. 頻度上位10語
出現頻度が高い10語とその出現頻度をグラフ（例えば棒グラフなど）で表示せよ．

In [None]:
import collections
import matplotlib.pyplot as plt

# Count how often each surface form (the text before the first tab) occurs.
c = collections.Counter()
with open(datapath, 'r') as f:
    for line in f:
        # Split on the tab only: a bare str.split() also splits on a
        # full-width space and fails on an empty line.
        sur, tab, _features = line.rstrip("\n").partition("\t")
        if tab:  # lines without a tab (such as "EOS") carry no morpheme
            c[sur] += 1

# Ten most frequent words; fewer if the file has fewer distinct words, hence
# the bar positions are derived from len(xy) rather than a fixed 10.
xy = c.most_common(10)
labels = [word for word, _ in xy]
freqs = [n for _, n in xy]
plt.bar(range(len(xy)), freqs, tick_label=labels)
plt.title("Top 10 words by frequency")
plt.xlabel("word")
plt.ylabel("number of occurrences")
plt.show()

## 38. ヒストグラム
単語の出現頻度のヒストグラム（横軸に出現頻度，縦軸に出現頻度をとる単語の種類数を棒グラフで表したもの）を描け．

In [None]:
import collections
import numpy as np
import matplotlib.pyplot as plt

# Count how often each surface form (the text before the first tab) occurs.
c = collections.Counter()
with open(datapath, 'r') as f:
    for line in f:
        # Split on the tab only: a bare str.split() also splits on a
        # full-width space and fails on an empty line.
        sur, tab, _features = line.rstrip("\n").partition("\t")
        if tab:  # lines without a tab (such as "EOS") carry no morpheme
            c[sur] += 1

counts = list(c.values())
# 12 log-spaced bin edges starting at 1. The upper edge stays at 10**4 unless
# some word is more frequent than that, in which case it grows to the next
# power of ten so that no count falls outside the bins.
top_exp = max(4, int(np.ceil(np.log10(max(counts, default=1)))))
plt.hist(counts, bins=np.logspace(0, top_exp, 12))
plt.gca().set_xscale("log")
plt.gca().set_yscale("log")
plt.title("histogram")
plt.xlabel("frequency(log)")
plt.ylabel("number of words(log)")
plt.show()

## 39. Zipfの法則
単語の出現頻度順位を横軸，その出現頻度を縦軸として，両対数グラフをプロットせよ．

In [None]:
import collections
import numpy as np
import matplotlib.pyplot as plt

# Count how often each surface form (the text before the first tab) occurs.
c = collections.Counter()
with open(datapath, 'r') as f:
    for line in f:
        # Split on the tab only: a bare str.split() also splits on a
        # full-width space and fails on an empty line.
        sur, tab, _features = line.rstrip("\n").partition("\t")
        if tab:  # lines without a tab (such as "EOS") carry no morpheme
            c[sur] += 1

# Frequencies from most to least frequent.
counts = sorted(c.values(), reverse=True)
# Ranks are 1-based: rank 0 has no position on a logarithmic axis, so a
# 0-based range would lose the most frequent word from the plot.
ranks = range(1, len(counts) + 1)
plt.plot(ranks, counts)
plt.gca().set_xscale("log")
plt.gca().set_yscale("log")
plt.title("Zipf's law")
plt.xlabel("ranking(log)")
plt.ylabel("number of occurrences(log)")
plt.show()