학습이 끝난 HMM 의 parameters 인 emission prob. 와 transition prob. 의 log 값이 주어졌다고 가정합니다.

In [18]:
from pprint import pprint

emission = {
    'Adjective': {'이': 0.1, '짧': 0.1},
    'Eomi': {'다': 0.1, '았다': 0.1, '었다': 0.1},
    'Josa': {'는': 0.15, '다': 0.05, '도': 0.1, '은': 0.2, '이': 0.1},
    'Noun': {'시작': 0.1, '예시': 0.1, '이': 0.15, '이것': 0.1},
    'Verb': {'입': 0.1, '하': 0.1}
}

transition = {
    ('Adjective', 'Noun'): 0.1,
    ('Josa', 'Adjective'): 0.1,
    ('Josa', 'Noun'): 0.2,
    ('Josa', 'Verb'): 0.1,
    ('Noun', 'Adjective'): 0.05,
    ('Noun', 'Josa'): 0.1,
    ('Noun', 'Noun'): 0.1,
    ('Noun', 'Verb'): 0.05,
    ('Verb', 'Noun'): 0.1
}

begin = {
    'Noun': 0.2,
    'Verb': 0.1,
    'Adjective': 0.1
}

_max_word_len = max(len(w) for words in emission.values() for w in words)
_min_emission = min(s for words in emission.values() for s in words.values()) - 0.05
_min_transition = min(transition.values()) - 0.05

print(_max_word_len) # 2
print(_min_emission) # 0.0
print(_min_transition) # 0.0

2
0.0
0.0


In [5]:
class TrainedHMMTagger:
    def __init__(self, transition, emission, begin,
        begin_state='BOS', end_state='EOS', unk_state='Unk'):

        self.transition = transition
        self.emission = emission
        self.begin = begin

    def tag(self, sentence):
        raise NotImplemented

## Lookup

In [18]:
def sentence_lookup(sentence):
    sent = []
    for eojeol in sentence.split():
        sent += eojeol_lookup(eojeol, len(sent))
    return sent

In [19]:
def eojeol_lookup(eojeol, offset=0):
    n = len(eojeol)
    pos = [[] for _ in range(n)]
    for b in range(n):
        for r in range(1, _max_word_len+1):
            e = b + r
            if e > n:
                continue
            surface = eojeol[b:e]
            for tag in get_pos(surface):
                pos[b].append((surface, tag, tag, b+offset, e+offset))
    return pos

def get_pos(sub):
    tags = []
    for tag, words in emission.items():
        if sub in words:
            tags.append(tag)
    return tags

In [20]:
eojeol_lookup("예시이다")

[[('예시', 'Noun', 'Noun', 0, 2)],
 [],
 [('이', 'Adjective', 'Adjective', 2, 3),
  ('이', 'Josa', 'Josa', 2, 3),
  ('이', 'Noun', 'Noun', 2, 3)],
 [('다', 'Eomi', 'Eomi', 3, 4), ('다', 'Josa', 'Josa', 3, 4)]]

In [21]:
sentence_lookup("이것은 예시이다")

[[('이', 'Adjective', 'Adjective', 0, 1),
  ('이', 'Josa', 'Josa', 0, 1),
  ('이', 'Noun', 'Noun', 0, 1),
  ('이것', 'Noun', 'Noun', 0, 2)],
 [],
 [('은', 'Josa', 'Josa', 2, 3)],
 [('예시', 'Noun', 'Noun', 3, 5)],
 [],
 [('이', 'Adjective', 'Adjective', 5, 6),
  ('이', 'Josa', 'Josa', 5, 6),
  ('이', 'Noun', 'Noun', 5, 6)],
 [('다', 'Eomi', 'Eomi', 6, 7), ('다', 'Josa', 'Josa', 6, 7)]]

어간과 어미 분리를 해야한다...

In [15]:
from soynlp.lemmatizer import lemma_candidate

In [16]:
def lemmatize(word, i):
    l = word[:i]
    r = word[i:]
    lemmas = []
    len_word = len(word)
    for l_, r_ in lemma_candidate(l, r):
        word_ = l_ + ' + ' + r_
        if (l_ in emission['Verb']) and (r_ in emission['Eomi']):
            lemmas.append((word_, 'Verb', 'Eomi'))
        if (l_ in emission['Adjective']) and (r_ in emission['Eomi']):
            lemmas.append((word_, 'Adjective', 'Eomi'))
    return lemmas

In [49]:
sentence_lookup("이것은 예시였다")

[[('이', 'Adjective', 'Adjective', 0, 1),
  ('이', 'Josa', 'Josa', 0, 1),
  ('이', 'Noun', 'Noun', 0, 1),
  ('이것', 'Noun', 'Noun', 0, 2)],
 [],
 [('은', 'Josa', 'Josa', 2, 3)],
 [('예시', 'Noun', 'Noun', 3, 5)],
 [],
 [],
 [('다', 'Eomi', 'Eomi', 6, 7), ('다', 'Josa', 'Josa', 6, 7)]]

In [20]:
lemmatize('였다',1)

[('이 + 었다', 'Adjective', 'Eomi')]

In [22]:
lemmatize('있다',2)

[]

In [104]:
def eojeol_lookup(eojeol, offset=0):
    n = len(eojeol)
    pos = [[] for _ in range(n)]
    for b in range(n):
        for r in range(1, _max_word_len+1):
            e = b + r
            if e > n:
                continue
            surface = eojeol[b:e]
            for tag in get_pos(surface):
                pos[b].append((surface, tag, tag, b+offset, e+offset))
            # 어절 단위로 쪼개는 과정
            for i in range(1, e-b):
                try:
                    lemmas = lemmatize(surface, i)
                    if lemmas:
                        pos[b].extend([lemma + (b+offset, e+offset) for lemma in lemmas])
                except TypeError: # lemmatize 할 수 없는 경우는 넘어감
#                     print(surface, i)
                    continue
    return pos

In [62]:
sentence_lookup("이것은 예시였다")

[[('이', 'Adjective', 'Adjective', 0, 1),
  ('이', 'Josa', 'Josa', 0, 1),
  ('이', 'Noun', 'Noun', 0, 1),
  ('이것', 'Noun', 'Noun', 0, 2)],
 [],
 [('은', 'Josa', 'Josa', 2, 3)],
 [('예시', 'Noun', 'Noun', 3, 5)],
 [],
 [('이 + 었다', 'Adjective', 'Eomi', 5, 7)],
 [('다', 'Eomi', 'Eomi', 6, 7), ('다', 'Josa', 'Josa', 6, 7)]]

## Generate (word, tag) graph

앞의 단어의 end index와 뒤 단어의 begin index가 같은 경우 이 둘을 연결한다.

```python
links = []
for word in sent[:-1]:
    for word in words:
        begin = word[3]
        end = word[4]
        for adjacent in sent[end]:
            links.append((word, adjacent))
```

In [67]:
def get_nonempty_first(sent, end, offset=0):
    """offset 이후의 지점에서 sent[i]가 empty가 아닌 가장 빠른 지점 리턴"""
    for i in range(offset, end):
        if sent[i]:
            return i
    return offset

문장 끝 의미하는 EOS 추가

```python
sent = sentence_lookup(sentence)
n_char = len(sent) + 1
eos = ('EOS', 'EOS', 'EOS', n_char-1, n_char)
sent.append([eos])
```

In [74]:
def generate_link(sentence):
    
    chars = sentence.replace(' ', '')
    sent = sentence_lookup(sentence)
    n_char = len(sent) + 1
    
    eos = ('EOS', 'EOS', 'EOS', n_char-1, n_char)
    sent.append([eos])
    
    # 첫 단어 위치
    i = get_nonempty_first(sent, n_char)
    
    if i > 0:
        sent[0].append((chars[:i], 'Unk', 'Unk', 0, i))
        
    links = []
    for words in sent[:-1]:
        for word in words:
            begin = word[3]
            end = word[4]
            # 이을 단어가 없는 경우 'Unk' 추가
            if not sent[end]:
                b = get_nonempty_first(sent, n_char, end)
                unk = (chars[end:b], 'Unk', 'Unk', end, b)
                links.append((word, unk))
            else:
                # 아닌 경우 현재 단어의 끝점에서 시작하는 단어들과 이음
                for adjacent in sent[end]:
                    links.append((word, adjacent))
    
    # 'Unk'에서 시작하는 edge 생성
    unks = {to_node for _, to_node in links if to_node[1] == 'Unk'}
    for unk in unks:
        for adjacent in sent[unk[3]]:
            links.append((unk, adjacent))
            
    bos = ('BOS', 'BOS', 'BOS', 0, 0)
    for word in sent[0]:
        links.append((bos, word))
    # 재정렬
    links = sorted(links, key=lambda x:(x[0][3], x[1][4]))
    
    return links, bos, eos

In [75]:
links, bos, eos = generate_link('이것은 예시였다')

pprint(links)

[(('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Adjective', 'Adjective', 0, 1)),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Josa', 'Josa', 0, 1)),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Noun', 'Noun', 0, 1)),
 (('이', 'Adjective', 'Adjective', 0, 1), ('것', 'Unk', 'Unk', 1, 2)),
 (('이', 'Josa', 'Josa', 0, 1), ('것', 'Unk', 'Unk', 1, 2)),
 (('이', 'Noun', 'Noun', 0, 1), ('것', 'Unk', 'Unk', 1, 2)),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이것', 'Noun', 'Noun', 0, 2)),
 (('이것', 'Noun', 'Noun', 0, 2), ('은', 'Josa', 'Josa', 2, 3)),
 (('은', 'Josa', 'Josa', 2, 3), ('예시', 'Noun', 'Noun', 3, 5)),
 (('예시', 'Noun', 'Noun', 3, 5), ('이 + 었다', 'Adjective', 'Eomi', 5, 7)),
 (('이 + 었다', 'Adjective', 'Eomi', 5, 7), ('EOS', 'EOS', 'EOS', 7, 8)),
 (('다', 'Eomi', 'Eomi', 6, 7), ('EOS', 'EOS', 'EOS', 7, 8)),
 (('다', 'Josa', 'Josa', 6, 7), ('EOS', 'EOS', 'EOS', 7, 8))]


가중치는 앞 마디로부터 지금 단어로 이동하는 transition probability와 현재 마디의 단어, 품사가 발생할 emission probability의 곱(혹은 로그의 합)을 이용한다.

In [76]:
def add_weight(links):
    
    def weight(from_node, to_node):
        morphs = to_node[0].split(' + ')
        
        # 첫 단어 점수
        w = emission.get(to_node[1], {}).get(morphs[0], _min_emission)
        w += transition.get((from_node[2], to_node[1]), _min_transition)
        
        # 두번째 단어 점수
        if len(morphs) == 2:
            w += emission.get(to_node[2], {}).get(morphs[1], _min_emission)
            w += transition.get((from_node[2], to_node[2]), _min_transition)
            
        return w
    
    graph = []
    for from_node, to_node in links:
        edge = (from_node, to_node, weight(from_node, to_node))
        graph.append(edge)
        
    return graph

In [77]:
graph = add_weight(links)
pprint(graph)

[(('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Adjective', 'Adjective', 0, 1), 0.1),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Josa', 'Josa', 0, 1), 0.1),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Noun', 'Noun', 0, 1), 0.15),
 (('이', 'Adjective', 'Adjective', 0, 1), ('것', 'Unk', 'Unk', 1, 2), 0.0),
 (('이', 'Josa', 'Josa', 0, 1), ('것', 'Unk', 'Unk', 1, 2), 0.0),
 (('이', 'Noun', 'Noun', 0, 1), ('것', 'Unk', 'Unk', 1, 2), 0.0),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이것', 'Noun', 'Noun', 0, 2), 0.1),
 (('이것', 'Noun', 'Noun', 0, 2),
  ('은', 'Josa', 'Josa', 2, 3),
  0.30000000000000004),
 (('은', 'Josa', 'Josa', 2, 3),
  ('예시', 'Noun', 'Noun', 3, 5),
  0.30000000000000004),
 (('예시', 'Noun', 'Noun', 3, 5), ('이 + 었다', 'Adjective', 'Eomi', 5, 7), 0.25),
 (('이 + 었다', 'Adjective', 'Eomi', 5, 7), ('EOS', 'EOS', 'EOS', 7, 8), 0.0),
 (('다', 'Eomi', 'Eomi', 6, 7), ('EOS', 'EOS', 'EOS', 7, 8), 0.0),
 (('다', 'Josa', 'Josa', 6, 7), ('EOS', 'EOS', 'EOS', 7, 8), 0.0)]


## 포드 알고리즘을 이용한 최단경로 찾기

In [88]:
def ford_list(E, V, S, T):
    
    # 초기화
    # (최대 가중치 + 1) * 노드 갯수
    inf = (min((weight for from_, to_, weight in E)) - 1) * len(V)
    
    # 거리들
    d = {node:0 if node == S else inf for node in V}
    # 이전 노드
    prev = {node:None for node in V}
    
    # 이터레이션
    # 무한 루프 방지
    for _ in range(len(V)):
        # 이른 정지
        changed = False
        for u, v, Wuv in E:
            d_new = d[u] + Wuv
            if d_new > d[v]:
                d[v] = d_new
                prev[v] = u
                changed = True
        if not changed:
            break
            
    # 순환 있는지 체크
    for u, v, Wuv in E:
        if d[u] + Wuv > d[v]:
            raise ValueError('Cycle exists')
            
    # 패스 탐색
    prev_ = prev[T]
    if prev_ == S:
        return path[::-1], d[T]
    
    path = [T]
    while prev_ != S:
        path.append(prev_)
        prev_ = prev[prev_]
    path.append(S)
    
    return path[::-1], d[T]

In [89]:
graph

[(('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Adjective', 'Adjective', 0, 1), 0.1),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Josa', 'Josa', 0, 1), 0.1),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이', 'Noun', 'Noun', 0, 1), 0.15),
 (('이', 'Adjective', 'Adjective', 0, 1), ('것', 'Unk', 'Unk', 1, 2), 0.0),
 (('이', 'Josa', 'Josa', 0, 1), ('것', 'Unk', 'Unk', 1, 2), 0.0),
 (('이', 'Noun', 'Noun', 0, 1), ('것', 'Unk', 'Unk', 1, 2), 0.0),
 (('BOS', 'BOS', 'BOS', 0, 0), ('이것', 'Noun', 'Noun', 0, 2), 0.1),
 (('이것', 'Noun', 'Noun', 0, 2),
  ('은', 'Josa', 'Josa', 2, 3),
  0.30000000000000004),
 (('은', 'Josa', 'Josa', 2, 3),
  ('예시', 'Noun', 'Noun', 3, 5),
  0.30000000000000004),
 (('예시', 'Noun', 'Noun', 3, 5), ('이 + 었다', 'Adjective', 'Eomi', 5, 7), 0.25),
 (('이 + 었다', 'Adjective', 'Eomi', 5, 7), ('EOS', 'EOS', 'EOS', 7, 8), 0.0),
 (('다', 'Eomi', 'Eomi', 6, 7), ('EOS', 'EOS', 'EOS', 7, 8), 0.0),
 (('다', 'Josa', 'Josa', 6, 7), ('EOS', 'EOS', 'EOS', 7, 8), 0.0)]

In [90]:
nodes = {node for edge in graph for node in edge[:2]}
nodes

{('BOS', 'BOS', 'BOS', 0, 0),
 ('EOS', 'EOS', 'EOS', 7, 8),
 ('것', 'Unk', 'Unk', 1, 2),
 ('다', 'Eomi', 'Eomi', 6, 7),
 ('다', 'Josa', 'Josa', 6, 7),
 ('예시', 'Noun', 'Noun', 3, 5),
 ('은', 'Josa', 'Josa', 2, 3),
 ('이', 'Adjective', 'Adjective', 0, 1),
 ('이', 'Josa', 'Josa', 0, 1),
 ('이', 'Noun', 'Noun', 0, 1),
 ('이 + 었다', 'Adjective', 'Eomi', 5, 7),
 ('이것', 'Noun', 'Noun', 0, 2)}

In [92]:
# 최적의 거리
path, cost = ford_list(graph, nodes, bos, eos)

In [93]:
path

[('BOS', 'BOS', 'BOS', 0, 0),
 ('이것', 'Noun', 'Noun', 0, 2),
 ('은', 'Josa', 'Josa', 2, 3),
 ('예시', 'Noun', 'Noun', 3, 5),
 ('이 + 었다', 'Adjective', 'Eomi', 5, 7),
 ('EOS', 'EOS', 'EOS', 7, 8)]

In [94]:
cost

0.9500000000000001

## 형태소 분석 결과로 만들기

In [95]:
def flatten(path):
    pos = []
    for word, tag0, tag1, b, e in path:
        morphs = word.split(' + ')
        pos.append((morphs[0], tag0))
        if len(morphs) == 2:
            pos.append((morphs[1], tag1))
    return pos
pos = flatten(path)

In [96]:
pos

[('BOS', 'BOS'),
 ('이것', 'Noun'),
 ('은', 'Josa'),
 ('예시', 'Noun'),
 ('이', 'Adjective'),
 ('었다', 'Eomi'),
 ('EOS', 'EOS')]

미등록 단어의 경우는?

In [105]:
links, bos, eos = generate_link('tt도예시였다')
graph = add_weight(links)
nodes = {node for edge in graph for node in edge[:2]}
path, cost = ford_list(graph, nodes, bos, eos)
pos = flatten(path)
pos

[('BOS', 'BOS'),
 ('tt', 'Unk'),
 ('도', 'Josa'),
 ('예시', 'Noun'),
 ('이', 'Adjective'),
 ('었다', 'Eomi'),
 ('EOS', 'EOS')]

## Unk 추정하기

In [106]:
pos[:-1]

[('BOS', 'BOS'),
 ('tt', 'Unk'),
 ('도', 'Josa'),
 ('예시', 'Noun'),
 ('이', 'Adjective'),
 ('었다', 'Eomi')]

In [107]:
begin

{'Noun': 0.2, 'Verb': 0.1, 'Adjective': 0.1}

In [108]:
{tag:prob for tag, prob in begin.items()}

{'Noun': 0.2, 'Verb': 0.1, 'Adjective': 0.1}

In [110]:
def inference_unknown(pos):
    pos_ = []
    for i, pos_i in enumerate(pos[:-1]):
        if not (pos_i[1] == 'Unk'):
            pos_.append(pos_i)
            continue
        
        # previous -> current transition 이용한 추정
        if i == 1:
            tag_prob = begin.copy()
        else:
            tag_prob = {
                tag:prob for (prev_tag, tag), prob in transition.items()
                if prev_tag == pos[i-1][1]
            }
            
        # current -> next transition 이용한 추정
        for (tag, next_tag), prob in transition.items():
            if next_tag == pos[i+1][1]:
                tag_prob[tag] = tag_prob.get(tag, 0) + prob
                
        # 전후 어떤 추정도 사용할 수 없으면 가장 많은 명사로 추정
        if not tag_prob:
            infered_tag = 'Noun'
        else:
            # 가장 확률 높은 태그 추천
            infered_tag = sorted(tag_prob, key=lambda x:-tag_prob[x])[0]
        pos_.append((pos_i[0], infered_tag))
        
    return pos_ + pos[-1:]

In [111]:
inference_unknown(pos)

[('BOS', 'BOS'),
 ('tt', 'Noun'),
 ('도', 'Josa'),
 ('예시', 'Noun'),
 ('이', 'Adjective'),
 ('었다', 'Eomi'),
 ('EOS', 'EOS')]

In [112]:
def postprocessing(pos):
    return pos[1:-1]

# Train

In [1]:
from utils import read_corpus
sj = read_corpus('data/corpus_type1_all.txt', 100)

In [2]:
sj

[[('프랑스', 'Noun'),
  ('의', 'Josa'),
  ('세계적', 'Noun'),
  ('인', 'Adjective'),
  ('의상', 'Noun'),
  ('디자이너', 'Noun'),
  ('엠마누엘', 'Noun'),
  ('웅가로', 'Noun'),
  ('가', 'Josa'),
  ('실내', 'Noun'),
  ('장식용', 'Noun'),
  ('직물', 'Noun'),
  ('디자이너', 'Noun'),
  ('로', 'Josa'),
  ('나서', 'Verb'),
  ('었다', 'Eomi')],
 [('웅가로', 'Noun'),
  ('는', 'Josa'),
  ('침실', 'Noun'),
  ('과', 'Josa'),
  ('식당', 'Noun'),
  ('욕실', 'Noun'),
  ('에서', 'Josa'),
  ('사용', 'Noun'),
  ('하는', 'Verb'),
  ('갖가지', 'Noun'),
  ('직물제품', 'Noun'),
  ('을', 'Josa'),
  ('디자인', 'Noun'),
  ('해', 'Verb'),
  ('최근', 'Noun'),
  ('파리', 'Noun'),
  ('의', 'Josa'),
  ('갤러리', 'Noun'),
  ('라파예트백화점', 'Noun'),
  ('에서', 'Josa'),
  ('색', 'Noun'),
  ('의', 'Josa'),
  ('컬렉션', 'Noun'),
  ('이라는', 'Adjective'),
  ('이름', 'Noun'),
  ('으로', 'Josa'),
  ('전시회', 'Noun'),
  ('를', 'Josa'),
  ('열', 'Verb'),
  ('었다', 'Eomi')],
 [('목욕가운', 'Noun'),
  ('부터', 'Josa'),
  ('탁자보', 'Noun'),
  ('냅킨', 'Noun'),
  ('앞치마', 'Noun'),
  ('까지', 'Josa'),
  ('그', 'Pronoun'),
  ('가', 'Josa'),


In [9]:
import json
import os
from collections import defaultdict

from utils import as_bigram_tag

def train(corpus, save_path):
    pos2words = defaultdict(lambda: defaultdict(int))
    trans = defaultdict(int)
    bos = defaultdict(int)

    # sent = [(word, tag), (word, tag), ... ] format
    for sent in corpus:

        # generation prob
        for word, pos in sent:
            pos2words[pos][word] += 1

        # transition prob
        for bigram in as_bigram_tag(sent):
            trans[bigram] += 1

        # begin prob (BOS -> tag)
        bos[sent[0][1]] += 1

        # end prob (tag -> EOS)
        trans["_".join([sent[-1][1], 'EOS'])] += 1
        
    # save trarined data
    trained = dict()
    
    trained['pos2words'] = {k:dict(v) for k, v in pos2words.items()}
    trained['trans'] = dict(trans)
    trained['bos'] = dict(bos)
    
    with open(save_path, 'w') as f:
        json.dump(trained, f)
        
    return trained

In [10]:
trained = train(sj, 'data/trained_corpus_type1.json')

In [11]:
trained

{'pos2words': {'Noun': {'프랑스': 1,
   '세계적': 1,
   '의상': 1,
   '디자이너': 2,
   '엠마누엘': 2,
   '웅가로': 3,
   '실내': 2,
   '장식용': 1,
   '직물': 1,
   '침실': 1,
   '식당': 1,
   '욕실': 1,
   '사용': 1,
   '갖가지': 1,
   '직물제품': 1,
   '디자인': 4,
   '최근': 1,
   '파리': 1,
   '갤러리': 1,
   '라파예트백화점': 1,
   '색': 3,
   '컬렉션': 1,
   '이름': 1,
   '전시회': 1,
   '목욕가운': 1,
   '탁자보': 1,
   '냅킨': 1,
   '앞치마': 1,
   '작품들': 1,
   '것': 1,
   '조화': 2,
   '남미풍': 1,
   '강렬': 1,
   '원색끼리': 1,
   '수채화': 1,
   '안온': 1,
   '배색': 1,
   '등': 1,
   '분위기': 1,
   '강조': 1,
   '기하학적': 1,
   '무늬': 1,
   '꽃무늬': 1,
   '주류': 1,
   '장식품': 1,
   '때': 2,
   '옷': 2,
   '해방감': 1,
   '말': 1,
   '집': 1,
   '창작': 1,
   '원천': 1,
   '공간': 1,
   '미학': 1,
   '중요시': 1},
  'Josa': {'의': 8,
   '가': 2,
   '로': 1,
   '는': 3,
   '과': 1,
   '에서': 3,
   '을': 5,
   '으로': 1,
   '를': 3,
   '부터': 1,
   '까지': 1,
   '은': 2,
   '다': 1,
   '이': 2,
   '와는': 1},
  'Adjective': {'인': 1, '이라는': 2, '한': 2, '다르': 1, '못지않': 1},
  'Verb': {'나서': 1,
   '하는': 2,
   '해': 2,
   '열