In [3]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from util_module import Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

## Dataset
---
텍스트 코퍼스를 읽고 훈련 및 테스트 데이터셋으로 나눔. 데이터는 브라운 코퍼스의 복사본.

util_module.py의 Dataset 클래스는 코퍼스 읽어오고 구문 분석 실행. 데이터 집합은 단어 및 해당 태그 모음으로 plaintext에 저장됨. 각 문장은 첫 번째 줄에서 고유 식별자로 시작하고, 그 뒤에 탭으로 구분된 단어/태그 쌍이 다음 줄에서 하나씩 나타남. 문장은 빈 줄로 구분.

예시 
```
b100-38532
Perhaps	ADV
it	PRON
was	VERB
right	ADJ
;	.
;	.

b100-35577
...
```

In [4]:
# Build the dataset from the tag list and corpus files; train_test_split=0.8
# requests an 80/20 split into data.training_set and data.testing_set.
data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

# Report the number of sentences in the whole corpus and in each split.
print("코퍼스에 {} 개의 문장이 있음".format(len(data)))
print("훈련셋에 {} 개의 문장이 있음".format(len(data.training_set)))
# The stray trailing "+" in this message was a typo and has been removed.
print("검증셋에 {} 개의 문장이 있음".format(len(data.testing_set)))

코퍼스에 57340 개의 문장이 있음
훈련셋에 45872 개의 문장이 있음
검증셋에 11468 개의 문장이 있음+


In [5]:
# Inspect a single sentence by its identifier: print the id, then its words
# and its tags.
key = "b100-38532"
print("문장: {}".format(key))
sentence = data.sentences[key]
print("단어연쇄:\n\t{!s}".format(sentence.words))
print("태그:\n\t{!s}".format(sentence.tags))

문장: b100-38532
단어연쇄:
	('Perhaps', 'it', 'was', 'right', ';', ';')
태그:
	('ADV', 'PRON', 'VERB', 'ADJ', '.', '.')


In [6]:
# Print the 6th and 7th sentences and their tag sequences, each followed by a
# blank line (the extra newline is emitted through `end`).
for i in range(5, 7):
    print("Sent {}:".format(i + 1), data.X[i], end="\n\n")
    print("Labels {}:".format(i + 1), data.Y[i], end="\n\n")

Sent 6: ('After', 'television', ',', '``', 'La', 'Dolce', 'Vita', "''", 'seems', 'as', 'harmless', 'as', 'a', 'Gray', 'Line', 'tour', 'of', 'North', 'Beach', 'at', 'night', '.')

Labels 6: ('ADP', 'NOUN', '.', '.', 'X', 'X', 'X', '.', 'VERB', 'ADV', 'ADJ', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', '.')

Sent 7: ('It', 'would', 'give', 'him', 'an', 'opportunity', 'to', 'take', 'the', 'measure', 'of', 'his', 'chief', 'adversary', 'in', 'the', 'cold', 'war', ',', 'to', 'try', 'to', 'probe', 'Mr.', "Khrushchev's", 'intentions', 'and', 'to', 'make', 'clear', 'his', 'own', 'views', '.')

Labels 7: ('PRON', 'VERB', 'VERB', 'PRON', 'DET', 'NOUN', 'PRT', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', '.', 'PRT', 'VERB', 'PRT', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'CONJ', 'PRT', 'VERB', 'ADJ', 'DET', 'ADJ', 'NOUN', '.')



In [7]:
# Peek at the first six (word, tag) pairs produced by data.stream().
for i, tup in enumerate(data.stream()):
    print(tup)
    if i >= 5:
        break

('Mr.', 'NOUN')
('Podger', 'NOUN')
('had', 'VERB')
('thanked', 'VERB')
('him', 'PRON')
('gravely', 'ADV')


## 필요한 함수 정의

In [20]:
def pair_counts(A, B):
    """Count co-occurrences of the items of A with the items of B.

    A and B are walked in lockstep with ``zip``, so surplus items in the
    longer input are ignored.

    Args:
        A: iterable of outer keys (the caller in this notebook passes tags).
        B: iterable of inner keys (the caller in this notebook passes words).

    Returns:
        A plain nested dict ``{a: {b: count}}``.
    """
    counts = {}
    for outer, inner in zip(A, B):
        per_outer = counts.setdefault(outer, {})
        per_outer[inner] = per_outer.get(inner, 0) + 1
    return counts

# Transpose the (word, tag) stream into [words, tags]. pair_counts takes the
# tag sequence first, so the two sequences are passed in reverse order.
word_tag = list(zip(*data.training_set.stream()))
emission_counts = pair_counts(*reversed(word_tag))

In [45]:
def replace_unknown(sequence):
    """Return a list copy of ``sequence`` with out-of-vocabulary words masked.

    Any word that is not in ``data.training_set.vocab`` is replaced by the
    string 'nan'; all other words are kept unchanged.
    """
    cleaned = []
    for word in sequence:
        if word in data.training_set.vocab:
            cleaned.append(word)
        else:
            cleaned.append('nan')
    return cleaned

def simplify_decoding(X, model):
    """Return the list of state names on the Viterbi path for the words in X.

    Words are first passed through ``replace_unknown``. The first and last
    entries of the returned path are dropped (presumably the model's start
    and end states -- confirm against the pomegranate docs), and the ``name``
    of the second element of each remaining path entry is collected.
    """
    observations = replace_unknown(X)
    _, path = model.viterbi(observations)
    inner_steps = path[1:-1]
    return [step[1].name for step in inner_steps]

In [49]:
def accuracy(X, Y, model):
    """Return the fraction of observations whose predicted tag equals the true tag.

    Args:
        X: iterable of observation sequences (one per sentence).
        Y: iterable of true tag sequences, aligned with X.
        model: model object handed to ``simplify_decoding``.

    Returns:
        ``correct / total`` as a float, where ``total`` is the summed length
        of all observation sequences. Returns 0.0 when there are no
        observations at all, instead of raising ZeroDivisionError.

    Decoding is best-effort: if decoding a sentence raises, that sentence
    contributes zero correct predictions but its length still counts toward
    the total.
    """
    correct = total_predictions = 0
    for observations, actual_tags in zip(X, Y):
        try:
            most_likely_tags = simplify_decoding(observations, model)
            correct += sum(p == t for p, t in zip(most_likely_tags, actual_tags))
        except Exception:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop a long evaluation.
            pass
        total_predictions += len(observations)
    if total_predictions == 0:
        return 0.0
    return correct / total_predictions

## HMM tagger


$$\hat{t}_1^n = \underset{t_1^n}{\mathrm{argmax}} \prod_{i=1}^n P(w_i|t_i) P(t_i|t_{i-1})$$

### 유니그램 카운트


$$P(tag_1) = \frac{C(tag_1)}{N}$$

In [9]:
def unigram_counts(sequences):
    """Tally the items of the second element of ``sequences``.

    ``sequences`` is materialised into a list and the element at index 1 is
    counted (the caller in this notebook passes the transposed (word, tag)
    stream, so index 1 is the tag sequence).

    Returns:
        A ``collections.Counter`` mapping each item to its frequency.
    """
    columns = list(sequences)
    tally = Counter()
    tally.update(columns[1])
    return tally

# Transpose the training (word, tag) stream and count the second sequence
# (the tags).
tag_unigrams = unigram_counts(zip(*data.training_set.stream()))

In [10]:
# Display the tag unigram counts as the cell output.
tag_unigrams

Counter({'ADV': 44877,
         'NOUN': 220632,
         '.': 117757,
         'VERB': 146161,
         'ADP': 115808,
         'ADJ': 66754,
         'CONJ': 30537,
         'DET': 109671,
         'PRT': 23906,
         'NUM': 11878,
         'PRON': 39383,
         'X': 1094})

### 바이그램 카운트

$$P(tag_2|tag_1) = \frac{C(tag_1, tag_2)}{C(tag_1)}$$


In [11]:
def bigram_counts(sequences):
    """Count adjacent pairs across all sequences.

    Args:
        sequences: iterable of sequences (e.g. one tag tuple per sentence).

    Returns:
        A plain dict mapping ``(item_i, item_i_plus_1)`` to the number of
        times that adjacent pair occurs. Pairs never span two sequences, and
        sequences shorter than two items contribute nothing.
    """
    counts = {}
    for seq in sequences:
        # Pair every item with its successor.
        for pair in zip(seq, seq[1:]):
            counts[pair] = counts.get(pair, 0) + 1
    return counts

# Count adjacent tag pairs over the training tag sequences.
tag_bigrams = bigram_counts(data.training_set.Y)

### 스타트 태그 세기

In [13]:
def starting_counts(sequences):
    """Count how often each item appears as the first element of a sequence.

    Args:
        sequences: iterable of sequences (e.g. one tag tuple per sentence).

    Returns:
        A plain dict mapping each first item to the number of sequences that
        begin with it. Empty sequences have no first item and are skipped
        rather than raising IndexError.
    """
    counts = {}
    for seq in sequences:
        if len(seq) == 0:
            continue
        first = seq[0]
        counts[first] = counts.get(first, 0) + 1
    return counts

# Count the first tag of every training tag sequence.
tag_starts = starting_counts(data.training_set.Y)

### 마지막 태그 세기

In [15]:
def ending_counts(sequences):
    """Count how often each item appears as the last element of a sequence.

    Args:
        sequences: iterable of sequences (e.g. one tag tuple per sentence).

    Returns:
        A plain dict mapping each last item to the number of sequences that
        end with it. Empty sequences have no last item and are skipped
        rather than raising IndexError.
    """
    counts = {}
    for seq in sequences:
        if len(seq) == 0:
            continue
        last = seq[-1]
        counts[last] = counts.get(last, 0) + 1
    return counts

# Count the last tag of every training tag sequence.
tag_ends = ending_counts(data.training_set.Y)

### HMM 태거

- 태그당 하나의 상태
    - 각 상태별 방출확률분포: $P(w|t) = \frac{C(t, w)}{C(t)}$
- 시작 상태(start)에서 각 태그 상태로 edge 추가
    - 천이확률: $P(t|start) = \frac{C(start, t)}{C(start)}$
- 각 태그 상태에서 종료 상태(end)로 edge 추가
    - 이 때의 천이확률: $P(end|t) = \frac{C(t, end)}{C(t)}$
- 각 태그별 천이확률
    - $P(t_2|t_1) = \frac{C(t_1, t_2)}{C(t_1)}$

In [36]:
# Create an empty HMM; states and transitions are added in the cells below.
# NOTE(review): re-running the later cells without re-running this one adds
# states/transitions to the same model object again.
model = HiddenMarkovModel()

In [37]:
# Create one HMM state per tag. Each state's emission distribution is
# P(word | tag) = C(tag, word) / C(tag).
s = {}        # tag -> State, used when wiring up transitions
states = []   # left empty by this cell
for tag, word_counts in emission_counts.items():
    tag_count = tag_unigrams[tag]
    emission_p = DiscreteDistribution(
        {word: count / tag_count for word, count in word_counts.items()}
    )
    state = State(emission_p, name=tag)
    model.add_state(state)
    s[tag] = state

In [40]:
# Start transitions: P(t | start) = C(start, t) / C(start), where C(start) is
# the number of training sentences.
sentence_count = len(data.training_set.Y)
for tag, count in tag_starts.items():
    model.add_transition(model.start, s[tag], count / sentence_count)

# End transitions: P(end | t) = C(t, end) / C(t).
# These are driven by tag_ends rather than tag_starts: a tag that starts a
# sentence but never ends one would otherwise raise KeyError, and a tag that
# ends a sentence but never starts one would silently get no end transition.
for tag, count in tag_ends.items():
    model.add_transition(s[tag], model.end, count / tag_unigrams[tag])

# Tag-to-tag transitions: P(t2 | t1) = C(t1, t2) / C(t1).
for (tag1, tag2), count in tag_bigrams.items():
    model.add_transition(s[tag1], s[tag2], count / tag_unigrams[tag1])

# Finalize the model structure once every state and edge has been added.
model.bake()

In [50]:
# Per-word tagging accuracy on the training sentences, printed as a percentage.
# Sentences whose decoding raises count as entirely wrong (see accuracy()).
hmm_training_acc = accuracy(data.training_set.X, data.training_set.Y, model)
print("training accuracy basic hmm model: {:.2f}%".format(100 * hmm_training_acc))

# Same measurement on the testing sentences.
hmm_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, model)
print("testing accuracy basic hmm model: {:.2f}%".format(100 * hmm_testing_acc))

training accuracy basic hmm model: 97.54%
testing accuracy basic hmm model: 95.95%


### Decoding

In [51]:
# Compare predicted and actual tags for three sentences from the testing set.
for key in data.testing_set.keys[10:13]:
    print("Sent key: {}\n".format(key))
    sentence = data.sentences[key]
    print("예측:\n-----------------")
    # `end` supplies the blank line that separates prediction from truth.
    print(simplify_decoding(sentence.words, model), end="\n\n")
    print("실제:\n--------------")
    print(sentence.tags)
    print("\n")

Sent key: b100-56612

예측:
-----------------
['PRON', 'VERB', 'ADV', 'VERB', 'PRON', '.', 'ADP', 'PRON', 'VERB', 'VERB', 'PRT', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', '.']

실제:
--------------
('PRON', 'VERB', 'ADV', 'VERB', 'PRON', '.', 'ADP', 'PRON', 'VERB', 'VERB', 'PRT', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', '.')


Sent key: b100-47330

예측:
-----------------
['ADV', 'VERB', 'PRON', 'VERB', 'ADV', '.', '.', '.']

실제:
--------------
('ADV', 'VERB', 'PRON', 'VERB', 'ADV', '.', '.', '.')


Sent key: b100-36695

예측:
-----------------
['ADJ', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP', 'DET', 'ADJ', '.', 'DET', 'NOUN', 'VERB', 'PRT', '.', 'VERB', 'DET', 'NUM', 'NOUN', 'ADP', 'DET', 'VERB', 'NOUN', '.', 'CONJ', '.', 'VERB', 'DET', 'ADJ', 'NOUN', 'PRT', 'VERB', 'PRON', 'PRT', 'ADP', 'PRON', '.', '.', 'ADP', 'NOUN', 'NOUN', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP', 'ADV', 'VERB', '.', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'CONJ'