Author: Kunqi Jiang   
Created by Sep 23 2018; 2:13 AM 

In this notebook, I summarize the basics of the hidden Markov model (HMM) and apply it to part-of-speech tagging using the Penn Treebank dataset.  

The three main problems in HMM to solve:    

(1) compute the joint probability $p(O|\lambda)$ of observed sequence $O$ given parameter $\lambda=(A,B,\pi)$.    

(2) Given the parameters $\lambda$ and the sequence $O$, compute the most likely sequence of hidden states $\arg\max_{I} P(I|O,\lambda)$.

(3) Learning parameters $\lambda$ by EM algorithm maximizing $p(O|\lambda)$.    

### Toy Example setup

In [112]:
# Toy HMM used by the cells below.
# observations: one sequence of emitted symbols (each symbol is in obs_set).
observations = [1,2,2,2,1]
# states: labels of the four hidden states.
states = ['h','m','l','n']
# obs_set: the observable alphabet.
obs_set = [1,2]

# A: transition table, indexed as trans_prob[from_state][to_state].
# Each row sums to 1.
trans_prob = {'h':{'h':0.5,'m':0.2,'l':0.2,'n':0.1},
              'm':{'h':0.1,'m':0.5,'l':0.2,'n':0.2},
              'l':{'h':0.2,'m':0.3,'l':0.1,'n':0.4},
              'n':{'h':0.3,'m':0.1,'l':0.4,'n':0.2}}
# B: emission table, indexed as emm_prob[state][symbol].
# Each row sums to 1.
emm_prob = {'h':{1:0.7,2:0.3},
            'm':{1:0.6,2:0.4},
            'l':{1:0.1,2:0.9},
            'n':{1:0.5,2:0.5}}
# Pi: distribution of the first hidden state; the functions below also use
# its keys as the list of hidden states.
init_prob = {'h':0.4,'m':0.1,'l':0.3,'n':0.2}

### Forward
Start from the first state:   

$$\alpha(h_1) = \pi(h_1)p(v_1|h_1)$$

then move forward recursively by:

\begin{eqnarray}
\alpha(h_t) &=& p(h_t,v_{1:t}) \\
&=& \sum_{h_{t-1}}p(v_t,h_t,h_{t-1},v_{1:t-1}) \\
&=& \sum_{h_{t-1}}p(v_t|h_t,h_{t-1},v_{1:t-1})p(h_t|h_{t-1},v_{1:t-1}) p(h_{t-1},v_{1:t-1}) \\
&=& p(v_t|h_t) \sum_{h_{t-1}}p(h_t|h_{t-1})p(h_{t-1},v_{1:t-1}) \\
&=& p(v_t|h_t) \sum_{h_{t-1}}p(h_t|h_{t-1})\alpha(h_{t-1})
\end{eqnarray}

In [2]:
def forward(observations,trans_prob,emm_prob,init_prob,t,h_t):
    """Return the forward message alpha_t(h_t) = p(h_t, v_{1:t}).

    Args:
        observations: sequence of observed symbols v_1..v_T.
        trans_prob: nested dict, trans_prob[prev][cur] = p(cur | prev).
        emm_prob: nested dict, emm_prob[state][symbol] = p(symbol | state).
        init_prob: dict, init_prob[state] = p(h_1 = state); its keys define
            the set of hidden states.
        t: 1-based time step at which the message is evaluated.
        h_t: hidden state whose message is returned.

    Returns:
        The joint probability of being in ``h_t`` at step ``t`` and having
        emitted the first ``t`` observations.
    """
    hidden = init_prob.keys()
    opening = observations[0]

    # Base case: alpha_1(s) = pi(s) * p(v_1 | s).
    alpha = {s: init_prob[s] * emm_prob[s][opening] for s in hidden}

    # Recursion over v_2..v_t. The right-hand side reads the previous step's
    # table because `alpha` is only rebound once the comprehension finishes.
    for symbol in observations[1:t]:
        alpha = {
            s: emm_prob[s][symbol] * sum((trans_prob[prev][s] * alpha[prev])
                                         for prev in hidden)
            for s in hidden
        }

    return alpha[h_t]

In [120]:
forward(observations,trans_prob,emm_prob,init_prob,5,'h')

0.010133999399999998

### Joint Likelihood ----- first problem
$p(v_{1:T}) = \sum_{h_T}p(h_T,v_{1:T}) = \sum_{h_T}\alpha(h_T)$

In [4]:
def likelihood(observations,trans_prob,emm_prob,init_prob,t):
    """Return p(v_{1:t}) by summing the forward message over all states at step t.

    Args:
        observations: sequence of observed symbols.
        trans_prob: nested dict of transition probabilities.
        emm_prob: nested dict of emission probabilities.
        init_prob: dict of initial-state probabilities; its keys are the
            hidden states that are summed over.
        t: 1-based number of leading observations to account for.

    Returns:
        The sum over hidden states s of forward(..., t, s).
    """
    marginal = sum(
        forward(observations,trans_prob,emm_prob,init_prob,t,final_state)
        for final_state in init_prob.keys()
    )
    return marginal

In [5]:
likelihood(observations,trans_prob,emm_prob,init_prob,len(observations))

0.0288189951

### Backward

Start from the last state $$\beta(h_T) = 1$$

then move backward recursively by:

\begin{eqnarray}
\beta(h_t) &=& p(v_{t+1:T}|h_t) \\
&=& \sum_{h_{t+1}}p(v_{{t+2}:T},v_{t+1},h_{t+1}|h_{t}) \\
&=& \sum_{h_{t+1}}p(v_{t+2:T}|v_{t+1},h_{t+1},h_{t})p(v_{t+1}|h_{t+1},h_{t})p(h_{t+1}|h_{t}) \\
&=& \sum_{h_{t+1}}p(v_{t+2:T}|h_{t+1})p(v_{t+1}|h_{t+1})p(h_{t+1}|h_{t}) \\
&=& \sum_{h_{t+1}}p(v_{t+1}|h_{t+1})p(h_{t+1}|h_{t})\beta(h_{t+1})
\end{eqnarray}


In [6]:
def backward(observations,trans_prob,emm_prob,init_prob,t,h_t):
    """Return the backward message beta_t(h_t) = p(v_{t+1:T} | h_t).

    Args:
        observations: sequence of observed symbols v_1..v_T.
        trans_prob: nested dict, trans_prob[cur][nxt] = p(nxt | cur).
        emm_prob: nested dict, emm_prob[state][symbol] = p(symbol | state).
        init_prob: dict whose keys define the hidden states (its values are
            not used here).
        t: 1-based time step at which the message is evaluated.
        h_t: hidden state whose message is returned.

    Returns:
        1 when there are no observations after step ``t``; otherwise the
        probability of the remaining observations given state ``h_t`` at ``t``.
    """
    hidden = init_prob.keys()

    # Base case: beta_T(s) = 1 for every state.
    beta = {s: 1 for s in hidden}

    # Walk from v_T down to v_{t+1}; the right-hand side reads the table of
    # the later time step because `beta` is rebound only after the
    # comprehension has been fully evaluated.
    for symbol in reversed(observations[t:]):
        beta = {
            s: sum(emm_prob[nxt][symbol] * trans_prob[s][nxt] * beta[nxt]
                   for nxt in hidden)
            for s in hidden
        }

    return beta[h_t]

In [7]:
backward(observations,trans_prob,emm_prob,init_prob,2,'h')

0.11022500000000002

### Smoothing
\begin{eqnarray}
p(h_t,v_{1:T}) &=& p(v_{t+1:T},h_t,v_{1:t}) \\
&=& p(v_{t+1:T}|h_t,v_{1:t})p(h_t,v_{1:t}) \\
&=& p(h_t,v_{1:t}) p(v_{t+1:T}|h_t) \\
&=& \alpha(h_t) \beta(h_t) \\
\end{eqnarray}

$$\gamma(h_t) = p(h_t|v_{1:T}) = \frac{\alpha(h_t) \beta(h_t)}{p(v_{1:T})}$$


In [8]:
def smoothing(observations,trans_prob,emm_prob,init_prob,t,h_t):
    """Return the smoothed posterior p(h_t | v_{1:T}).

    The posterior is alpha_t(h_t) * beta_t(h_t), normalised by the same
    product summed over every hidden state.

    Args:
        observations: sequence of observed symbols.
        trans_prob: nested dict of transition probabilities.
        emm_prob: nested dict of emission probabilities.
        init_prob: dict of initial-state probabilities; its keys are the
            hidden states.
        t: 1-based time step.
        h_t: hidden state whose posterior is returned.
    """
    joint = {}
    for s in init_prob.keys():
        fwd = forward(observations,trans_prob,emm_prob,init_prob,t,s)
        bwd = backward(observations,trans_prob,emm_prob,init_prob,t,s)
        joint[s] = fwd * bwd  # p(h_t = s, v_{1:T})

    evidence = sum(joint[s] for s in init_prob.keys())
    return joint[h_t] / evidence

In [9]:
smoothing(observations,trans_prob,emm_prob,init_prob,1,'m')

0.13557905771669326

### Pairwise smoothing
\begin{eqnarray}
p(h_t,h_{t+1},v_{1:T}) &=& p(v_{t+2:T},h_{t+1},v_{t+1},h_t,v_{1:t}) \\
&=&p(v_{t+2:T}|v_{t+1},h_{t+1},h_t,v_{1:t})p(v_{t+1}|h_{t+1},h_{t},v_{1:t})p(h_{t+1}|h_t,v_{1:t})p(h_t,v_{1:t}) \\
&=&p(v_{t+2:T}|h_{t+1})p(v_{t+1}|h_{t+1})p(h_{t+1}|h_t)p(h_t,v_{1:t}) \\
&=& \alpha(h_t) p(h_{t+1}|h_t) p(v_{t+1}|h_{t+1}) \beta(h_{t+1})
\end{eqnarray}

In [10]:
def pairwise_smooth(observations,trans_prob,emm_prob,init_prob,t,h_t,h_t_1):
    """Return the pairwise posterior p(h_t, h_{t+1} | v_{1:T}).

    Args:
        observations: sequence of observed symbols.
        trans_prob: nested dict of transition probabilities.
        emm_prob: nested dict of emission probabilities.
        init_prob: dict of initial-state probabilities.
        t: 1-based time step of the first state of the pair; must be smaller
            than len(observations) because v_{t+1} is read.
        h_t: hidden state at step t.
        h_t_1: hidden state at step t+1.
    """
    fwd = forward(observations,trans_prob,emm_prob,init_prob,t,h_t)
    bwd = backward(observations,trans_prob,emm_prob,init_prob,t+1,h_t_1)
    evidence = likelihood(observations,trans_prob,emm_prob,init_prob,len(observations))
    # `t` is 1-based, so the 0-based index `t` addresses v_{t+1}.
    next_symbol = observations[t]
    numerator = fwd * trans_prob[h_t][h_t_1] * emm_prob[h_t_1][next_symbol] * bwd
    return numerator / evidence

In [11]:
# Sanity check: the pairwise posterior at t=2 is accumulated over every
# (current state, next state) pair; the total is printed and is expected to be 1.
# NOTE: `observations` must be the flat toy sequence here; a later cell rebinds
# the same name to a list of sequences.
s = 0
for cur_state in states:
    for next_state in states:
        s += pairwise_smooth(observations,trans_prob,emm_prob,init_prob,2,cur_state,next_state)
print(s)

1.0


### Most Likely Joint State ----- Second Problem

### Brute-Force 
Time complexity: $O(n^T)$,
where $n$ is the number of possible states and $T$ is the length of the sequence. This is intractable for long sequences.

In [12]:
def brute_maxProd(obs,tran_prob,emm_prob,states,init_prob):
    """Find the most probable hidden-state path by exhaustive enumeration.

    Every one of the len(states) ** len(obs) candidate paths is scored, so
    this is only usable as a reference implementation on short sequences.
    Unlike the earlier five-nested-loop version, it works for any sequence
    length of at least one observation.

    Args:
        obs: sequence of observed symbols.
        tran_prob: nested dict, tran_prob[prev][cur] = p(cur | prev).
        emm_prob: nested dict, emm_prob[state][symbol] = p(symbol | state).
        states: iterable of hidden states.
        init_prob: dict, init_prob[state] = p(h_1 = state).

    Returns:
        A tuple ``(probability, path)`` where ``path`` is a tuple of states
        with one entry per observation. If several paths tie, the first one
        in enumeration order is returned.

    Raises:
        ValueError: if ``obs`` or ``states`` is empty.
    """
    # Local import so the cell does not depend on imports made elsewhere.
    from itertools import product

    states = list(states)
    if len(obs) == 0:
        raise ValueError("obs must contain at least one observation")
    if not states:
        raise ValueError("states must not be empty")

    best_prob = None
    best_path = None
    # product(...) enumerates paths in the same order as nested loops over
    # `states`, with the last position varying fastest.
    for path in product(states, repeat=len(obs)):
        prob = init_prob[path[0]] * emm_prob[path[0]][obs[0]]
        for step in range(1, len(obs)):
            prob *= tran_prob[path[step-1]][path[step]] * emm_prob[path[step]][obs[step]]
        # Strict comparison keeps the first path among equally probable ones.
        if best_prob is None or prob > best_prob:
            best_prob, best_path = prob, path

    return best_prob, best_path

In [13]:
import time
start = time.time()
print("brute max prod",brute_maxProd(observations,trans_prob,emm_prob,states,init_prob))
end = time.time()
print("brute max prod time",end-start)

brute max prod (0.0007257600000000002, ('h', 'l', 'n', 'l', 'n'))
brute max prod time 0.001928091049194336


#### Max-Product Algorithm
\begin{eqnarray}
\max_{h_{1:T}}P(I,O|\lambda) &=& \max_{h_{1:T}}\pi(h_1)P(o_1|h_1)\prod_{t=2}^TP(h_{t}|h_{t-1})P(o_t|h_t) \\
 &=& \max_{h_1}\pi(h_1)P(o_1|h_1) ... \max_{h_{T-1}}P(h_{T-1}|h_{T-2})P(o_{T-1}|h_{T-1})\underline{\max_{h_T}P(h_T|h_{T-1})P(o_T|h_T)} \\
 &=& \max_{h_1}\pi(h_1)P(o_1|h_1) ... \underline{\max_{h_{T-1}}P(h_{T-1}|h_{T-2})P(o_{T-1}|h_{T-1})\mu(h_{T-1})} \\
 &=& \max_{h_1}\pi(h_1)P(o_1|h_1) ... \mu(h_{T-2}) \\
\end{eqnarray}
Time complexity: $O(T*n^2)$

In [14]:
def backward_max_prod(observations, tran_prob, emm_prob, states, init_prob):
    """Most probable hidden-state path via max-product, sweeping right to left.

    Messages are passed from the last observation back to the first, so the
    cost is O(T * n^2) for T observations and n states. Sequences with a
    single observation are supported, and the function only uses built-in
    containers.

    Args:
        observations: sequence of observed symbols (at least one).
        tran_prob: nested mapping, tran_prob[prev][cur] = p(cur | prev).
        emm_prob: nested mapping, emm_prob[state][symbol] = p(symbol | state).
        states: iterable of hidden states.
        init_prob: mapping, init_prob[state] = p(h_1 = state).

    Returns:
        A tuple ``(probability, path)`` where ``path`` is a list of states,
        one per observation, in chronological order.

    Raises:
        ValueError: if ``observations`` is empty.
    """
    if len(observations) == 0:
        raise ValueError("observations must contain at least one symbol")
    states = list(states)

    # suffix_max[s]: best probability of everything after the current
    # position, given state s at the current position. Past the last
    # observation there is nothing left to explain, hence 1.
    suffix_max = {state: 1 for state in states}
    # suffix_path[s]: the states (in chronological order) that achieve it.
    suffix_path = {state: [] for state in states}

    for obs in reversed(observations[1:]):
        cur_max = {}
        cur_path = {}
        for pre_state in states:
            # Ties on probability are broken by comparing the state labels,
            # because max() compares the (probability, state) tuples.
            cur_max[pre_state], max_state = max(
                (tran_prob[pre_state][next_state] *
                 emm_prob[next_state][obs] *
                 suffix_max[next_state], next_state)
                for next_state in states)
            cur_path[pre_state] = [max_state] + suffix_path[max_state]
        suffix_max, suffix_path = cur_max, cur_path

    # Fold in the initial distribution and the first emission.
    maxProd, first_state = max(
        (init_prob[s]*emm_prob[s][observations[0]]*suffix_max[s], s)
        for s in states)

    return maxProd, [first_state] + suffix_path[first_state]

In [15]:
from collections import defaultdict
start = time.time()
print("backward max prod",backward_max_prod(observations,trans_prob,emm_prob,states,init_prob))
end = time.time()
print("time cost",end-start)

backward max prod (0.0007257600000000002, ['h', 'l', 'n', 'l', 'n'])
time cost 0.0006361007690429688


In [16]:
# Dynamic programing (DP)
def forward_max_prod(observations,tran_prob,emm_prob,states,init_prob):
    """Most probable hidden-state path via max-product, sweeping left to right.

    Dynamic programming with cost O(T * n^2) for T observations and n
    states. Sequences with a single observation are supported, and the
    function only uses built-in containers.

    Args:
        observations: sequence of observed symbols (at least one).
        tran_prob: nested mapping, tran_prob[prev][cur] = p(cur | prev).
        emm_prob: nested mapping, emm_prob[state][symbol] = p(symbol | state).
        states: iterable of hidden states.
        init_prob: mapping, init_prob[state] = p(h_1 = state).

    Returns:
        A tuple ``(probability, path)`` where ``path`` is a list of states,
        one per observation, in chronological order.

    Raises:
        ValueError: if ``observations`` is empty.
    """
    if len(observations) == 0:
        raise ValueError("observations must contain at least one symbol")
    states = list(states)
    first_obs = observations[0]

    # A single observation has no transition to score.
    if len(observations) == 1:
        max_likeli, max_state = max(
            (init_prob[s] * emm_prob[s][first_obs], s) for s in states)
        return max_likeli, [max_state]

    # pre_max[s]: best probability of the prefix processed so far *including*
    # the transition into state s (the emission from s is added one step later).
    # pre_path[s]: the states that achieve it, in chronological order.
    pre_max = {}
    pre_path = {}
    for next_state in states:
        pre_max[next_state], max_state = max(
            (init_prob[cur_state] *
             emm_prob[cur_state][first_obs] *
             tran_prob[cur_state][next_state], cur_state)
            for cur_state in states)
        pre_path[next_state] = [max_state]

    # Middle observations: every symbol except the first and the last.
    for obs in observations[1:-1]:
        cur_max = {}
        cur_path = {}
        for next_state in states:
            # Ties on probability are broken by comparing the state labels,
            # because max() compares the (probability, state) tuples.
            cur_max[next_state], max_state = max(
                (tran_prob[cur_state][next_state] *
                 emm_prob[cur_state][obs] *
                 pre_max[cur_state], cur_state)
                for cur_state in states)
            cur_path[next_state] = pre_path[max_state] + [max_state]
        pre_max, pre_path = cur_max, cur_path

    # Last state: only its emission is still missing.
    max_likeli, max_state = max(
        (emm_prob[last_state][observations[-1]]*pre_max[last_state],last_state)
        for last_state in states)
    return max_likeli, pre_path[max_state] + [max_state]

In [122]:
start = time.time()
print("forward max prod ", forward_max_prod(observations,trans_prob,emm_prob,states,init_prob))
end = time.time()
print("forward max prod time ",end-start)

forward max prod  (0.0007257600000000001, ['h', 'l', 'n', 'l', 'n'])
forward max prod time  0.0009150505065917969


###  Baum-Welch algorithm ----- Third Problem 
#### Expectation Maximization(EM)
$\lambda = (\pi,A,B)$, the parameters we are trying to learn

Objective:   
$Q(\lambda,\lambda^{'}) = E_{I}[\log P(O,I|\lambda)|O,\lambda^{'}] = \sum_{I}\log P(O,I|\lambda) P(I|O,\lambda^{'})$

Taking gradients with respect to $\pi$, $A$ and $B$ to maximize Q function iteratively, we can get:

1, Initial Probability update:   
$\pi_i = \frac{\sum_{n=1}^{N}P(h_1^{n} = i|O,\lambda)}{N}$    

2, Transition probability update:    
$a_{ij} = \frac{\sum_{n=1}^{N}\sum_{t=1}^{T-1}P(h_t^{n}=i,h_{t+1}^{n}=j|O,\lambda)}{\sum_{n=1}^{N}\sum_{t=1}^{T-1}P(h_t^{n}=i|O,\lambda)}$

3, Emit probability update:   
$b_i(k) = \frac{\sum_{n=1}^{N}\sum_{t=1}^{T}P(h_t^{n} = i|O,\lambda)I(o_t^{n} = k)}{\sum_{n=1}^{N}\sum_{t=1}^TP(h_t^{n}=i|O,\lambda)}$

In [95]:
import copy, math
def em_learn(observations,trans_prob,emm_prob,init_prob,obs_set):
    """Run one Baum-Welch (EM) re-estimation step over a set of sequences.

    Args:
        observations: list of observation sequences.
        trans_prob: current transition table, trans_prob[i][j].
        emm_prob: current emission table, emm_prob[state][symbol].
        init_prob: current initial-state distribution; its keys are the
            hidden states.
        obs_set: iterable of the symbols for which emission probabilities
            are re-estimated.

    Returns:
        A tuple ``(likely, new_likely, new_trans_prob, new_emm_prob,
        new_init_prob)`` where ``likely`` / ``new_likely`` are the summed
        negative log-likelihoods of all sequences under the input and the
        re-estimated parameters. The input tables are not modified; the new
        tables are deep copies with updated entries.

    Note:
        Every posterior is recomputed from scratch through smoothing() and
        pairwise_smooth(), so the step is expensive on long sequences or
        large state sets.
    """
    
    n = len(observations) # number of sequences
    states = init_prob.keys()
    
    likely = 0
    for obs in observations:
        seq_len = len(obs)
        # Accumulate the negative log-likelihood of each sequence; adding logs
        # replaces multiplying the per-sequence likelihoods together.
        likely += -math.log(likelihood(obs,trans_prob,
                                       emm_prob,init_prob,
                                       seq_len))
    
    # Initial distribution: average, over sequences, of the posterior of each
    # state at the first time step.
    new_init_prob = copy.deepcopy(init_prob)
    for state in states:
        first_prob = sum(smoothing(obs,trans_prob,
                                   emm_prob,init_prob,1,state)
                                   for obs in observations)
        new_init_prob[state] = first_prob / n
        
    # Transitions: expected number of i -> j transitions divided by the
    # expected number of visits to i over time steps 1..T-1.
    new_trans_prob = copy.deepcopy(trans_prob)
    for cur_state in states:
        # Denominator: posterior mass of cur_state over steps 1..T-1.
        smo_sum = sum(smoothing(obs,trans_prob,emm_prob,init_prob,t,
                                cur_state) for obs in observations
                                           for t in range(1,len(obs)))
        
        for next_state in states:
            # Numerator: pairwise posterior mass of (cur_state, next_state).
            pair_sum = sum(pairwise_smooth(obs,trans_prob,
                                           emm_prob,init_prob,
                                           t,cur_state,next_state) 
                                           for obs in observations
                                           for t in range(1,len(obs)))

            new_trans_prob[cur_state][next_state] = pair_sum / smo_sum
    
    # Emissions: posterior mass of the state at the steps where symbol o was
    # observed, divided by its posterior mass over all steps 1..T.
    new_emm_prob = copy.deepcopy(emm_prob)
    for state in states:
        smo_sum = sum(smoothing(obs,trans_prob,emm_prob,init_prob,
                        t,state) for obs in observations
                                 for t in range(1,len(obs)+1))
        for o in obs_set:
            # `t` is 1-based, so obs[t-1] is the symbol observed at step t.
            obs_sum = sum(smoothing(obs,trans_prob,emm_prob,init_prob,
                            t,state) if obs[t-1] == o else 0 
                            for obs in observations
                            for t in range(1,len(obs)+1))
    
            new_emm_prob[state][o] = obs_sum / smo_sum
        
    # Negative log-likelihood under the re-estimated parameters.
    new_likely = 0
    for obs in observations:
        seq_len = len(obs)
        new_likely += -math.log(likelihood(obs,new_trans_prob,
                                           new_emm_prob,
                                           new_init_prob,seq_len))
        
    return likely,new_likely,new_trans_prob,new_emm_prob, new_init_prob
    

In [136]:
observations = [[1,2,2,2,1],[2,1,1,2,1],[1,1,1,1,2]]

def hmm_train(threshold,observations,trans_prob,emm_prob,init_prob,obs_set,max_iter=None):
    """Fit HMM parameters by repeating EM steps until the improvement is small.

    At least one EM step is always performed on non-empty data. Iteration
    stops as soon as a step lowers the negative log-likelihood by less than
    ``threshold``, or when ``max_iter`` steps have been run.

    Args:
        threshold: minimum decrease of the negative log-likelihood that a
            step must achieve for training to continue.
        observations: list of observation sequences.
        trans_prob: initial transition table.
        emm_prob: initial emission table.
        init_prob: initial-state distribution.
        obs_set: iterable of observable symbols.
        max_iter: optional cap on the number of EM steps; ``None`` (default)
            means no cap.

    Returns:
        A tuple ``(trans_prob, emm_prob, init_prob)`` holding the parameters
        produced by the last EM step, or the inputs unchanged when
        ``observations`` is empty.
    """
    # No data: nothing to learn, hand the starting parameters back.
    if len(observations) == 0:
        return trans_prob, emm_prob, init_prob

    n = 1
    while max_iter is None or n <= max_iter:
        print('step: {}'.format(n))
        likely,new_likely,trans_prob,emm_prob,init_prob = em_learn(
                                            observations,trans_prob,
                                            emm_prob, init_prob,obs_set)

        # NOTE: both printed values are *negative* log-likelihoods as returned
        # by em_learn, so smaller is better.
        print("previous step log likelihood: {:.5f}".format(likely))
        print("current step log likelihood: {:.5f}".format(new_likely))
        n+=1

        # Convergence is judged on the step that just ran rather than on a
        # sentinel value, so the first step is never skipped. Written as a
        # negated ">=" so that a NaN difference also stops the loop.
        if not (likely - new_likely >= threshold):
            break
    return trans_prob, emm_prob, init_prob

In [137]:
new_trans, new_emis, new_init = hmm_train(1e-6,observations,trans_prob,emm_prob,init_prob,obs_set)

step: 1
previous step log likelihood: 10.45997
current step log likelihood: 10.08853
step: 2
previous step log likelihood: 10.08853
current step log likelihood: 10.07064
step: 3
previous step log likelihood: 10.07064
current step log likelihood: 10.06017
step: 4
previous step log likelihood: 10.06017
current step log likelihood: 10.05320
step: 5
previous step log likelihood: 10.05320
current step log likelihood: 10.04787
step: 6
previous step log likelihood: 10.04787
current step log likelihood: 10.04326
step: 7
previous step log likelihood: 10.04326
current step log likelihood: 10.03884
step: 8
previous step log likelihood: 10.03884
current step log likelihood: 10.03431
step: 9
previous step log likelihood: 10.03431
current step log likelihood: 10.02950
step: 10
previous step log likelihood: 10.02950
current step log likelihood: 10.02426
step: 11
previous step log likelihood: 10.02426
current step log likelihood: 10.01847
step: 12
previous step log likelihood: 10.01847
current step lo

previous step log likelihood: 8.06850
current step log likelihood: 8.06848
step: 102
previous step log likelihood: 8.06848
current step log likelihood: 8.06847
step: 103
previous step log likelihood: 8.06847
current step log likelihood: 8.06846
step: 104
previous step log likelihood: 8.06846
current step log likelihood: 8.06845
step: 105
previous step log likelihood: 8.06845
current step log likelihood: 8.06844
step: 106
previous step log likelihood: 8.06844
current step log likelihood: 8.06843
step: 107
previous step log likelihood: 8.06843
current step log likelihood: 8.06843
step: 108
previous step log likelihood: 8.06843
current step log likelihood: 8.06842
step: 109
previous step log likelihood: 8.06842
current step log likelihood: 8.06842
step: 110
previous step log likelihood: 8.06842
current step log likelihood: 8.06841
step: 111
previous step log likelihood: 8.06841
current step log likelihood: 8.06841
step: 112
previous step log likelihood: 8.06841
current step log likelihood

In [108]:
print('origin init prob:')
print(init_prob)
print('learnt init prob:')
print(new_init)

origin init prob:
{'n': 0.2, 'h': 0.4, 'm': 0.1, 'l': 0.3}
learnt init prob:
{'h': 0.6666666666666666, 'n': 0.0, 'm': 0.0, 'l': 0.3333333333333333}


In [107]:
print('origin trans prob:')
print(trans_prob)
print('learnt trans prob:')
print(new_trans)

origin trans prob:
{'n': {'n': 0.2, 'h': 0.3, 'm': 0.1, 'l': 0.4}, 'h': {'n': 0.1, 'h': 0.5, 'm': 0.2, 'l': 0.2}, 'm': {'n': 0.2, 'h': 0.1, 'm': 0.5, 'l': 0.2}, 'l': {'n': 0.4, 'h': 0.2, 'm': 0.3, 'l': 0.1}}
learnt trans prob:
{'h': {'h': 0.5514183109981636, 'n': 0.055252808194144985, 'm': 0.3933288772937472, 'l': 3.5139441623745054e-09}, 'n': {'h': 7.885720427744287e-46, 'n': 6.259169588067811e-42, 'm': 5.478754060980203e-21, 'l': 1.0}, 'm': {'h': 3.046018480069598e-11, 'n': 0.9999999999695397, 'm': 4.077699007664078e-216, 'l': 3.529339693975087e-217}, 'l': {'h': 1.0, 'n': 6.486007001435362e-44, 'm': 5.116169013536259e-89, 'l': 1.1268166339588191e-85}}


In [110]:
print('origin emit prob:')
print(emm_prob)
print('learnt emit prob:')
print(new_emis)

origin emit prob:
{'n': {1: 0.5, 2: 0.5}, 'h': {1: 0.7, 2: 0.3}, 'm': {1: 0.6, 2: 0.4}, 'l': {1: 0.1, 2: 0.9}}
learnt emit prob:
{'h': {1: 1.0, 2: 1.4426237454521875e-28}, 'n': {1: 0.5119273653076248, 2: 0.48807263469237505}, 'm': {1: 2.4422515783041376e-07, 2: 0.9999997557748421}, 'l': {1: 3.653277514430187e-25, 2: 1.0}}


## Part-of-Speech tagging:
### Supervised:

In [65]:
import json 
# this file put in data directory
with open('penn-data.json') as data_file:    
    penn_data = json.load(data_file)

In [68]:
print('total tagged data: {}'.format(len(penn_data)))

total tagged data: 3914


In [69]:
# Build the training and held-out sets from penn_data.
# Each entry p is used as (sentence string, tag sequence).
words = []      # all training tokens, flattened
tags = []       # tag of every training token, aligned with `words`
seqs = []       # training sentences as token lists
poss = []       # training tag sequences, aligned with `seqs`
test_poss = []  # held-out tag sequences
test_seqs = []  # held-out sentences as token lists
for i,p in enumerate(penn_data):
    
    # Drop every comma character, then tokenize on whitespace.
    sent = p[0].replace(',','').split()
    # Remove '.' characters from the last token.
    # Presumably both steps make the token count match the tag list,
    # which the assert below checks -- TODO confirm.
    sent[-1] = sent[-1].replace('.','')
    assert len(sent) == len(p[1])
    # The first 3800 entries are training data; the remainder is held out.
    if i < 3800:
        seqs.append(sent)
        poss.append(p[1])
        words += sent
        tags += p[1]
    else:
        test_poss.append(p[1])
        test_seqs.append(sent)


In [70]:
# Supervised estimation, step 1: raw counts from the training portion.
vocab = list(set(words))    # distinct training tokens
tag_set = list(set(tags))   # distinct training tags (the hidden states)
from collections import defaultdict
trans_stat = {}                # trans_stat[tag][next_tag] -> count
emit_stat = {}                 # emit_stat[tag][word] -> count
init_stat = defaultdict(int)   # init_stat[tag] -> number of sentences starting with tag
for tag in tag_set:
    # defaultdict(int) makes unseen transitions / emissions read as 0.
    trans_stat[tag] = defaultdict(int)
    emit_stat[tag] = defaultdict(int)
# record emissions 
for word, tag in zip(words,tags):
    emit_stat[tag][word] += 1
# smooth unseen word: every tag gets one pseudo-count for the '<unk>' token
for tag in tag_set:
    emit_stat[tag]['<unk>'] = 1
# record transitions
for pos in poss:
    # record first state
    init_stat[pos[0]] += 1
    # count each adjacent (tag, next tag) pair within the sentence
    for i in range(len(pos)-1):
        trans_stat[pos[i]][pos[i+1]] += 1

In [71]:
print('vocab size : {}'.format(len(vocab)))
print('state size : {}'.format(len(tag_set)))

vocab size : 13160
state size : 41


In [124]:
# Normalizing 
# Supervised estimation, step 2: turn the counts into probabilities.
# The tables are normalised in place: each entry is divided by its row total.
for key in trans_stat.keys():
    tag_tol = sum(trans_stat[key].values())    # total outgoing transitions of this tag
    word_tol = sum(emit_stat[key].values())    # total emissions of this tag (incl. '<unk>')
    # A tag with no recorded outgoing transition has no keys, so this loop
    # body (and its division) is simply not executed for it.
    for sub_key in trans_stat[key].keys():
        trans_stat[key][sub_key] /= tag_tol
    for sub_key in emit_stat[key].keys():
        emit_stat[key][sub_key] /= word_tol
        
# Same normalisation for the initial-tag counts.
init_tol = sum(init_stat.values())
for key in init_stat.keys():
    init_stat[key] /= init_tol

In [73]:
def replace_unseen(sents,vocab):
    """Replace every out-of-vocabulary word with the '<unk>' token.

    Args:
        sents: iterable of sentences, each an iterable of words.
        vocab: collection of known words.

    Returns:
        A new list of lists with the same shape as ``sents`` in which each
        word that is not in ``vocab`` is replaced by ``'<unk>'``. The inputs
        are not modified.
    """
    # Membership in a list is a linear scan per word; a set answers the same
    # question in constant time and is built only once.
    if isinstance(vocab, (set, frozenset, dict)):
        known = vocab
    else:
        try:
            known = set(vocab)
        except TypeError:
            # Unhashable entries: keep the original container and its lookup.
            known = vocab

    return [[word if word in known else '<unk>' for word in sent]
            for sent in sents]

In [125]:
# Inference 
# Inference: map out-of-vocabulary test words to '<unk>', then decode each
# held-out sentence with the max-product decoder and keep only the predicted
# tag path (element [1] of its return value).
# NOTE: `test_seqs` is rebound to the '<unk>'-substituted sentences here.
test_seqs = replace_unseen(test_seqs,vocab)
pred_tags = []
for test in test_seqs:
    pred_tags.append(backward_max_prod(test,trans_stat,emit_stat,tag_set,init_stat)[1])

In [75]:
def evaluate(preds, labels):
    """Return the token-level accuracy of predicted tag sequences.

    Args:
        preds: list of predicted tag sequences.
        labels: list of reference tag sequences, aligned with ``preds``.

    Returns:
        Number of positions where prediction and reference agree, divided by
        the total number of predicted tags.

    Raises:
        AssertionError: if a predicted sequence and its reference differ in
            length.
    """
    total = sum(len(pred_seq) for pred_seq in preds)
    hits = 0
    for pred_seq, gold_seq in zip(preds, labels):
        assert len(pred_seq) == len(gold_seq)
        for guessed, gold in zip(pred_seq, gold_seq):
            if guessed == gold:
                hits += 1
    return hits / total

In [76]:
print('Accuracy : {}'.format(evaluate(pred_tags,test_poss)))

Accuracy : 0.8768057784911717


In [79]:
for i,(obs,pos) in enumerate(zip(test_seqs[:10],pred_tags[:10])):
    print('seq: {}'.format(i))
    print('pos: {}'.format(pos))
    print('obs: {}'.format(obs))

seq: 0
pos: ['RBS', 'JJ', 'NN', 'VBZ', 'VBN', 'IN', 'DT', 'JJ', 'NN']
obs: ['<unk>', 'actual', 'profit', 'is', 'compared', 'with', 'the', '<unk>', 'estimate']
seq: 1
pos: ['NNP', 'NNP', 'NNP', 'VBD', 'PRP', 'VBD', 'PRP$', 'JJS', 'CD', 'WP$', 'NN', 'IN', 'RB', 'VBN', 'IN', 'NNP', 'NNP', 'DT', 'NNP', 'NN', 'VBG', 'NN']
obs: ['First', 'Chicago', 'Corp.', 'said', 'it', 'completed', 'its', '<unk>', 'million', '<unk>', 'acquisition', 'of', 'closely', 'held', '<unk>', 'Financial', 'Corp.', 'another', 'Chicago', 'bank', 'holding', 'company']
seq: 2
pos: ['DT', 'NN', 'FW', '-RRB-', 'IN', 'DT', 'NNP', 'NNP', 'VBZ', 'VBG', 'JJ', 'NNS', 'IN', 'DT', 'NNP', 'NN', "''"]
obs: ['The', 'record', '<unk>', '<unk>', 'by', 'the', 'Soviet', 'Union', 'is', 'causing', 'serious', '<unk>', 'in', 'the', 'U.S.', 'grain', '<unk>']
seq: 3
pos: ['DT', 'JJ', 'NNS', 'VBP', 'RB', 'JJ', 'IN', 'PRP', 'VBP', 'VBG', 'TO', 'VB', 'RB', 'JJR', 'NNS', 'CC', 'VBG', 'TO', 'VB', 'DT', 'RB', 'JJR', 'NN', 'NN', 'TO', 'VB', 'IN', 'JJ

### Unsupervised:

In [98]:
vocab.append('<unk>')
#new_trans, new_emit, new_init = hmm_train(1e-6,seqs[:5],trans_stat,emit_stat,init_stat,vocab)