# Imports

In [1]:
import pickle as pk
import numpy as np
from collections import Counter
import itertools

# First order HMM

In [2]:
with open("train10.pkl","rb") as file:
    train_10 = pk.load(file)

with open("test10.pkl","rb") as file:
    test_10 = pk.load(file)
    
with open("train20.pkl","rb") as file:
    train_20 = pk.load(file)

with open("test20.pkl","rb") as file:
    test_20 = pk.load(file)

In [3]:
print("len(train_10) = {}\nlen(test_10)  = {}\nlen(train_20) = {}\nlen(test_20)  = {}"\
      .format(len(train_10),len(test_10),len(train_20),len(test_20)))

len(train_10) = 29057
len(test_10)  = 1501
len(train_20) = 27184
len(test_20)  = 3374


In [4]:
class HMM:
    
    def __init__(self, train):
        
        flat = list(itertools.chain.from_iterable(train))
        
        self.state_list = np.unique([el[1] for el in flat])
        self.N = len(self.state_list) # state_list length, just to verify that all alphabet appear
        
        self.obs_list = np.unique([el[0] for el in flat])
        self.M = len(self.obs_list) # obs_list length, just to verify that all alphabet appear
        
        self.make_index(self.state_list, self.obs_list)
        
        self.init_state_proba = self.init_state_proba(train)
        
        self.trans_proba = self.trans_proba(train)
        
        self.emission_proba = self.emission_proba(train)
        
    def make_index(self, state_list, obs_list):
        
        self.state_index = {}
        self.obs_index = {}
        self.index_obs = {}
        
        for i, el in enumerate(state_list):
            self.state_index[el] = i
            
        for i, el in enumerate(obs_list):
            self.obs_index[el] = i
            self.index_obs[i] = el
            
    def init_state_proba(self, train):
        
        pi = np.zeros(self.N)
        
        for el in train:
            
            pi[self.state_index[el[0][1]]] += 1
            
        return pi/np.sum(pi)
    
    def trans_proba(self, train):
        
        A = np.zeros((self.N,self.N))
        
        for el in train:
            
            for i in range(len(el)-1):
                
                A[self.state_index[el[i][1]]][self.state_index[el[i+1][1]]] += 1
                    
        for i in range(A.shape[0]):
            A[i] /= np.sum(A[i])
                
        return A
    
    def emission_proba(self, train):
        
        B = np.zeros((self.N,self.M))
        
        flat = list(itertools.chain.from_iterable(train))
        
        count = Counter(flat)
        
        for x in self.state_list:
            for y in self.obs_list:
                B[self.state_index[x]][self.obs_index[y]] = count[(x,y)]
                
        for i in range(self.N):
            B[i] /= np.sum(B[i])
            
        return B

In [86]:
def Viterbi(hmm,obs):
    
    N = hmm.N
    
    A = hmm.trans_proba
    
    B = hmm.emission_proba
    
    pi = hmm.init_state_proba
    
    state_index = hmm.state_index
    
    obs_index = hmm.obs_index
    
    index_obs = hmm.index_obs
    
    most_prob_seq = -np.ones((N,len(obs)))
    
    most_prob_seq[:,0] = np.arange(N)
    
    mu = B[:,obs_index[obs[0]]]*pi
    
    for k,i in enumerate(obs[1:]):
        
        tmp = np.zeros(N)
        
        for j in hmm.state_list:
            
            vect = B[state_index[j],obs_index[i]]*A[:,state_index[j]]*mu
            
            tmp[state_index[j]] = np.max(vect)
            
            most_prob_seq[state_index[j]][k+1] = np.argmax(vect)
            
        mu = tmp
        
    seq = []
    
    seq.append(index_obs[np.argmax(mu)])
    
    for i in range(len(obs)-1,0,-1):
    
        seq.append(index_obs[most_prob_seq[obs_index[seq[len(obs) - 1 - i]],i]])
        
        
    return seq[::-1]

In [108]:
def eval(true,test,pred):
    
    pos = 0
    
    neg = 0
    
    l1 = list(itertools.chain.from_iterable(true))
    
    l2 = list(itertools.chain.from_iterable(test))
    
    l3 = list(itertools.chain.from_iterable(pred))
    
    for i in range(len(l1)):
        
        if(l1[i] != l2[i] and l1[i] == l3[i]):
            pos +=1
            
        if(l1[i] == l2[i] and l1[i] != l3[i]):
            neg +=1
            
    return pos,neg

In [109]:
def accuracy(true,pred):
    
    l1 = np.array(list(itertools.chain.from_iterable(true)))
        
    l2 = np.array(list(itertools.chain.from_iterable(pred)))
    
    return (l1 == l2).sum()/l1.shape[0]

## Testing with 10% error data

In [132]:
hmm = HMM(train_10)
test = []
true = []
for l in test_10:
    test.append([el[0] for el in l])
    true.append([el[1] for el in l])
    
pred = []

for el in test:
    pred.append(Viterbi(hmm,el))

In [133]:
pos,neg = eval(true,test,pred)

print("First order HMM made {} mistakes and corrected {} mistakes".format(neg,pos))

First order HMM made 79 mistakes and corrected 247 mistakes


In [134]:
print("The overall accuracy is about {0:.2f}%".format(accuracy(true,pred)*100))

The overall accuracy is about 92.12%


In [135]:
print("Without doing anything, the accuracy is about {0:.2f}%".format(accuracy(true,test)*100))

Without doing anything, the accuracy is about 89.82%


## Testing with 20% error data

In [136]:
hmm = HMM(train_20)
test = []
true = []
for l in test_20:
    test.append([el[0] for el in l])
    true.append([el[1] for el in l])
    
pred = []

for el in test:
    pred.append(Viterbi(hmm,el))

In [137]:
pos,neg = eval(true,test,pred)

print("First order HMM made {} mistakes and corrected {} mistakes".format(neg,pos))

First order HMM made 363 mistakes and corrected 1080 mistakes


In [138]:
print("The overall accuracy is about {0:.2f}%".format(accuracy(true,pred)*100))
print("Without doing anything, the accuracy is about {0:.2f}%".format(accuracy(true,test)*100))

The overall accuracy is about 84.89%
Without doing anything, the accuracy is about 80.59%
