In [10]:
import numpy as np
import tqdm as tqdm

In [11]:
# Toy training corpus: each sentence is already segmented, with words
# (and punctuation) separated by single spaces.
corpus = [
    "今天 天气 真 不错 。",
    "麻辣肥牛 好吃 ！",
    "我 喜欢 吃 好吃 的 ！",
]

# Module-level accumulator for the per-sentence tag strings;
# corpus2BMSE appends to it in place.
corpusBMSE = list()

In [12]:
def BMSEconvert(token):
    """Return the BMSE tag string for one segmented word.

    Each character of ``token`` gets exactly one tag:

    * a one-character word is tagged ``"S"``;
    * a longer word is tagged ``"B"``, then ``"M"`` for every inner
      character, then ``"E"``.

    Args:
        token: A single word (any sized sequence of characters).

    Returns:
        A string of tags with the same length as ``token``. An empty token
        yields an empty string; without that guard the general formula
        would produce ``"BE"`` (two tags for zero characters).
    """
    length = len(token)
    if length == 0:
        # "M" * (0 - 2) is "", so falling through would return "BE".
        return ""
    if length == 1:
        return "S"
    return "B" + "M" * (length - 2) + "E"

def corpus2BMSE(corpus):
    """Append one BMSE tag string per sentence to the module-level ``corpusBMSE``.

    Each sentence is split on single spaces and every word is converted with
    ``BMSEconvert``. The per-word tags are written out separated by a space,
    with a trailing space after the last word (e.g. ``"BE BE S "``).

    Empty tokens, which ``split(" ")`` produces for leading, trailing or
    consecutive spaces, are skipped so that the number of tags always equals
    the number of characters in the sentence.

    Note: the function only appends; calling it again with the same corpus
    adds the same tag strings a second time.

    Args:
        corpus: Iterable of space-segmented sentences.
    """
    for sentence in corpus:
        tagged_words = [
            BMSEconvert(token) + " "
            for token in sentence.split(" ")
            if token  # drop empty pieces left by stray spaces
        ]
        corpusBMSE.append("".join(tagged_words))

In [13]:
# Tag every sentence of the toy corpus. corpus2BMSE appends to the
# module-level corpusBMSE list, so re-running this cell without first
# resetting that list adds the same tag strings again.
corpus2BMSE(corpus)    
# Bare last expression: display the accumulated tag strings.
corpusBMSE

['BE BE S BE S ', 'BMME BE S ', 'S BE S BE S S ']

In [36]:
class HMM:
    """Count-based estimator of HMM parameters for BMSE word segmentation.

    Training fills three tables from a tagged corpus:

    * ``init_matrix``: length-4 array, distribution of the first tag of
      each sentence;
    * ``transfer_matrix``: 4x4 array, ``[i, j]`` is the probability of tag
      ``j`` following tag ``i``;
    * ``emit_matrix``: ``{tag: {"total": count, char: probability, ...}}``.

    Array rows/columns follow ``states_to_index`` (B=0, M=1, E=2, S=3).
    """

    def __init__(self, corpusBMSE=None, corpus=None, verbose=True):
        """Store the training data and create empty count tables.

        Args:
            corpusBMSE: List of tag strings, one per sentence, with the tags
                of each word separated by spaces. When omitted, the
                module-level ``corpusBMSE`` is looked up at call time.
            corpus: List of space-segmented sentences parallel to
                ``corpusBMSE``. When omitted, the module-level ``corpus``
                is looked up at call time.
            verbose: When true (the default), ``build_transfer_matrix``
                prints each joined tag sequence it processes.
        """
        # Resolve the defaults on every call instead of at class-definition
        # time, so HMM() sees the current module-level lists even if they
        # were rebound after this class was defined.
        module_namespace = globals()
        if corpusBMSE is None:
            corpusBMSE = module_namespace.get("corpusBMSE", [])
        if corpus is None:
            corpus = module_namespace.get("corpus", [])

        self.corpusBMSE = corpusBMSE
        self.corpus = corpus
        self.verbose = verbose
        self.states_to_index = {"B": 0, "M": 1, "E": 2, "S": 3}
        self._reset_counts()

    def _reset_counts(self):
        """Replace all three parameter tables with fresh zeroed ones."""
        self.init_matrix = np.zeros(4)
        self.transfer_matrix = np.zeros((4, 4))
        self.emit_matrix = {"B": {"total": 0}, "M": {"total": 0}, "S": {"total": 0}, "E": {"total": 0}}

    def build_init_matrix(self, state):
        """Count the first tag of ``state`` as a sentence-initial tag.

        Args:
            state: Tag string of a sentence (or of its first word); only
                its first character is used. An empty string is ignored.
        """
        if not state:
            return
        self.init_matrix[self.states_to_index[state[0]]] += 1

    def build_transfer_matrix(self, states):
        """Count every adjacent tag pair in one sentence.

        Args:
            states: Iterable of per-word tag strings; they are concatenated
                into a single tag sequence before counting.
        """
        states = "".join(states)
        if self.verbose:
            print(states)
        # Pair each tag with its successor; the last tag has none.
        for head, tail in zip(states, states[1:]):
            row = self.states_to_index[head]
            col = self.states_to_index[tail]
            self.transfer_matrix[row, col] += 1

    def build_emit_matrix(self, states, sentence):
        """Count which character was emitted under which tag.

        Args:
            states: Iterable of per-word tag strings for the sentence.
            sentence: Iterable of the sentence's words.

        Raises:
            IndexError: If the sentence has more characters than tags.
        """
        states = "".join(states)
        sentence = "".join(sentence)
        for position, char in enumerate(sentence):
            bucket = self.emit_matrix[states[position]]
            bucket["total"] += 1
            bucket[char] = bucket.get(char, 0) + 1

    def normalize(self):
        """Turn the accumulated counts into probabilities.

        A tag that was never seen as the source of a transition keeps an
        all-zero row, and ``init_matrix`` stays all-zero when no sentence
        was counted, instead of becoming NaN through a 0/0 division. The
        ``"total"`` entries of ``emit_matrix`` are left as raw counts.

        Intended to run once per batch of counts: a second call would
        divide the emission probabilities by ``"total"`` again.
        """
        init_total = self.init_matrix.sum()
        if init_total > 0:
            self.init_matrix = self.init_matrix / init_total

        row_totals = self.transfer_matrix.sum(axis=1, keepdims=True)
        # Divide only rows with a non-zero total; other rows keep the zeros
        # supplied through ``out``.
        self.transfer_matrix = np.divide(
            self.transfer_matrix,
            row_totals,
            out=np.zeros_like(self.transfer_matrix),
            where=row_totals != 0,
        )

        for counts in self.emit_matrix.values():
            total = counts["total"]
            for char in counts:
                if char != "total":
                    counts[char] /= total

    def train(self):
        """Estimate all parameters from ``self.corpusBMSE`` and ``self.corpus``.

        The tables are reset first, so calling ``train`` again recomputes
        the same result rather than adding counts on top of probabilities.
        Sentences whose tag string contains no tags are skipped.
        """
        self._reset_counts()
        for states, sentence in zip(self.corpusBMSE, self.corpus):
            # split(" ") leaves "" for trailing/duplicate spaces; drop them.
            tag_words = [piece for piece in states.split(" ") if piece]
            words = [piece for piece in sentence.split(" ") if piece]
            if not tag_words:
                continue
            self.build_init_matrix(tag_words[0])
            self.build_transfer_matrix(tag_words)
            self.build_emit_matrix(tag_words, words)

        self.normalize()


In [37]:
# Fit the model on the module-level corpus / corpusBMSE (HMM's defaults).
hmm = HMM()
hmm.train()

# Initial-tag distribution, indexed per HMM.states_to_index (B, M, E, S).
print(hmm.init_matrix)

# Transition probabilities, rounded to three decimals for readability.
rounded_transfer_matrix = hmm.transfer_matrix.round(3)
print(rounded_transfer_matrix)

# Per-tag emission table (includes each tag's raw "total" count).
print(hmm.emit_matrix)

BEBESBES
BMMEBES
SBESBESS
[0.66666667 0.         0.         0.33333333]
[[0.    0.143 0.857 0.   ]
 [0.    0.5   0.5   0.   ]
 [0.286 0.    0.    0.714]
 [0.75  0.    0.    0.25 ]]
{'B': {'total': 7, '今': 0.14285714285714285, '天': 0.14285714285714285, '不': 0.14285714285714285, '麻': 0.14285714285714285, '好': 0.2857142857142857, '喜': 0.14285714285714285}, 'M': {'total': 2, '辣': 0.5, '肥': 0.5}, 'S': {'total': 7, '真': 0.14285714285714285, '。': 0.14285714285714285, '！': 0.2857142857142857, '我': 0.14285714285714285, '吃': 0.14285714285714285, '的': 0.14285714285714285}, 'E': {'total': 7, '天': 0.14285714285714285, '气': 0.14285714285714285, '错': 0.14285714285714285, '牛': 0.14285714285714285, '吃': 0.2857142857142857, '欢': 0.14285714285714285}}
