In [1]:
# -*- coding: utf-8 -*-
import json
from bigram import Bigram

class HMM:
    """Pinyin-to-character converter based on an HMM decoded with Viterbi.

    The model tables are read from ``model_params/<name>.json``:

    * ``init_prob``     -- ``{state: score}``
    * ``emiss_prob``    -- ``{state: {pinyin: score}}``
    * ``trans_prob``    -- ``{post_state: {pre_state: score}}``; the pseudo
      states ``'BOS'`` and ``'EOS'`` mark sentence start and end
    * ``pinyin_states`` -- ``{pinyin: [candidate states]}``

    Scores are combined by addition, so the tables presumably hold
    log-probabilities (TODO confirm against the training code).
    """

    # Score used whenever a probability is missing from a table (smoothing
    # floor). Kept as a class attribute so it exists before the first call
    # to ``trans`` and is not re-assigned on every call.
    min_f = -3.14e+100

    def __init__(self):
        """Load the model tables, then create the pinyin segmenter."""
        self.load_param()
        self.bigram = Bigram()

    def load_param(self):
        """Read the four model tables into instance attributes."""
        self.init_prob = self.read('init_prob')
        self.emiss_prob = self.read('emiss_prob')
        self.trans_prob = self.read('trans_prob')
        self.pinyin_states = self.read('pinyin_states')

    def read(self, filename):
        """Return the parsed content of ``model_params/<filename>.json``.

        The file is decoded explicitly as UTF-8 so that loading does not
        depend on the platform's default text encoding.
        """
        with open('model_params/' + filename + '.json', 'r',
                  encoding='utf-8') as f:
            return json.load(f)

    # Viterbi process
    def trans(self, strs):
        """Convert an unsegmented pinyin string to the best-scoring text.

        ``strs`` is first split into a pinyin sequence by
        ``self.bigram.dp_search``; the best state path over that sequence is
        then found with Viterbi search and its states are concatenated.

        Returns an empty string when segmentation yields no pinyin.
        """
        # Segmentation of the raw input into pinyin tokens.
        seq = self.bigram.dp_search(strs)
        length = len(seq)
        if length == 0:
            # Nothing to decode; the original code failed on seq[0] here.
            return ''

        floor = self.min_f

        # viterbi[i][state] = (best score of a path ending in state at
        # position i, previous state on that path; -1 at position 0).
        viterbi = [{} for _ in range(length)]

        # Initialise: start score + emission + transition from 'BOS'.
        first = seq[0]
        for s in self.pinyin_states.get(first):
            viterbi[0][s] = (
                self.init_prob.get(s, floor)
                + self.emiss_prob.get(s, {}).get(first, floor)
                + self.trans_prob.get(s, {}).get('BOS', floor),
                -1,
            )

        # DP step.
        # trans_prob = {post1: {pre1: p1, pre2: p2}, post2: {pre1: p1, ...}}
        for i in range(length - 1):
            pinyin = seq[i + 1]
            previous = viterbi[i]
            candidates = self.pinyin_states.get(seq[i])
            for s in self.pinyin_states.get(pinyin):
                # Both lookups depend only on s, so do them once per state
                # instead of once per predecessor.
                emission = self.emiss_prob.get(s, {}).get(pinyin, floor)
                incoming = self.trans_prob.get(s, {})
                # Tuples compare by score first, then by predecessor.
                viterbi[i + 1][s] = max(
                    (previous[pre][0] + emission + incoming.get(pre, floor),
                     pre)
                    for pre in candidates
                )

        # Termination: add the transition into 'EOS' to each final state.
        last = viterbi[length - 1]
        to_eos = self.trans_prob.get('EOS', {})
        for s in self.pinyin_states.get(seq[-1]):
            score, pre = last[s]
            last[s] = (score + to_eos.get(s, floor), pre)

        # Backtrace from the best final state through the stored predecessors.
        words = [None] * length
        words[-1] = max(last, key=last.get)
        for n in range(length - 2, -1, -1):
            words[n] = viterbi[n + 1][words[n + 1]][1]

        return ''.join(words)

# The two triple-quoted blocks below are bare string literals: they are
# evaluated and discarded, so they act only as scratch notes. They appear to
# hold sample pinyin inputs kept for manual testing.
'''
laisongtulomalasongganxietigongodadadetexieo
zheweirenxionghaoshuanglo
zhengaaaaaaaaaaaaaagandidechixianwoxiangzhidaobiangetongjudaduolojurangandougoudandafenxiangxianzaidehouzhendeshiwuyaokejiu
'''
# The first line of the next note reads, in English, "the HMM only considers
# the previous character"; the remaining lines are two more sample inputs.
'''
HMM只考虑上一个字
jiaohuaqiao
xidazhijie
'''
# Build the converter. HMM.__init__ loads the model tables and creates the
# Bigram segmenter, so this line performs file I/O.
hmm = HMM()
# Convert one unsegmented pinyin string and print the resulting text.
print(hmm.trans('zhongwenxinxichuli'))

中文信息处理


In [2]:
print(hmm.trans('haerbingongyedaxuejisuanjikexueyujishuxueyuanheruanjianxueyuan'))

哈尔滨工业大学计算机科学与技术学院和软件学院


In [3]:
print(hmm.trans('hagongdadierjiebingdiaodasaijiangzaishieryueershisirijuxing'))

哈工大第二届冰雕大赛将在十二月二十四日举行


In [4]:
print(hmm.trans('zhongmeimaoyizhanjiangyingxiangzhongmeiguanxihequanshijiedejingjigeju'))

中美贸易战将影响中美关系和全世界的经济格局


In [6]:
print(hmm.trans('maerkefumoxing'))

马尔科夫模型


In [7]:
print(hmm.trans('caoxueqin'))

曹雪芹


In [8]:
print(hmm.trans('baiduhealidoushihulianwanggongsi'))

百度和阿里都是互联网公司


In [9]:
print(hmm.trans('jinzhaoyoujiujinzhaozui'))

今朝有酒今朝醉


In [10]:
print(hmm.trans('gongyuansanliuyisinian'))

公元三六一四年


In [11]:
print(hmm.trans('laiwushijinsuidajiudian'))

莱芜市金穗大酒店


In [15]:
print(hmm.trans('yingxionglianmenghewangzherongyaofengmiquanqiu'))

英雄联盟和王者荣耀风靡全球


In [13]:
print(hmm.trans('bazhonghuayouxiuchuantongwenhuazuoweiziyangdangdaizhongguorenjingshendeyuanquanhewotu'))

把中华优秀传统文化作为滋养当代中国人精神的源泉和沃土


In [14]:
print(hmm.trans('yixijinpingtongzhiweihexindedangzhongyang'))

以习近平同志为核心的党中央


In [16]:
print(hmm.trans('lingdianwushisanfenliumiao'))

零点五十三分六秒


In [17]:
print(hmm.trans('zhongzhaimiaozuyizubuyizuxiang'))

中寨苗族彝族布依族乡


In [18]:
print(hmm.trans('cankujiaodoushidelifu'))

残酷角斗士的利斧


In [19]:
print(hmm.trans('chimeiwangliang'))

魑魅魍魉


In [20]:
print(hmm.trans('xianjiaotongdaxue'))

县交通大学


In [21]:
print(hmm.trans('yinmaerkefumoxing'))

饮马尔科夫模型


In [22]:
print(hmm.trans('baiduhealidoushiyoumindehulianwanggongsi'))

百度和阿里都是由民的互联网公司


In [23]:
print(hmm.trans('baidualitengxungugeweiruanyamaxun'))

白杜阿里腾讯股歌微软亚马逊


In [24]:
print(hmm.trans('shanggeshijisanshiniandaidejingrongweijizaojiulexinxijishudepengbofazhan'))

上个世纪三十年代的经融危及早就了信息技术的蓬勃发展


In [25]:
print(hmm.trans('zhifubaosaohongbaozhuanshangjin'))

支付宝扫红包赚上进


In [26]:
print(hmm.trans('xiangquanguoguangdakejigongzuozhezhiyichonggaojingyihechengzhidewenhou'))

向全国广大科技工作者之以崇高精益和诚挚的问候


In [27]:
print(hmm.trans('guanxianlinsuanhuanyuanmei'))

管线磷酸还原酶


In [28]:
print(hmm.trans('xidazhijiejiushierhao'))

西大直接就是二号


In [29]:
print(hmm.trans('daxingrengongshenjinwangluogaosuluojidianlu'))

大型人工神进网络高素逻辑电路


In [30]:
print(hmm.trans('cankujiaodoushidelifupixiangleta'))

残酷角斗士的理府批响了他


In [31]:
print(hmm.trans('aozhoufupiaohuyu'))

澳洲腹瓢壶玉


In [32]:
print(hmm.trans('pinyinhanzizhuanhuanceshi'))

拼音汉字转换策时
