In [33]:
import re
import os
import sys

def loadModel(fileName):
    """Load a model resource, choosing the parser from the file extension.

    ``.py`` files are read whole and evaluated as a single Python
    expression; the value of that expression is returned. ``.txt`` files are
    read line by line: each line is stripped of surrounding whitespace,
    decoded as UTF-8 and collected into a set.

    Args:
        fileName: Path of the file to load. Must end in ``.py`` or ``.txt``.

    Returns:
        The evaluated object for a ``.py`` file, or a ``set`` of ``str`` for
        a ``.txt`` file.

    Raises:
        ValueError: If ``fileName`` has neither a ``.py`` nor a ``.txt``
            extension (previously this case silently returned ``None``).
    """
    with open(fileName, "rb") as f:
        if fileName.endswith(".py"):
            # SECURITY NOTE(review): eval() executes whatever expression the
            # file contains. Only point this at trusted model files; if the
            # files are pure literals, ast.literal_eval would be a safer
            # parser -- confirm against the actual file contents first.
            return eval(f.read())
        if fileName.endswith(".txt"):
            # Strip on the raw bytes first, then decode, so each entry is a
            # clean str without the trailing newline.
            return {line.strip().decode('utf-8') for line in f}
    raise ValueError(
        "unsupported model file extension (expected .py or .txt): %r"
        % (fileName,)
    )

# Load the three HMM tables (dicts: start, transition and emission
# probabilities) in the same order as before, then the character-pair table.
prob_start, prob_trans, prob_emit = [
    loadModel(model_path)
    for model_path in (
        "./fileNeeded/prob_start.py",
        "./fileNeeded/prob_trans.py",
        "./fileNeeded/prob_emit.py",
    )
]
# Set of strings; looked up with two-character slices by __raw_seg.
near_char_tab = loadModel("./fileNeeded/near_char_tab.txt")
# Sanity check: number of entries in the emission table for state 'B'.
print(len(prob_emit['B']))


def __raw_seg(sentence):
    """Yield coarse pieces of ``sentence`` split at unrelated character pairs.

    Each adjacent two-character slice is looked up in the module-level
    ``near_char_tab``. Wherever the slice is NOT in the table, the sentence
    is cut between those two characters. The remaining tail is always
    yielded last (an empty string when ``sentence`` is empty).
    """
    piece_start = 0
    for idx in range(len(sentence) - 1):
        pair = sentence[idx:idx + 2]
        if pair in near_char_tab:
            # The two characters belong together: keep extending the piece.
            continue
        yield sentence[piece_start:idx + 1]
        piece_start = idx + 1
    yield sentence[piece_start:]



def viterbi(obs, states, start_p, trans_p, emit_p):
    """Find the most probable hidden-state sequence for ``obs``.

    Probabilities are plain values that are multiplied together (not log
    values). Ties between equally probable candidates are broken by ``max``
    on ``(probability, state)`` tuples, i.e. by the greater state name.

    Args:
        obs: Non-empty, indexable sequence of observations (``obs[0]`` is
            read unconditionally).
        states: Re-iterable collection of state names.
        start_p: Mapping state -> start probability.
        trans_p: Mapping state -> {next state -> transition probability};
            missing transitions count as 0.
        emit_p: Mapping state -> {observation -> emission probability};
            missing observations count as 0. Must contain the keys ``'M'``
            and ``'S'``, which are consulted for the final observation.

    Returns:
        A tuple ``(prob, path)`` where ``path`` is the list of states, one
        per observation, and ``prob`` is the probability of that path.
    """
    V = [{}]  # V[t][y]: best probability of any path ending in state y at t
    path = {}  # path[y]: the state sequence achieving V[t][y]
    for y in states:  # initialisation with the first observation
        V[0][y] = start_p[y] * emit_p[y].get(obs[0], 0)
        path[y] = [y]
    for t in range(1, len(obs)):
        prev = V[t - 1]
        V.append({})
        newpath = {}
        # Only predecessors with non-zero probability are considered. If
        # every predecessor is at zero (e.g. an observation unknown to all
        # states, or underflow), fall back to all states so that max() is
        # never called on an empty sequence, which raised ValueError before.
        candidates = [y0 for y0 in states if prev[y0] > 0] or states
        for y in states:
            emit = emit_p[y].get(obs[t], 0)
            prob, state = max(
                (prev[y0] * trans_p[y0].get(y, 0) * emit, y0)
                for y0 in candidates
            )
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath
    last = V[len(obs) - 1]
    # When the final observation is emitted by 'M' with higher probability
    # than by 'S', the final state is chosen among 'E' and 'M' only;
    # otherwise every state is eligible.
    if emit_p['M'].get(obs[-1], 0) > emit_p['S'].get(obs[-1], 0):
        final_states = ('E', 'M')
    else:
        final_states = states
    prob, state = max((last[y], y) for y in final_states)
    return (prob, path[state])


def __cut(sentence):
    """Yield the words of ``sentence`` according to its Viterbi tag sequence.

    ``viterbi`` is run over ``sentence`` with the states ``'B'``, ``'M'``,
    ``'E'``, ``'S'`` and the module-level ``prob_start`` / ``prob_trans`` /
    ``prob_emit`` tables. A word ends at every ``'E'`` or ``'S'`` tag.

    The yielded pieces always concatenate back to ``sentence``. For a
    well-formed tag sequence the output is the same as a plain B..E / S
    scan; when the sequence is not well-formed (for example a ``'B'`` that
    is followed by ``'S'`` or by another ``'B'``), the characters left
    pending are emitted as their own piece instead of being dropped or
    emitted twice.
    """
    # Decode the hidden tag sequence with the Viterbi algorithm.
    _, pos_list = viterbi(
        sentence, ('B', 'M', 'E', 'S'), prob_start, prob_trans, prob_emit
    )
    start = 0  # index of the first character not yet yielded
    for i, pos in enumerate(pos_list):
        if pos in ('B', 'S') and start < i:
            # A new word begins here but earlier characters were never
            # closed by an 'E': flush them so no input is lost.
            yield sentence[start:i]
            start = i
        if pos == 'E' or pos == 'S':
            # 'E' closes a multi-character word, 'S' is a one-character word.
            yield sentence[start:i + 1]
            start = i + 1
    if start < len(sentence):
        # Whatever is left over is returned as one final piece.
        yield sentence[start:]

def cut(sentence, find_new_word=False):
    """Yield the segments of ``sentence`` one by one.

    The sentence is split into runs of characters in U+4E00..U+9FA5 and the
    text between those runs.

    * Each matching run is passed through ``__cut``. With
      ``find_new_word=False`` the run is first pre-split by ``__raw_seg``
      and each piece is cut separately; with ``find_new_word=True`` the
      whole run is handed to ``__cut`` in one piece.
    * Any other text is split on every character that is not an ASCII
      letter, digit, ``+``, ``#`` or newline; non-empty parts are yielded
      unchanged.

    Args:
        sentence: Text to segment.
        find_new_word: Skip the ``__raw_seg`` pre-split when true.
    """
    han_pattern = re.compile("([\u4E00-\u9FA5]+)")
    skip_pattern = re.compile("[^a-zA-Z0-9+#\n]")
    pre_split = (lambda x: (x,)) if find_new_word else __raw_seg
    for chunk in han_pattern.split(sentence):
        if not han_pattern.match(chunk):
            # Text outside the matched range: keep only the non-empty parts.
            yield from (part for part in skip_pattern.split(chunk) if part != "")
            continue
        for piece in pre_split(chunk):
            yield from __cut(piece)
sentence_list = [
    "姚晨和老凌离婚了",
    "他说的确实在理",
    "长春市长春节讲话",
]

# Segment every sample sentence twice: first with the default settings, then
# with find_new_word=True. Each pass is preceded by its own banner line.
for banner, new_word_flag in (
    ("-----------默认效果-----------\n", False),
    ("\n-----------打开新词发现功能后的效果-----------\n", True),
):
    print(banner)
    for sentence in sentence_list:
        seg_list = cut(sentence, find_new_word=new_word_flag)
        print("/ ".join(seg_list))

5051
-----------默认效果-----------

姚/ 晨/ 和/ 老/ 凌/ 离婚/ 了
他/ 说/ 的/ 确实/ 在理
长春/ 市长/ 春节/ 讲话

-----------打开新词发现功能后的效果-----------

姚晨/ 和/ 老凌/ 离婚/ 了
他/ 说/ 的/ 确实/ 在理
长春/ 市长/ 春节/ 讲话
