In [1]:
from math import log

## 前缀字典

In [2]:
def gen_pfdict(f_name):
    """
    Build the prefix dictionary from a word-frequency file.

    The file is read as bytes and each line is decoded as UTF-8. The first
    two space-separated fields of a line are the word and its integer
    frequency; any further fields are ignored. Every leading substring of a
    word is also registered, with frequency 0 when it has no entry yet, so
    that a lookup can tell "is a prefix of some word" from "unknown".
    If a word is listed more than once, the last frequency is stored while
    every occurrence is added to the total.

    :param f_name: path of the dictionary file
    :return: tuple (word_freq, total) - the {word: frequency} mapping
             (including 0-frequency prefixes) and the sum of all frequencies
    :raises ValueError: if a line cannot be decoded or parsed; the message
             carries the file name, the 1-based line number and the line
    """
    word_freq = {}  # {word or prefix: frequency}
    total = 0       # running sum of all listed frequencies
    with open(f_name, 'rb') as dict_file:
        lineno = 0
        for line in dict_file:
            lineno += 1
            try:
                # Rebind `line` so the error message shows the decoded text
                # whenever decoding itself succeeded.
                line = line.strip().decode('utf-8')
                word, count_text = line.split(' ')[:2]
                count = int(count_text)
                word_freq[word] = count
                total += count
                # Register every prefix of the word without clobbering
                # frequencies that are already known.
                for end in range(1, len(word) + 1):
                    word_freq.setdefault(word[:end], 0)
            except ValueError:
                raise ValueError('invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
    return word_freq, total

In [3]:
# Load the prefix dictionary and the total frequency from ./data/dict.txt.
# Both names are notebook-level globals that get_DAG and calc read.
lfreq, ltotal = gen_pfdict('./data/dict.txt')

## 有向无环图

In [4]:
def get_DAG(sentence, freq=None):
    """
    Build the directed acyclic graph of candidate words for a sentence.

    For every start index k the graph lists each end index i such that
    sentence[k:i + 1] is a word with non-zero frequency in the prefix
    dictionary. A start index with no dictionary word maps to [k], i.e. the
    single character is kept as a word of its own.

    :param sentence: sentence to segment
    :param freq: prefix dictionary {word or prefix: frequency} as produced
                 by gen_pfdict; defaults to the notebook-level ``lfreq``
    :return: dict {start index: [end index, ...]}; empty for an empty sentence
    """
    if freq is None:
        # Keep the original notebook behaviour when no dictionary is given.
        freq = lfreq

    DAG = {}
    N = len(sentence)
    for k in range(N):
        ends = []
        i = k
        frag = sentence[k]
        # Extend the fragment while it is still a known word or prefix;
        # an unknown fragment cannot start any longer dictionary word.
        while i < N and frag in freq:
            if freq[frag]:
                # Only real words (frequency > 0) become edges; 0-frequency
                # entries are pure prefixes.
                ends.append(i)
            i += 1
            frag = sentence[k:i + 1]
        if not ends:
            # Out-of-vocabulary character: add it as an isolated word.
            ends.append(k)
        DAG[k] = ends
    return DAG

In [5]:
# Example input; an alternative sentence to try is "去北京大学玩".
sentence = "南京市长江大桥"
# Build the graph of candidate word spans for the example sentence.
DAG = get_DAG(sentence)

In [6]:
# Show the graph: start index -> end indices of the dictionary words starting there.
DAG

{0: [0, 1, 2], 1: [1, 2], 2: [2, 3], 3: [3, 4, 6], 4: [4], 5: [5, 6], 6: [6]}

## 动态规划计算路径最大概率

In [7]:
def calc(sentence, DAG, route, freq=None, total=None):
    """
    Fill ``route`` with the highest-scoring segmentation path.

    Works backwards from the end of the sentence with dynamic programming.
    After the call, route[idx] is a tuple (score, end) where ``score`` is
    the best sum of log(word frequency / total) over a segmentation of
    sentence[idx:], and ``end`` is the end index of the first word on that
    path. route[len(sentence)] is the sentinel (0, 0).

    :param sentence: sentence to segment
    :param DAG: graph {start index: [end index, ...]} as built by get_DAG
    :param route: dict that is filled in place with the result
    :param freq: prefix dictionary {word: frequency}; defaults to the
                 notebook-level ``lfreq``
    :param total: sum of all word frequencies; defaults to the
                  notebook-level ``ltotal``
    :return: None - the result is written into ``route``
    """
    if freq is None:
        freq = lfreq
    if total is None:
        total = ltotal

    N = len(sentence)
    route[N] = (0, 0)
    # Work with logarithms so that multiplying probabilities becomes adding
    # log values, which avoids floating-point underflow.
    logtotal = log(total)
    # Walk the sentence from the last character to the first.
    for idx in range(N - 1, -1, -1):
        # Candidate score for the word sentence[idx:x + 1]:
        #   log(frequency or 1) - logtotal   -> log-probability of the word
        #                                       (missing/0 frequency counts as 1)
        #   route[x + 1][0]                  -> best score of the remainder
        # max() compares the (score, x) tuples, so equal scores are resolved
        # in favour of the larger end index.
        route[idx] = max(
            (log(freq.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x)
            for x in DAG[idx]
        )

In [8]:
# calc fills `route` in place: route[i] = (best log-score from i to the end,
# end index of the word chosen at i).
route = {}
calc(sentence, DAG, route)

In [9]:
# Show the computed route table.
route

{7: (0, 0),
 6: (-8.863849339256593, 6),
 5: (-9.813518371579148, 6),
 4: (-19.011818225013663, 4),
 3: (-9.653648934289546, 6),
 2: (-16.96504852719957, 2),
 1: (-26.084355807486915, 1),
 0: (-19.941560115533193, 2)}

## 根据路径分词

In [10]:
def __cut_DAG_NO_HMM(sentence, route=None):
    """
    Split a sentence into words by following the best route.

    :param sentence: sentence to segment
    :param route: optional route table {start index: (score, end index)} as
                  filled by calc for this same sentence. When omitted, the
                  route is computed here from ``sentence`` via get_DAG and
                  calc, so the result never depends on a route left over
                  from a different sentence.
    :return: list of words in order; an empty list for an empty sentence
    """
    if route is None:
        # Derive the route from the sentence actually being cut instead of
        # reading whatever notebook-level `route` happens to exist.
        route = {}
        calc(sentence, get_DAG(sentence), route)

    buf_list = []
    x = 0
    N = len(sentence)
    while x < N:
        # route[x][1] is the inclusive end index of the word starting at x.
        y = route[x][1] + 1
        buf_list.append(sentence[x:y])
        x = y
    return buf_list

In [11]:
# Segment the example sentence and display the resulting word list.
__cut_DAG_NO_HMM(sentence)

['南京市', '长江大桥']