# 正向最大匹配法

In [1]:
#正向最大匹配
def read_word_dict(dic_path):
    """
    Load a word dictionary and find the length of its longest entry.

    Every line of the file is stripped of surrounding whitespace and, if
    anything remains, stored as one dictionary entry; blank lines are skipped.

    :param dic_path: path of the dictionary file
    :return dictionary,maximum: set of entries, and the length of the longest
        entry (0 when the file holds no entries)
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also discards a leading
    # byte-order mark, so the first entry is never stored with an invisible
    # '\ufeff' prefix that would prevent it from ever matching.
    with open(dic_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        dictionary = {entry for entry in stripped_lines if entry}
    # default=0 keeps the "no entries" case well defined.
    maximum = max(map(len, dictionary), default=0)
    return dictionary, maximum

def FMM_cut(dic_path,text):
    """
    Segment text with the forward maximum matching (FMM) method.

    Scanning from left to right, the longest dictionary entry that starts at
    the current position is emitted as a token. When no multi-character entry
    matches, the single character at that position is emitted on its own, so
    the tokens always concatenate back to ``text`` (also when the dictionary
    is empty).

    :param dic_path: path of the dictionary file, loaded with read_word_dict
    :param text: text to segment
    :return result: list of tokens in text order
    """
    dictionary, maximum = read_word_dict(dic_path)
    result = []
    length = len(text)
    head = 0
    while head < length:
        # Never try a candidate longer than the text that is left.
        longest = min(maximum, length - head)
        for size in range(longest, 1, -1):  # longest candidate first
            piece = text[head:head + size]
            if piece in dictionary:
                break
        else:
            # No multi-character entry matched (or the dictionary is empty):
            # fall back to one character so the scan always advances and no
            # input is lost.
            piece = text[head]
        result.append(piece)
        head += len(piece)
    return result

def main():
    """Run the forward maximum matching demo on a sample sentence and print the tokens."""
    sample = "南京市长江大桥"
    tokens = FMM_cut('./data/imm_dic.utf8', sample)
    print(tokens)

main()

['南京市长', '江', '大桥']


# 逆向最大匹配法

In [2]:
#逆向最大匹配
def read_word_dict(dic_path):
    """
    Load a word dictionary and find the length of its longest entry.

    Every line of the file is stripped of surrounding whitespace and, if
    anything remains, stored as one dictionary entry; blank lines are skipped.

    :param dic_path: path of the dictionary file
    :return dictionary,maximum: set of entries, and the length of the longest
        entry (0 when the file holds no entries)
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also discards a leading
    # byte-order mark, so the first entry is never stored with an invisible
    # '\ufeff' prefix that would prevent it from ever matching.
    with open(dic_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        dictionary = {entry for entry in stripped_lines if entry}
    # default=0 keeps the "no entries" case well defined.
    maximum = max(map(len, dictionary), default=0)
    return dictionary, maximum

def RMM_cut(dic_path,text):
    """
    Segment text with the reverse (backward) maximum matching (RMM) method.

    Scanning from right to left, the longest dictionary entry that ends at
    the current position is emitted as a token. When no multi-character entry
    matches, the single character at that position is emitted on its own
    instead of being skipped, so the tokens always concatenate back to
    ``text``.

    :param dic_path: path of the dictionary file, loaded with read_word_dict
    :param text: text to segment
    :return result: list of tokens in text order
    """
    dictionary, maximum = read_word_dict(dic_path)
    result = []
    tail = len(text)
    while tail > 0:
        # Never try a candidate longer than the text that is left.
        longest = min(maximum, tail)
        for size in range(longest, 1, -1):  # longest candidate first
            piece = text[tail - size:tail]
            if piece in dictionary:
                break
        else:
            # No multi-character entry matched: keep the character as its own
            # token rather than dropping it from the output.
            piece = text[tail - 1]
        result.append(piece)
        tail -= len(piece)
    # Tokens were collected from the end of the text backwards.
    result.reverse()
    return result

def main():
    """Run the reverse maximum matching demo on a sample sentence and print the tokens."""
    sample = "南京市长江大桥"
    tokens = RMM_cut('./data/imm_dic.utf8', sample)
    print(tokens)

main()

['南京市', '长江大桥']


# 双向最大匹配法

In [3]:

def BMM_cut(dic_path,text):
    """
    Segment text with the bidirectional maximum matching (BMM) method.

    Runs both the reverse and the forward maximum matching segmenters and
    keeps whichever produced fewer tokens.

    :param dic_path: path of the dictionary file, forwarded to both segmenters
    :param text: text to segment
    :return result: the forward result when it has strictly fewer tokens,
        otherwise the reverse result (so ties go to the reverse result)
    """
    backward = RMM_cut(dic_path, text)  # reverse maximum matching
    forward = FMM_cut(dic_path, text)   # forward maximum matching

    # The segmentation with the fewest tokens is taken as the result.
    return forward if len(forward) < len(backward) else backward
def main():
    """Run the bidirectional maximum matching demo on a sample sentence and print the tokens."""
    sample = "南京市长江大桥"
    tokens = BMM_cut('./data/imm_dic.utf8', sample)
    print(tokens)
main()

['南京市', '长江大桥']
