In [1]:

import sys
import pinyin
import jieba
import string
import re

In [2]:
# Path of the word-frequency table that construct_dict() parses.
FILE_PATH = "./token_freq_pos%40350k_jieba.txt"

# ASCII punctuation followed by CJK / full-width punctuation marks; tokens
# found in this string are never sent to the spell corrector.
PUNCTUATION_LIST = string.punctuation + "。，？：；｛｝［］‘“”《》／！％……（）"


In [3]:
# Load the jieba word-frequency table.
def _parse_frequency( text ):
    """Convert a frequency column to a number; return None if it is not numeric."""
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return None


def construct_dict( file_path ):
    """Read a whitespace-separated word/frequency file into a dict.

    Each usable line has the word in its first column and its frequency in
    the second; any further columns are ignored.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        dict mapping each word to its numeric frequency (int when the column
        is an integer, otherwise float). If a word occurs more than once the
        last occurrence wins.
    """
    word_freq = {}
    with open(file_path, "r", encoding='UTF-8') as f:
        for line in f:
            info = line.split()
            # Blank lines and lines without a frequency column carry no
            # usable entry; indexing them would raise IndexError.
            if len(info) < 2:
                continue
            frequency = _parse_frequency(info[1])
            # Frequencies must be numeric: callers rank candidates with
            # max(..., key=word_freq.get), and comparing the raw strings
            # would order them lexicographically ("9" > "10000").
            if frequency is None:
                continue
            word_freq[info[0]] = frequency

    return word_freq

In [4]:
# Word -> frequency table built once from FILE_PATH; read as a global by
# known(), auto_correct() and auto_correct_sentence().
phrase_freq = construct_dict( FILE_PATH )


In [5]:
def load_cn_words_dict( file_path ):
    """Return the contents of a UTF-8 text file as one string.

    Every line is stripped of leading/trailing whitespace and the stripped
    lines are concatenated with no separator.
    """
    with open(file_path, "r", encoding='UTF-8') as handle:
        return "".join(line.strip() for line in handle)

In [6]:
# Contents of ./cn_dict.txt with every line stripped and all lines
# concatenated into a single string.
cn_words_dict = load_cn_words_dict( "./cn_dict.txt" )


In [7]:
def edits1(phrase, cn_words_dict):
    """Return the set of strings exactly one edit away from `phrase`.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with one taken from `cn_words_dict`,
    or inserting a character taken from `cn_words_dict`.
    """
    variants = set()
    for cut in range(len(phrase) + 1):
        head, tail = phrase[:cut], phrase[cut:]
        if tail:
            # deletion of the character at the split point
            variants.add(head + tail[1:])
        if len(tail) > 1:
            # transposition of the two characters after the split point
            variants.add(head + tail[1] + tail[0] + tail[2:])
        for ch in cn_words_dict:
            if tail:
                # replacement of the character at the split point
                variants.add(head + ch + tail[1:])
            # insertion at the split point
            variants.add(head + ch + tail)
    return variants

In [8]:
#edits1("我不好你", "你")

In [9]:

def known(phrases):
    """Return the set of `phrases` that are keys of the global `phrase_freq`."""
    return {candidate for candidate in phrases if candidate in phrase_freq}

In [10]:

def get_candidates( error_phrase ):
    """Collect known phrases one edit away from `error_phrase`, grouped by pinyin match.

    Args:
        error_phrase: The phrase that was not found in the frequency table.

    Returns:
        A 3-tuple of lists ``(first, second, third)``:
        first  - candidates whose full pinyin string equals that of
                 `error_phrase`;
        second - candidates whose pinyin only shares the part before the
                 first "/" (the delimiter passed to ``pinyin.get``);
        third  - all remaining candidates.
    """
    candidates_1st_order = []
    candidates_2nd_order = []
    candidates_3rd_order = []

    error_pinyin = pinyin.get(error_phrase, format="strip", delimiter="/")
    # Loop-invariant: the leading pinyin segment of the misspelled phrase.
    error_first_syllable = error_pinyin.split("/")[0]

    # Reuse the module-level cn_words_dict (loaded once from ./cn_dict.txt)
    # instead of re-reading the same file from disk on every call.
    for candidate_phrase in known(edits1(error_phrase, cn_words_dict)):
        candidate_pinyin = pinyin.get(candidate_phrase, format="strip", delimiter="/")
        if candidate_pinyin == error_pinyin:
            candidates_1st_order.append(candidate_phrase)
        elif candidate_pinyin.split("/")[0] == error_first_syllable:
            candidates_2nd_order.append(candidate_phrase)
        else:
            candidates_3rd_order.append(candidate_phrase)

    return candidates_1st_order, candidates_2nd_order, candidates_3rd_order

In [11]:
def auto_correct( error_phrase ):
    """Return the most frequent correction candidate for `error_phrase`.

    Candidate tiers from get_candidates() are tried in order; the first
    non-empty tier supplies the result, chosen by highest frequency in the
    global `phrase_freq`.

    Args:
        error_phrase: The phrase to correct.

    Returns:
        The chosen candidate, or `error_phrase` itself when no candidate
        exists in any tier.
    """
    def _frequency(phrase):
        # Rank numerically even if the table stores counts as text;
        # comparing the raw strings would put "9" above "10000".
        try:
            return float(phrase_freq.get(phrase, 0))
        except (TypeError, ValueError):
            return 0.0

    for tier in get_candidates(error_phrase):
        if tier:
            return max(tier, key=_frequency)

    # No candidate at all: max() on an empty list would raise ValueError,
    # so leave the phrase as it was.
    return error_phrase

In [12]:
def auto_correct_sentence( error_sentence, verbose=True):
    """Segment a sentence with jieba and correct every unknown phrase.

    Args:
        error_sentence: The sentence to correct.
        verbose: When true, print the segmentation and each
            (original, corrected) pair.

    Returns:
        The sentence rebuilt from the (possibly corrected) segments.
    """
    # Materialize the segments directly. Joining on "\t" and splitting again
    # would turn a tab token into empty strings and drop it from the result.
    seg_list = list(jieba.cut( error_sentence, cut_all=False))
    if verbose:
        print(seg_list)

    corrected_parts = []
    for phrase in seg_list:
        correct_phrase = phrase
        # Whitespace and punctuation are copied through untouched; only
        # real tokens missing from the frequency table are treated as
        # misspellings.
        is_word = bool(phrase.strip()) and phrase not in PUNCTUATION_LIST
        if is_word and phrase not in phrase_freq:
            correct_phrase = auto_correct(phrase)
            if verbose :
                print (phrase, correct_phrase)

        corrected_parts.append(correct_phrase)

    return "".join(corrected_parts)

In [13]:
def main():
    """Run the two built-in demo sentences through auto_correct_sentence() and print the results."""
    demo_cases = (
        ("Test case 1:", '机七学习是人工智能领遇最能体现智能的一个分知！'),
        ("Test case 2:", '杭洲是中国的八大古都之一，因风景锈丽，享有"人间天棠"的美誉！'),
    )

    for heading, err_sent in demo_cases:
        print (heading)
        correct_sent = auto_correct_sentence( err_sent )
        print ("original sentence:" + err_sent + "\n==>\n" + "corrected sentence:" + correct_sent)


if __name__=="__main__":
    main()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\86153\AppData\Local\Temp\jieba.cache


Test case 1:


Loading model cost 0.646 seconds.
Prefix dict has been built succesfully.


['机七', '学习', '是', '人工智能', '领遇', '最能体现', '智能', '的', '一个', '分知', '！']
机七 机器
领遇 领域
分知 分治
original sentence:机七学习是人工智能领遇最能体现智能的一个分知！
==>
corrected sentence:机器学习是人工智能领域最能体现智能的一个分治！
Test case 2:
['杭洲', '是', '中国', '的', '八大', '古都', '之一', '，', '因', '风景', '锈丽', '，', '享有', '"', '人间', '天棠', '"', '的', '美誉', '！']
杭洲 杭州
锈丽 秀丽
天棠 天堂
original sentence:杭洲是中国的八大古都之一，因风景锈丽，享有"人间天棠"的美誉！
==>
corrected sentence:杭州是中国的八大古都之一，因风景秀丽，享有"人间天堂"的美誉！
