In [1]:
from collections import Counter
import numpy as np
import re,os
import glob
import six
import codecs

In [6]:
# 功能：停用词加载
def get_stop_word(stop_word_path):
    #停用词列表，默认使用哈工大停用词表
    f = open(stop_word_path,encoding='utf-8')
    stop_words = list()
    for stop_word in f.readlines():
        stop_words.append(stop_word[:-1])
    return stop_words

class NewWordFind():
    def __init__(self,min_freq=2, n_gram=5, min_p=2 , min_entropy=1, max_score=100, min_score=2):
        '''
            input:
                n_gram: int         n_gram 的 粒度 
                min_p: int          最小 信息熵 阈值 
                min_entropy: int          左右熵 阈值 
                max_score: int          综合得分最大阈值
                min_score: int          综合得分最小阈值
        '''
        self.n_gram = n_gram
        self.min_p = min_p
        self.min_entropy = min_entropy
        self.max_score = max_score
        self.min_score = min_score
        self.min_freq = min_freq
    
    # 功能：将 text 进行 n_gram 
    def n_gram_words(self,text):
        """
            功能：将 text 进行 n_gram 
            input:
                text : String       输入句子 
            return：
                words_freq：Dict    词频 字典
        """
        words = []
        for i in range(1,self.n_gram+1):
            words += [text[j:j+i] for j in range(len(text)-i+1)]
        words_freq = dict(Counter(words))    
        new_words_freq = {}
        for word,freq in words_freq.items():
            new_words_freq[word]=freq
        return new_words_freq      
    
    # 功能：PMI 过滤掉 噪声词  
    def PMI_filter(self, word_freq_dic):
        """
            功能：PMI 过滤掉 噪声词 
            input:
                words_freq：Dict    词频 字典
            return:
                new_words_dic:Dict  PMI 过滤噪声后 剩余新词 
        """
        new_words_dic = {}
        for word in word_freq_dic:
            if len(word) == 1:
                pass
            else:
                p_x_y = min([word_freq_dic.get(word[:i])* word_freq_dic.get(word[i:]) for i in range(1,len(word))])
                mpi = p_x_y/word_freq_dic.get(word)
                if mpi > self.min_p:
                    new_words_dic[word] = [mpi]
        return new_words_dic

    # 功能： 计算字符列表的熵
    def calculate_entropy(self, char_list):
        """
            功能： 计算字符列表的熵
            input： 
                char_list: List     字符列表 
            return:
                entropy: float       熵 
        """
        char_freq_dic =  dict(Counter(char_list)) 
        entropy = (-1)*sum([ char_freq_dic.get(i)/len(char_list)*np.log2(char_freq_dic.get(i)/len(char_list)) for i in char_freq_dic])
        return entropy
    
    # 功能：通过熵阈值从限定词字典中过滤出最终的新词
    def Entropy_left_right_filter(self,condinate_words_dic,text):
        """
            功能：通过熵阈值从限定词字典中过滤出最终的新词
            input： 
                condinate_words_dic：Dict       限定词字典     
                text：String                    句子 
            output： 
                final_words_list:List           最终的新词列表 
        """
        final_words_list = []
        for word in condinate_words_dic.keys():
            left_right_char =re.findall('(.)%s(.)'%word,text)

            left_char = [i[0] for i in left_right_char] 
            left_entropy = self.calculate_entropy(left_char)

            right_char = [i[1] for i in left_right_char]
            right_entropy = self.calculate_entropy(right_char)
            score = condinate_words_dic[word][0]-min(left_entropy,right_entropy)
            if min(right_entropy,left_entropy)> self.min_entropy and score<self.max_score and score>self.min_score:
                final_words_list.append({
                    "word":word,
                    "pmi":condinate_words_dic[word][0],
                    "left_entropy":left_entropy,
                    "right_entropy":right_entropy,
                    "score":score
                })
        final_words_list = sorted(final_words_list, key=lambda x: x['score'], reverse=True)
        return final_words_list

    
# 语料生成器，并且初步预处理语料
def text_generator(file_path):
    txts = glob.glob(f'{file_path}/*.txt')
    for txt in txts:
        d = codecs.open(txt, encoding='utf-8').read()
        title = d.split("\n")[0]
        d = d.replace(u'\u3000', '').strip()
        yield title,re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '', d)

In [9]:
# read the data and preprocessing the data to a whole str
stop_word= get_stop_word("resource/stopword.txt")

file_path = "data/"

min_freq = 2
n_gram = 5
min_p = 2
min_entropy = 1
max_score = 100
min_score = 2

new_word_find = NewWordFind(min_freq=min_freq, n_gram=n_gram, min_p=min_p , min_entropy=min_entropy, max_score=max_score, min_score=min_score)

for index,(title,text) in enumerate(text_generator(file_path)):
    print(f"\n index :{index} => title:{title}")
    for i in stop_word:
        text=text.replace(i,"")

    n_gram = new_word_find.n_gram_words(text)
    new_words_dic = new_word_find.PMI_filter(n_gram)
    new_words_list = new_word_find.Entropy_left_right_filter(new_words_dic,text)
    
    for new_words in new_words_list:
         print(f"{new_words}")

   


 index :0 => title:##习近平在第六届东方经济论坛全会开幕式上的致辞（全文）
{'word': '合作', 'pmi': 13.75, 'left_entropy': 3.0, 'right_entropy': 2.75, 'score': 11.0}
{'word': '中国', 'pmi': 12.0, 'left_entropy': 1.584962500721156, 'right_entropy': 1.584962500721156, 'score': 10.415037499278844}
{'word': '中俄', 'pmi': 6.0, 'left_entropy': 1.584962500721156, 'right_entropy': 1.584962500721156, 'score': 4.415037499278844}
{'word': '发展', 'pmi': 5.0, 'left_entropy': 1.584962500721156, 'right_entropy': 1.584962500721156, 'score': 3.415037499278844}
{'word': '世界', 'pmi': 5.0, 'left_entropy': 2.321928094887362, 'right_entropy': 2.321928094887362, 'score': 2.678071905112638}

 index :1 => title:##王毅国务委员兼外长在东宁要塞博物馆纪念中国人民抗日战争暨世界反法西斯战争胜利76周年活动上的书面致辞
{'word': '两国', 'pmi': 21.0, 'left_entropy': 1.9219280948873623, 'right_entropy': 1.9219280948873623, 'score': 19.078071905112637}
{'word': '国际', 'pmi': 21.0, 'left_entropy': 2.75, 'right_entropy': 2.5, 'score': 18.5}
{'word': '和平', 'pmi': 19.5, 'left_entropy': 2.0, 'right_entropy': 2

{'word': '暴力', 'pmi': 98.66666666666667, 'left_entropy': 1.584962500721156, 'right_entropy': 1.584962500721156, 'score': 97.08170416594551}
{'word': '新疆实', 'pmi': 96.0, 'left_entropy': 2.321928094887362, 'right_entropy': 1.3709505944546687, 'score': 94.62904940554533}
{'word': '政治', 'pmi': 97.14285714285714, 'left_entropy': 2.807354922057604, 'right_entropy': 2.807354922057604, 'score': 94.33550222079954}
{'word': '反恐', 'pmi': 96.0, 'left_entropy': 2.2516291673878226, 'right_entropy': 1.7924812503605778, 'score': 94.20751874963942}
{'word': '杀死', 'pmi': 88.66666666666667, 'left_entropy': 1.584962500721156, 'right_entropy': 1.584962500721156, 'score': 87.08170416594551}
{'word': '犯下', 'pmi': 85.0, 'left_entropy': 2.321928094887362, 'right_entropy': 1.9219280948873623, 'score': 83.07807190511264}
{'word': '涉疆', 'pmi': 82.5, 'left_entropy': 3.321928094887362, 'right_entropy': 1.7709505944546688, 'score': 80.72904940554533}
{'word': '时期', 'pmi': 82.5, 'left_entropy': 2.0, 'right_entropy': 

{'word': '医生', 'pmi': 86.66666666666667, 'left_entropy': 2.584962500721156, 'right_entropy': 2.251629167387823, 'score': 84.41503749927885}
{'word': '晓晓', 'pmi': 76.0, 'left_entropy': 3.9057645846554525, 'right_entropy': 4.058813890331201, 'score': 72.09423541534454}
{'word': '腿骨', 'pmi': 54.0, 'left_entropy': 1.5, 'right_entropy': 2.0, 'score': 52.5}
{'word': '做手术', 'pmi': 49.0, 'left_entropy': 1.584962500721156, 'right_entropy': 1.584962500721156, 'score': 47.415037499278846}
{'word': '手术', 'pmi': 50.13953488372093, 'left_entropy': 4.63578319633073, 'right_entropy': 4.844232898763193, 'score': 45.5037516873902}
{'word': '妈妈', 'pmi': 45.125, 'left_entropy': 1.2987949406953985, 'right_entropy': 2.75, 'score': 43.8262050593046}
{'word': '想做', 'pmi': 45.333333333333336, 'left_entropy': 1.584962500721156, 'right_entropy': 1.584962500721156, 'score': 43.74837083261218}
{'word': '手术室', 'pmi': 44.0, 'left_entropy': 1.584962500721156, 'right_entropy': 1.584962500721156, 'score': 42.4150374992