In [1]:
'''
Trie, also known as a dictionary tree or prefix tree.

'''

import sys
# Show the version of the Python interpreter running this notebook.
print(sys.version)

3.8.3 (default, Jul  2 2020, 11:26:31) 
[Clang 10.0.0 ]


In [2]:
'''
Build a trie that supports only the ASCII character set and use it to count word frequencies.
ASCII: American Standard Code for Information Interchange.
'''
class trieNode(object):
    """One trie vertex: a frequency counter plus a fixed table of child links."""

    def __init__(self):
        # Child table with 128 slots, one per ASCII code; empty slots hold None.
        self.next = [None for _ in range(128)]
        # How many times the word ending at this node has been recorded.
        self.count = 0
    
    
class trieTreeAscii(object):
    """Word-frequency counter backed by a trie of ``trieNode`` objects.

    Each character of a word selects a child slot by its code point, so only
    characters whose code fits in a node's child table (ASCII) can be stored.
    Read-only operations (``search_word``, ``dec_word``, ``get_all_word``)
    never allocate nodes; only the ``add_*`` methods and an explicit
    ``find_word_node(word)`` call grow the tree.
    """

    def __init__(self):
        # The root stands for the empty prefix.
        self.trie_root = trieNode()

    def find_word_node(self, word, create=True):
        """Return the node that represents ``word``.

        Args:
            word: String to locate.
            create: When True (the default, matching the historical
                behaviour) missing nodes along the path are created.
                When False the trie is left untouched.

        Returns:
            The node for ``word``. With ``create=False`` the result is None
            if the word has no path in the trie or contains a character
            that cannot be stored.

        Raises:
            IndexError: ``create`` is True and ``word`` contains a character
                outside the supported range. The check runs before any node
                is created, so a failed insert leaves no partial path behind.
        """
        fanout = len(self.trie_root.next)
        indices = [ord(char) for char in word]

        if any(char_idx >= fanout for char_idx in indices):
            if not create:
                # Such a word can never have been stored.
                return None
            raise IndexError(
                'word {!r} contains a character outside the supported '
                'ASCII range (code point >= {})'.format(word, fanout)
            )

        node = self.trie_root
        for char_idx in indices:
            child = node.next[char_idx]
            if child is None:
                if not create:
                    return None
                child = node.next[char_idx] = trieNode()
            node = child
        return node

    def add_sentence(self, sentence, count=1):
        """Add ``count`` to the frequency of every space-separated word.

        Empty tokens produced by leading, trailing or consecutive spaces are
        skipped so they do not get recorded as an empty word on the root.
        """
        for word in sentence.split(' '):
            if word:
                self.add_word(word, count)

    def add_word(self, word, count=1):
        """Add ``count`` to the frequency of ``word``, inserting it if needed."""
        self.find_word_node(word).count += count

    def dec_word(self, word, count=1):
        """Subtract ``count`` from the frequency of ``word``, clamping at 0.

        A word that is not in the trie is left alone; no nodes are created.
        """
        word_node = self.find_word_node(word, create=False)
        if word_node is None:
            return
        word_node.count = max(0, word_node.count - count)

    def search_word(self, word):
        """Return the frequency of ``word`` (0 if it was never added).

        The lookup is read-only: querying an unknown word allocates nothing.
        """
        word_node = self.find_word_node(word, create=False)
        return 0 if word_node is None else word_node.count

    def get_all_word(self, word='', node=None):
        """List ``(word, count)`` for every stored word with a positive count.

        Args:
            word: Prefix already spelled by ``node``. An empty prefix always
                restarts the walk at the root, whatever ``node`` is.
            node: Subtree to enumerate when ``word`` is non-empty.

        Returns:
            A list of ``(word, count)`` tuples in depth-first order with
            children visited by ascending character code. The starting node
            itself is not reported, only its descendants. An empty list is
            returned when ``node`` is None.
        """
        if word == '':
            node = self.trie_root
        if node is None:
            return []

        word_set = []
        # Explicit stack instead of recursion: depth is bounded by memory,
        # not by the interpreter's recursion limit.
        pending = []

        def push_children(prefix, parent):
            # Push in descending code order so pops come out ascending.
            for char_idx in range(len(parent.next) - 1, -1, -1):
                child = parent.next[char_idx]
                if child is not None:
                    pending.append((prefix + chr(char_idx), child))

        push_children(word, node)
        while pending:
            current_word, current = pending.pop()
            if current.count > 0:
                word_set.append((current_word, current.count))
            push_children(current_word, current)

        return word_set

In [3]:
# Demo: fill a trie with single words and sentences, then inspect the counts.
t = trieTreeAscii()
for single_word in ('word', 'ward', 'I', 'am'):
    t.add_word(single_word)
for line in ('How are you ?', 'I \' m fine .', 'I am fine .'):
    t.add_sentence(line)

print(t.get_all_word())

# Decrement 'am' three times; the printed count shows the value after each call.
for _ in range(3):
    t.dec_word('am')
    print('count of am is:', t.search_word('am'))

t.add_sentence('To love and To work .')
print(t.get_all_word())

[("'", 1), ('.', 2), ('?', 1), ('How', 1), ('I', 3), ('am', 2), ('are', 1), ('fine', 2), ('m', 1), ('ward', 1), ('word', 1), ('you', 1)]
count of am is: 1
count of am is: 0
count of am is: 0
[("'", 1), ('.', 3), ('?', 1), ('How', 1), ('I', 3), ('To', 2), ('and', 1), ('are', 1), ('fine', 2), ('love', 1), ('m', 1), ('ward', 1), ('word', 1), ('work', 1), ('you', 1)]
