In [None]:
import os
import re
import jieba
import math
from nltk import FreqDist, word_tokenize
from collections import Counter
from tqdm import tqdm
import matplotlib.pyplot as plt

# Configure matplotlib so that Chinese text can be displayed in figures.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# 中文语料信息熵计算

In [5]:
# Read every regular file located one directory level below the corpus root
# and concatenate the contents into a single string.
folder_path = 'data/wiki_zh'
corpus_parts = []

for childfolder in tqdm(os.listdir(folder_path)):
    childfolder_path = os.path.join(folder_path, childfolder)
    if not os.path.isdir(childfolder_path):
        continue  # entries directly under the root that are not directories are ignored
    for filename in os.listdir(childfolder_path):
        file_path = os.path.join(childfolder_path, filename)
        if not os.path.isfile(file_path):
            continue  # only regular files are read
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                corpus_parts.append(file.read())
        except Exception as err:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error reading {file_path}: {err}")

corpus = ''.join(corpus_parts)
print(f"length: {len(corpus)}")

100%|██████████| 13/13 [00:03<00:00,  3.85it/s]


length: 565215433


In [None]:
# Segment the whole corpus with cut_all=False, then materialise the result
# into a list so it can be counted in the next cell.
words = jieba.cut(corpus, cut_all=False)
word_list = [*words]

In [None]:
# Frequency table over the segmented tokens; passed to calculate_entropy for the word-level figure.
fdist = FreqDist(word_list)

def calculate_entropy(fdist):
    """Return the Shannon entropy, in bits, of a frequency table.

    Args:
        fdist: Mapping from outcome to count (e.g. ``collections.Counter`` or
            ``nltk.FreqDist``). Only ``fdist.values()`` is used.

    Returns:
        ``-sum(p * log2(p))`` over the outcomes with a positive count, where
        ``p = count / total``. An empty table, or one whose counts are all
        non-positive, yields ``0.0``.
    """
    # Counter-like tables may carry zero (or negative) counts. Such entries
    # contribute nothing (0 * log 0 is taken as 0) and would otherwise make
    # math.log2 raise or the division below fail.
    counts = [count for count in fdist.values() if count > 0]
    total = sum(counts)
    if total == 0:
        return 0.0

    entropy = 0.0
    for count in counts:
        prob = count / total
        entropy -= prob * math.log2(prob)
    return entropy

# Word-level entropy of the segmented Chinese corpus.
word_entropy = calculate_entropy(fdist)
print(f"词级别平均信息熵: {word_entropy}")

# Character-level entropy. Counter iterates the string character by character,
# so the corpus is counted directly instead of first being copied into a list
# with one element per character.
char_freq = Counter(corpus)
# Total number of characters counted; calculate_entropy recomputes this itself.
total_chars = sum(char_freq.values())
char_entropy = calculate_entropy(char_freq)
print(f"字级别的平均信息熵: {char_entropy:.3f}")

词级别平均信息熵: 9.854372492925405
字级别的平均信息熵: 13.355834532880332


In [None]:
import nltk
import math
import re
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

# Download the Gutenberg corpus
nltk.download('gutenberg')

# Anything that is neither an ASCII letter nor whitespace is replaced by a space.
r1 = r'[^a-zA-Z\s]'
text = re.sub(r1, ' ', gutenberg.raw()).lower()
# Remove all line breaks and collapse runs of whitespace into single spaces
text = " ".join(text.replace('\n', ' ').split())

words = text.split()
letters = list(filter(str.isalpha, text))

# Frequency tables for the word-level and letter-level entropy
fwords_dict = FreqDist(words)
fletters_dict = FreqDist(letters)

In [None]:
# Letter-level entropy must be computed from the letter frequency table and
# word-level entropy from the word frequency table; the two tables were
# previously swapped, so each label printed the other's value.
letter_entropy = calculate_entropy(fletters_dict)
print(f"字母级别的平均信息熵: {letter_entropy}")

word_entropy = calculate_entropy(fwords_dict)
print(f"单词级别的平均信息熵: {word_entropy}")

字母级别的平均信息熵: 4.158618822194603
单词级别的平均信息熵: 9.728047292817916
