In [8]:
import re
import os
from jieba import cut
from itertools import chain
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer  # 新增TF-IDF支持

In [9]:
def get_words(filename):
    """Tokenize a UTF-8 text file and return its tokens of length >= 2.

    Every line is stripped, the characters matched by the noise pattern
    (dots, 【】 brackets, ASCII digits, several Chinese punctuation marks,
    ``~`` and ``*``) are deleted, and what remains is segmented with ``cut``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with the surviving tokens of all lines, in reading order.
    """
    noise = r'[.【】0-9、——。，！~\*]'
    with open(filename, 'r', encoding='utf-8') as handle:
        return [
            token
            for raw_line in handle
            for token in cut(re.sub(noise, '', raw_line.strip()))
            if len(token) > 1  # single-character tokens are discarded
        ]

In [10]:
def get_texts(filenames):
    """Return one space-joined token string per file, for TF-IDF vectorizing.

    Tokenization is delegated to ``get_words`` instead of repeating the same
    cleaning regex and length filter here. ``predict`` also builds its TF-IDF
    input as ``' '.join(get_words(...))``, so training texts and prediction
    texts are guaranteed to go through one preprocessing path.

    Args:
        filenames: Iterable of paths to UTF-8 text files.

    Returns:
        A list of strings, one per input file and in the same order; each
        string is that file's tokens joined by single spaces (an empty
        string when the file yields no tokens).
    """
    return [' '.join(get_words(filename)) for filename in filenames]

In [11]:
def extract_features(method='frequency', top_num=100, filenames=None):
    """Build the training feature matrix with the selected method.

    Args:
        method: 'frequency' for raw counts of the most common corpus words,
            or 'tfidf' for TF-IDF weights from ``TfidfVectorizer``.
        top_num: Maximum number of feature words (columns).
        filenames: Optional iterable of training file paths. When omitted,
            the original corpus '邮件_files/0.txt' .. '邮件_files/150.txt'
            is used.

    Returns:
        A ``(vector, top_words)`` pair. ``vector`` is a 2-D array with one
        row per file and one column per feature word. ``top_words`` holds
        the feature words in column order: a list for 'frequency', and the
        value of ``get_feature_names_out()`` for 'tfidf'.

    Raises:
        ValueError: If ``method`` is neither 'frequency' nor 'tfidf'.
    """
    if filenames is None:
        filenames = ['邮件_files/{}.txt'.format(i) for i in range(151)]
    else:
        # Materialize once so generators can be passed safely.
        filenames = list(filenames)

    if method == 'frequency':
        documents = [get_words(filename) for filename in filenames]
        corpus_counts = Counter(chain.from_iterable(documents))
        top_words = [word for word, _ in corpus_counts.most_common(top_num)]

        # Count each document once instead of calling list.count() for every
        # vocabulary word; the resulting numbers are the same.
        rows = []
        for words in documents:
            doc_counts = Counter(words)
            rows.append([doc_counts[word] for word in top_words])
        return np.array(rows), top_words

    if method == 'tfidf':
        texts = get_texts(filenames)
        tfidf = TfidfVectorizer(max_features=top_num)
        vector = tfidf.fit_transform(texts).toarray()
        top_words = tfidf.get_feature_names_out()
        return vector, top_words

    raise ValueError("method必须是 'frequency' 或 'tfidf'")

In [16]:
# Choose the feature-extraction strategy: 'tfidf' or 'frequency'.
# The same value is handed to predict() when classifying new files.
method = 'tfidf'
vector, top_words = extract_features(method, 100)

In [14]:
# One label per row of `vector`: 127 entries of class 1 followed by 24 of
# class 0 (predict() reports class 1 as '垃圾邮件').
labels = np.concatenate([np.ones(127, dtype=int), np.zeros(24, dtype=int)])
model = MultinomialNB().fit(vector, labels)

def predict(filename, method='frequency'):
    """Classify one e-mail file with the trained model.

    Uses module-level state created by the training cells: ``model`` (the
    fitted classifier), ``top_words`` (the feature vocabulary) and, for the
    TF-IDF path, ``vector`` (the training feature matrix).

    Args:
        filename: Path of the UTF-8 text file to classify.
        method: 'frequency' or 'tfidf'. It has to be the method that
            produced the features ``model`` was fitted on.

    Returns:
        '垃圾邮件' when the model predicts label 1, otherwise '普通邮件'.

    Raises:
        ValueError: If ``method`` is neither 'frequency' nor 'tfidf'.
    """
    if method == 'frequency':
        counts = Counter(get_words(filename))
        current_vector = np.array([counts[word] for word in top_words])
    elif method == 'tfidf':
        text = ' '.join(get_words(filename))

        # Raw term counts over the training vocabulary, produced with the
        # vectorizer's own tokenizer so they match how training text was split.
        counter = TfidfVectorizer(vocabulary=top_words, use_idf=False, norm=None)
        term_counts = counter.fit_transform([text]).toarray()[0]

        # Fitting a TfidfVectorizer on this one document would derive IDF from
        # the document itself (every present term gets the same weight), which
        # is not the scale the model was trained on. Rebuild the training IDF
        # instead: a TF-IDF entry is non-zero exactly when the term occurs in
        # that document, so document frequencies can be read off `vector`.
        # Formula is scikit-learn's default smooth IDF, matching the default
        # settings extract_features() uses; keep the two in sync.
        train_matrix = np.asarray(vector)
        doc_freq = np.count_nonzero(train_matrix, axis=0)
        idf = np.log((1 + train_matrix.shape[0]) / (1 + doc_freq)) + 1

        weighted = term_counts * idf
        length = np.linalg.norm(weighted)
        # L2-normalize like the training rows; an all-zero row stays zero.
        current_vector = weighted / length if length > 0 else weighted
    else:
        raise ValueError("method必须是 'frequency' 或 'tfidf'")

    result = model.predict(current_vector.reshape(1, -1))
    # model.predict returns an array; compare its single element explicitly.
    return '垃圾邮件' if result[0] == 1 else '普通邮件'

In [15]:
# Classify the files numbered 151-155 and print one verdict per file.
test_files = ['邮件_files/{}.txt'.format(i) for i in range(151, 156)]
for file in test_files:
    verdict = predict(file, method=method)
    print(f'{file}分类情况: {verdict}')

邮件_files/151.txt分类情况: 垃圾邮件
邮件_files/152.txt分类情况: 垃圾邮件
邮件_files/153.txt分类情况: 垃圾邮件
邮件_files/154.txt分类情况: 垃圾邮件
邮件_files/155.txt分类情况: 垃圾邮件
