In [1]:
import re
import os
from jieba import cut
from itertools import chain
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split  # 新增训练测试分割
from sklearn.metrics import classification_report  # 新增评估报告

In [2]:
def get_words(filename):
    """Tokenize a UTF-8 text file into a flat list of words.

    The file is processed line by line. Each line is stripped, every
    character matched by the noise pattern (ASCII dot, the brackets 【】,
    digits, and several punctuation marks) is deleted, and the remainder is
    segmented with jieba's ``cut``. Tokens of length 1 are discarded.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: The surviving tokens, in file order.
    """
    noise = re.compile(r'[.【】0-9、——。，！~\*]')
    tokens = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = noise.sub('', raw_line.strip())
            # Keep only multi-character tokens.
            tokens += [token for token in cut(cleaned) if len(token) > 1]
    return tokens

In [3]:
def get_texts(filenames):
    """Turn each file into one space-separated string of segmented words.

    Every file is read in full as UTF-8, characters matched by the noise
    pattern (ASCII dot, the brackets 【】, digits, and several punctuation
    marks) are removed, the text is segmented with jieba's ``cut``, and
    tokens of length 1 are dropped.

    Args:
        filenames: Iterable of file paths.

    Returns:
        list[str]: One whitespace-joined token string per input file, in the
        same order as ``filenames``.
    """
    def _load(path):
        # Read, clean and segment a single document.
        with open(path, 'r', encoding='utf-8') as handle:
            raw = handle.read()
        cleaned = re.sub(r'[.【】0-9、——。，！~\*]', '', raw)
        return ' '.join(token for token in cut(cleaned) if len(token) > 1)

    return [_load(path) for path in filenames]

In [4]:
def extract_features(method='frequency', top_num=100, filenames=None):
    """Build a document-by-term feature matrix for a corpus of text files.

    Args:
        method: 'frequency' for raw counts of the ``top_num`` most common
            words across the corpus, or 'tfidf' for TF-IDF weights from
            ``TfidfVectorizer(max_features=top_num)``.
        top_num: Maximum number of vocabulary terms (matrix columns).
        filenames: Optional iterable of file paths, one document per file.
            Defaults to '邮件_files/0.txt' ... '邮件_files/150.txt'.

    Returns:
        tuple: ``(vector, top_words)`` where ``vector`` is a numpy array with
        one row per file and one column per term, and ``top_words`` holds the
        column terms (a list in 'frequency' mode, the array returned by
        ``get_feature_names_out()`` in 'tfidf' mode).

    Raises:
        ValueError: If ``method`` is neither 'frequency' nor 'tfidf'.
    """
    # Reject a bad mode before touching the file system.
    if method not in ('frequency', 'tfidf'):
        raise ValueError("method必须是 'frequency' 或 'tfidf'")

    if filenames is None:
        filename_list = ['邮件_files/{}.txt'.format(i) for i in range(151)]
    else:
        filename_list = list(filenames)

    if method == 'frequency':
        all_words = [get_words(filename) for filename in filename_list]
        freq = Counter(chain.from_iterable(all_words))
        top_words = [word for word, _ in freq.most_common(top_num)]

        vector = []
        for words in all_words:
            # One pass per document instead of one list.count() scan per
            # vocabulary word; a Counter yields 0 for absent words.
            counts = Counter(words)
            vector.append([counts[word] for word in top_words])
        return np.array(vector), top_words

    texts = get_texts(filename_list)
    tfidf = TfidfVectorizer(max_features=top_num)
    vector = tfidf.fit_transform(texts).toarray()
    top_words = tfidf.get_feature_names_out()
    return vector, top_words

In [5]:
# Experiment configuration shared by the cells below.
method = 'frequency'  # feature mode; can be switched to 'tfidf'
top_num = 100  # number of vocabulary terms requested from extract_features
test_size = 0.2  # fraction of samples held out as the test set
random_state = 42  # random seed passed to train_test_split and SMOTE

In [6]:
# Extract the feature matrix and build the matching label vector.
# The corpus is read in file order 0..150: the first N_SPAM files are labelled
# 1 (spam) and the remaining N_HAM files are labelled 0 (ham).
N_SPAM = 127
N_HAM = 24
vector, top_words = extract_features(method=method, top_num=top_num)
labels = np.array([1] * N_SPAM + [0] * N_HAM)
# Fail fast if the hard-coded class sizes drift from the number of documents.
assert vector.shape[0] == labels.shape[0], (
    "feature rows ({}) and labels ({}) are misaligned".format(vector.shape[0], labels.shape[0])
)
# NOTE(review): the vocabulary (and IDF weights in 'tfidf' mode) are derived
# from all documents before the train/test split below, so the held-out
# samples influence the features; the reported scores may be optimistic.

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\17519\AppData\Local\Temp\jieba.cache
Loading model cost 0.812 seconds.
Prefix dict has been built successfully.


In [7]:
# Split into training and test sets. stratify=labels keeps the class
# proportions of the full data set in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    vector, labels, test_size=test_size, random_state=random_state, stratify=labels
)

In [8]:
# Oversample with SMOTE (training set only; the test set is left untouched).
# np.bincount prints the per-class counts as [count of label 0, count of label 1].
print("\n过采样前类别分布:", np.bincount(y_train))
smote = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("过采样后类别分布:", np.bincount(y_train_resampled))


过采样前类别分布: [ 19 101]
过采样后类别分布: [101 101]


In [9]:
# Train a multinomial Naive Bayes classifier on the oversampled training data.
model = MultinomialNB()
model.fit(X_train_resampled, y_train_resampled)

In [10]:
# Evaluate on the held-out test set.
# target_names are listed in label order: label 0 (ham) first, label 1 (spam) second.
y_pred = model.predict(X_test)
print("\n========== 分类评估报告 ==========")
print(classification_report(y_test, y_pred, target_names=['普通邮件', '垃圾邮件']))
print("=================================")


              precision    recall  f1-score   support

        普通邮件       0.33      1.00      0.50         5
        垃圾邮件       1.00      0.62      0.76        26

    accuracy                           0.68        31
   macro avg       0.67      0.81      0.63        31
weighted avg       0.89      0.68      0.72        31



In [11]:
def predict(filename, method='frequency', train_filenames=None):
    """Classify a single e-mail file with the trained global ``model``.

    The feature vector is built over the global ``top_words`` vocabulary and
    must use the same ``method`` that produced the training matrix.

    Args:
        filename: Path of the e-mail text file to classify.
        method: 'frequency' (raw counts of each vocabulary word) or 'tfidf'.
        train_filenames: Optional iterable of the training corpus files, used
            in 'tfidf' mode to recover the IDF weights. Defaults to
            '邮件_files/0.txt' ... '邮件_files/150.txt'.

    Returns:
        str: '垃圾邮件' when the model predicts label 1, otherwise '普通邮件'.

    Raises:
        ValueError: If ``method`` is neither 'frequency' nor 'tfidf'.
    """
    if method == 'frequency':
        # Count once, then look up each vocabulary word (0 when absent).
        counts = Counter(get_words(filename))
        current_vector = np.array([counts[word] for word in top_words])
    elif method == 'tfidf':
        # Fitting a vectorizer on the single input document would give every
        # present term an IDF of 1, so the vector would not be on the scale
        # the model was trained on. Fit the IDF on the training corpus with
        # the fixed vocabulary instead, and only transform the new document.
        if train_filenames is None:
            train_filenames = ['邮件_files/{}.txt'.format(i) for i in range(151)]
        vocabulary = list(top_words)
        cache_key = (tuple(vocabulary), tuple(train_filenames))
        # Cache the fitted vectorizer on the function so repeated predictions
        # do not re-read and re-segment the whole corpus.
        cached = getattr(predict, '_tfidf_cache', None)
        if cached is None or cached[0] != cache_key:
            fitted = TfidfVectorizer(vocabulary=vocabulary)
            fitted.fit(get_texts(train_filenames))
            cached = (cache_key, fitted)
            predict._tfidf_cache = cached
        tfidf = cached[1]
        # get_texts applies the same preprocessing that the training texts received.
        current_vector = tfidf.transform(get_texts([filename])).toarray()[0]
    else:
        raise ValueError("method必须是 'frequency' 或 'tfidf'")

    result = model.predict(current_vector.reshape(1, -1))
    # model.predict returns a one-element array; compare its scalar explicitly.
    return '垃圾邮件' if result[0] == 1 else '普通邮件'

In [12]:
# Classify e-mails 151..155, which lie outside the 0..150 range used to build
# the feature matrix, and print one result line per file.
test_files = ['邮件_files/{}.txt'.format(i) for i in range(151, 156)]
print("\n测试邮件分类结果:")
for file in test_files:
    print(f'{file}分类情况: {predict(file, method=method)}')


测试邮件分类结果:
邮件_files/151.txt分类情况: 普通邮件
邮件_files/152.txt分类情况: 垃圾邮件
邮件_files/153.txt分类情况: 普通邮件
邮件_files/154.txt分类情况: 垃圾邮件
邮件_files/155.txt分类情况: 普通邮件
