In [2]:
import os
import time
import numpy as np
from sklearn import datasets
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# 1. Load and preprocess numeric datasets (Iris, Breast Cancer)
def load_numeric_dataset(name):
    if name == 'iris':
        data = datasets.load_iris()
    elif name == 'breast':
        data = load_breast_cancer()
    else:
        raise ValueError('Unknown dataset')
    X, y = data.data, data.target
    X = StandardScaler().fit_transform(X)
    return X, y

# 2. Load and preprocess THUCNews (expects structure: root/class_name/*.txt)
def load_thucnews(root_dir, categories=None):
    texts, labels = [], []
    classes = sorted([d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))])
    if categories:
        classes = [c for c in classes if c in categories]
    for idx, cls in enumerate(classes):
        cls_dir = os.path.join(root_dir, cls)
        for fname in os.listdir(cls_dir):
            path = os.path.join(cls_dir, fname)
            with open(path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
                labels.append(idx)
    return texts, np.array(labels), classes

# 3. Define feature selectors and classifiers
def get_feature_selectors(task):
    if task == 'numeric':
        # Ensure k <= n_features (Iris=4, BreastCancer=30)
        return {
            'ANOVA_k2': SelectKBest(f_classif, k=2),
            'ANOVA_k3': SelectKBest(f_classif, k=3)
        }
    elif task == 'text':
        return {
            'CHI2_5000': SelectKBest(chi2, k=5000),
            'None': None
        }

classifiers = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'SVM': SVC(probability=True)
}

# 4. Experiment function
def run_experiment(X_train, X_test, y_train, y_test, selectors, task, vectorizer=None):
    results = []
    for sel_name, selector in selectors.items():
        for clf_name, clf in classifiers.items():
            steps = []
            if task == 'text':
                steps.append(('tfidf', vectorizer))
            if selector is not None:
                steps.append(('select', selector))
            steps.append(('clf', clf))
            pipeline = Pipeline(steps)
            start = time.time()
            pipeline.fit(X_train, y_train)
            duration = time.time() - start
            y_pred = pipeline.predict(X_test)
            y_prob = pipeline.predict_proba(X_test)
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_test, y_pred, average='weighted')
            try:
                auc = roc_auc_score(y_test, y_prob, multi_class='ovo', average='weighted')
            except Exception:
                auc = None
            results.append({
                'selector': sel_name,
                'classifier': clf_name,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'roc_auc': auc,
                'time': duration
            })
            # Plot ROC
            if auc is not None:
                n_classes = len(np.unique(y_test))
                fpr = dict()
                tpr = dict()
                for i in range(n_classes):
                    fpr[i], tpr[i], _ = roc_curve((y_test == i).astype(int), y_prob[:, i])
                plt.figure()
                for i in range(n_classes):
                    plt.plot(fpr[i], tpr[i], label=f'Class {i}')
                plt.title(f'ROC curves: {task}, {sel_name}, {clf_name}')
                plt.xlabel('FPR'); plt.ylabel('TPR'); plt.legend()
                plt.savefig(f'roc_{task}_{sel_name}_{clf_name}.png')
                plt.close()
    return results

# 5. Main
def main():
    all_results = {}
    # Numeric datasets
    for name, task in [('iris', 'numeric'), ('breast', 'numeric')]:
        X, y = load_numeric_dataset(name)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        selectors = get_feature_selectors(task)
        res = run_experiment(X_train, X_test, y_train, y_test, selectors, task)
        all_results[name] = res

    # Text dataset
    # thuc_dir = '/kaggle/input/thucnews/cnews.train.txt'  # 修改为实际路径
    # texts, labels, classes = load_thucnews(thuc_dir)
    # X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)
    # vectorizer = TfidfVectorizer(max_features=10000)
    # selectors = get_feature_selectors('text')
    # res = run_experiment(X_train, X_test, y_train, y_test, selectors, 'text', vectorizer)
    # all_results['THUCNews'] = res

    # Print summary
    for ds, results in all_results.items():
        print(f"Results for {ds}:")
        for r in results:
            print(r)

if __name__ == '__main__':
    main()


Results for iris:
{'selector': 'ANOVA_k2', 'classifier': 'LogisticRegression', 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': 1.0, 'time': 0.006333589553833008}
{'selector': 'ANOVA_k2', 'classifier': 'RandomForest', 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': 1.0, 'time': 0.12044358253479004}
{'selector': 'ANOVA_k2', 'classifier': 'SVM', 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': 1.0, 'time': 0.0025246143341064453}
{'selector': 'ANOVA_k3', 'classifier': 'LogisticRegression', 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': 1.0, 'time': 0.006680488586425781}
{'selector': 'ANOVA_k3', 'classifier': 'RandomForest', 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': 1.0, 'time': 0.11666297912597656}
{'selector': 'ANOVA_k3', 'classifier': 'SVM', 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': 1.0, 'time': 0.0027773380279541016}
Results for breast:
{'selector': 'ANOVA_k2', 'classifier': 'LogisticRegression', '

In [3]:
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# 实验配置
CONFIG = {
    "test_size": 0.3,
    "random_state": 42,
    "max_features": 10000,  # TF-IDF特征维度
    "chi2_k": 5000,         # 卡方检验选择特征数
    "n_classes": 10,        # 显示ROC曲线的top N类别
    "plot_roc": True        # 是否生成可视化
}

def load_thucnews(file_path, sample_limit=None):
    """加载THUCNews数据集（格式：类别\t文本内容）"""
    texts = []
    labels = []
    label_dict = {}
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if sample_limit and i >= sample_limit:
                break
                
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue
                
            label_name, text = parts
            if label_name not in label_dict:
                label_dict[label_name] = len(label_dict)
                
            texts.append(text)
            labels.append(label_dict[label_name])
    
    print(f"加载完成：{len(texts)}个样本，{len(label_dict)}个类别")
    return texts, np.array(labels), list(label_dict.keys())

def run_experiment(X_train, X_test, y_train, y_test, class_names):
    """执行分类实验"""
    results = []
    vectorizer = TfidfVectorizer(max_features=CONFIG["max_features"])
    
    # 定义实验组件
    selectors = {
        'CHI2': SelectKBest(chi2, k=CONFIG["chi2_k"]),
        'None': None
    }
    
    classifiers = {
        'LR': LogisticRegression(max_iter=1000, n_jobs=-1),
        'RF': RandomForestClassifier(n_estimators=100, n_jobs=-1),
        'SVM': SVC(probability=True, kernel='linear')
    }

    # 遍历所有组合
    for sel_name, selector in selectors.items():
        for clf_name, clf in classifiers.items():
            # 构建流水线
            steps = [('tfidf', vectorizer)]
            if selector is not None:
                steps.append(('selector', selector))
            steps.append(('clf', clf))
            
            # 训练模型
            start = time.time()
            model = Pipeline(steps).fit(X_train, y_train)
            train_time = time.time() - start
            
            # 预测结果
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)
            
            # 计算指标
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_test, y_pred, average='weighted'
            )
            try:
                auc = roc_auc_score(
                    y_test, y_proba,
                    multi_class='ovo',
                    average='weighted'
                )
            except:
                auc = None
            
            # 记录结果
            results.append({
                'selector': sel_name,
                'classifier': clf_name,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'auc': auc,
                'time': train_time
            })
            
            # 可视化部分
            if CONFIG["plot_roc"] and auc is not None:
                plt.figure(figsize=(10, 8))
                for i in range(min(CONFIG["n_classes"], len(class_names))):
                    fpr, tpr, _ = roc_curve(
                        (y_test == i).astype(int), 
                        y_proba[:, i]
                    )
                    plt.plot(fpr, tpr, 
                             label=f'{class_names[i]} (AUC={roc_auc_score((y_test == i), y_proba[:, i]):.2f})')
                
                plt.plot([0, 1], [0, 1], 'k--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'ROC - {sel_name}+{clf_name}\nWeighted AUC: {auc:.3f}')
                plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
                plt.tight_layout()
                plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
                plt.close()
    
    return results

def print_results(results, class_names):
    """格式化输出结果"""
    print("\n{:=^80}".format(" 实验结果汇总 "))
    print("{:<15} {:<15} {:<8} {:<8} {:<8} {:<10}".format(
        '特征选择', '分类器', 'Precision', 'Recall', 'F1', 'AUC'))
    
    for res in results:
        print("{:<15} {:<15} {:<8.3f} {:<8.3f} {:<8.3f} {:<10.3f}".format(
            res['selector'], 
            res['classifier'],
            res['precision'],
            res['recall'],
            res['f1'],
            res['auc'] if res['auc'] else 0
        ))
    
    print("\n{:=^80}".format(" 类别说明 "))
    for i, name in enumerate(class_names):
        print(f"{i}. {name}")

if __name__ == "__main__":
    # 加载数据
    texts, labels, class_names = load_thucnews(
        "/kaggle/input/thucnews/cnews.train.txt",
        sample_limit=10000  # 快速测试时启用
    )
    
    # 划分数据集
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels,
        test_size=CONFIG["test_size"],
        random_state=CONFIG["random_state"]
    )
    
    # 运行实验
    results = run_experiment(X_train, X_test, y_train, y_test, class_names)
    
    # 输出结果
    print_results(results, class_names)

加载完成：10000个样本，2个类别

特征选择            分类器             Precision Recall   F1       AUC       
CHI2            LR              0.986    0.985    0.985    0.000     
CHI2            RF              0.971    0.971    0.971    0.000     
CHI2            SVM             0.988    0.988    0.988    0.000     
None            LR              0.986    0.986    0.986    0.000     
None            RF              0.974    0.974    0.974    0.000     
None            SVM             0.991    0.991    0.991    0.000     

0. 体育
1. 娱乐


In [4]:
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# 实验配置
CONFIG = {
    "test_size": 0.3,
    "random_state": 42,
    "max_features": 10000,  # TF-IDF特征维度
    "chi2_k": 5000,         # 卡方检验选择特征数
    "n_classes": 10,        # 显示ROC曲线的top N类别
    "plot_roc": True        # 是否生成可视化
}

def load_thucnews(file_path, sample_limit=None):
    """加载THUCNews数据集（格式：类别\t文本内容）"""
    texts = []
    labels = []
    label_dict = {}
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if sample_limit and i >= sample_limit:
                break
                
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue
                
            label_name, text = parts
            if label_name not in label_dict:
                label_dict[label_name] = len(label_dict)
                
            texts.append(text)
            labels.append(label_dict[label_name])
    
    print(f"加载完成：{len(texts)}个样本，{len(label_dict)}个类别")
    return texts, np.array(labels), list(label_dict.keys())

def run_experiment(X_train, X_test, y_train, y_test, class_names):
    """执行分类实验"""
    results = []
    vectorizer = TfidfVectorizer(max_features=CONFIG["max_features"])
    
    # 定义实验组件
    selectors = {
        'CHI2': SelectKBest(chi2, k=CONFIG["chi2_k"]),
        'None': None
    }
    
    classifiers = {
        'LR': LogisticRegression(max_iter=1000, n_jobs=-1),
        'RF': RandomForestClassifier(n_estimators=100, n_jobs=-1),
        'SVM': SVC(probability=True, kernel='linear')
    }

    # 遍历所有组合
    for sel_name, selector in selectors.items():
        for clf_name, clf in classifiers.items():
            # 构建流水线
            steps = [('tfidf', vectorizer)]
            if selector is not None:
                steps.append(('selector', selector))
            steps.append(('clf', clf))
            
            # 训练模型
            start = time.time()
            model = Pipeline(steps).fit(X_train, y_train)
            train_time = time.time() - start
            
            # 预测结果
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)
            
            # 计算指标
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_test, y_pred, average='weighted'
            )
            try:
                auc = roc_auc_score(
                    y_test, y_proba,
                    multi_class='ovo',
                    average='weighted'
                )
            except:
                auc = None
            
            # 记录结果
            results.append({
                'selector': sel_name,
                'classifier': clf_name,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'auc': auc,
                'time': train_time
            })
            
            # 可视化部分
            if CONFIG["plot_roc"] and auc is not None:
                plt.figure(figsize=(10, 8))
                for i in range(min(CONFIG["n_classes"], len(class_names))):
                    fpr, tpr, _ = roc_curve(
                        (y_test == i).astype(int), 
                        y_proba[:, i]
                    )
                    plt.plot(fpr, tpr, 
                             label=f'{class_names[i]} (AUC={roc_auc_score((y_test == i), y_proba[:, i]):.2f})')
                
                plt.plot([0, 1], [0, 1], 'k--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'ROC - {sel_name}+{clf_name}\nWeighted AUC: {auc:.3f}')
                plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
                plt.tight_layout()
                plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
                plt.close()
    
    return results

def print_results(results, class_names):
    """格式化输出结果"""
    print("\n{:=^80}".format(" 实验结果汇总 "))
    print("{:<15} {:<15} {:<8} {:<8} {:<8} {:<10}".format(
        '特征选择', '分类器', 'Precision', 'Recall', 'F1', 'AUC'))
    
    for res in results:
        print("{:<15} {:<15} {:<8.3f} {:<8.3f} {:<8.3f} {:<10.3f}".format(
            res['selector'], 
            res['classifier'],
            res['precision'],
            res['recall'],
            res['f1'],
            res['auc'] if res['auc'] else 0
        ))
    
    print("\n{:=^80}".format(" 类别说明 "))
    for i, name in enumerate(class_names):
        print(f"{i}. {name}")

if __name__ == "__main__":
    # 加载数据
    texts, labels, class_names = load_thucnews(
        "/kaggle/input/thucnews/cnews.train.txt",
        sample_limit=50000  # 快速测试时启用
    )
    
    # 划分数据集
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels,
        test_size=CONFIG["test_size"],
        random_state=CONFIG["random_state"]
    )
    
    # 运行实验
    results = run_experiment(X_train, X_test, y_train, y_test, class_names)
    
    # 输出结果
    print_results(results, class_names)

加载完成：50000个样本，10个类别


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=


特征选择            分类器             Precision Recall   F1       AUC       
CHI2            LR              0.838    0.817    0.823    0.972     
CHI2            RF              0.771    0.757    0.761    0.953     
CHI2            SVM             0.837    0.805    0.815    0.973     
None            LR              0.846    0.832    0.837    0.976     
None            RF              0.784    0.771    0.774    0.959     
None            SVM             0.845    0.826    0.832    0.977     

0. 体育
1. 娱乐
2. 家居
3. 房产
4. 教育
5. 时尚
6. 时政
7. 游戏
8. 科技
9. 财经


In [5]:
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# 实验配置
CONFIG = {
    "test_size": 0.3,
    "random_state": 42,
    "max_features": 10000,  # TF-IDF特征维度
    "chi2_k": 5000,         # 卡方检验选择特征数
    "n_classes": 10,        # 显示ROC曲线的top N类别
    "plot_roc": True        # 是否生成可视化
}

def load_thucnews(file_path, sample_limit=None):
    """加载THUCNews数据集（格式：类别\t文本内容）"""
    texts = []
    labels = []
    label_dict = {}
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if sample_limit and i >= sample_limit:
                break
                
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue
                
            label_name, text = parts
            if label_name not in label_dict:
                label_dict[label_name] = len(label_dict)
                
            texts.append(text)
            labels.append(label_dict[label_name])
    
    print(f"加载完成：{len(texts)}个样本，{len(label_dict)}个类别")
    return texts, np.array(labels), list(label_dict.keys())

def run_experiment(X_train, X_test, y_train, y_test, class_names):
    """执行分类实验"""
    results = []
    vectorizer = TfidfVectorizer(max_features=CONFIG["max_features"])
    
    # 定义实验组件
    selectors = {
        'CHI2': SelectKBest(chi2, k=CONFIG["chi2_k"]),
        'None': None
    }
    
    classifiers = {
        'LR': LogisticRegression(max_iter=1000, n_jobs=-1),
        'RF': RandomForestClassifier(n_estimators=100, n_jobs=-1),
        'SVM': SVC(probability=True, kernel='linear')
    }

    # 遍历所有组合
    for sel_name, selector in selectors.items():
        for clf_name, clf in classifiers.items():
            # 构建流水线
            steps = [('tfidf', vectorizer)]
            if selector is not None:
                steps.append(('selector', selector))
            steps.append(('clf', clf))
            
            # 训练模型
            start = time.time()
            model = Pipeline(steps).fit(X_train, y_train)
            train_time = time.time() - start
            
            # 预测结果
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)
            
            # 计算指标
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_test, y_pred, average='weighted'
            )
            try:
                auc = roc_auc_score(
                    y_test, y_proba,
                    multi_class='ovo',
                    average='weighted'
                )
            except:
                auc = None
            
            # 记录结果
            results.append({
                'selector': sel_name,
                'classifier': clf_name,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'auc': auc,
                'time': train_time
            })
            
            # 可视化部分
            if CONFIG["plot_roc"] and auc is not None:
                plt.figure(figsize=(10, 8))
                for i in range(min(CONFIG["n_classes"], len(class_names))):
                    fpr, tpr, _ = roc_curve(
                        (y_test == i).astype(int), 
                        y_proba[:, i]
                    )
                    plt.plot(fpr, tpr, 
                             label=f'{class_names[i]} (AUC={roc_auc_score((y_test == i), y_proba[:, i]):.2f})')
                
                plt.plot([0, 1], [0, 1], 'k--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'ROC - {sel_name}+{clf_name}\nWeighted AUC: {auc:.3f}')
                plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
                plt.tight_layout()
                plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
                plt.close()
    
    return results

def print_results(results, class_names):
    """格式化输出结果"""
    print("\n{:=^80}".format(" 实验结果汇总 "))
    print("{:<15} {:<15} {:<8} {:<8} {:<8} {:<10}".format(
        '特征选择', '分类器', 'Precision', 'Recall', 'F1', 'AUC'))
    
    for res in results:
        print("{:<15} {:<15} {:<8.3f} {:<8.3f} {:<8.3f} {:<10.3f}".format(
            res['selector'], 
            res['classifier'],
            res['precision'],
            res['recall'],
            res['f1'],
            res['auc'] if res['auc'] else 0
        ))
    
    print("\n{:=^80}".format(" 类别说明 "))
    for i, name in enumerate(class_names):
        print(f"{i}. {name}")

if __name__ == "__main__":
    # 加载数据
    texts, labels, class_names = load_thucnews(
        "/kaggle/input/thucnews/cnews.train.txt",
        sample_limit=20000  # 快速测试时启用
    )
    
    # 划分数据集
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels,
        test_size=CONFIG["test_size"],
        random_state=CONFIG["random_state"]
    )
    
    # 运行实验
    results = run_experiment(X_train, X_test, y_train, y_test, class_names)
    
    # 输出结果
    print_results(results, class_names)

加载完成：20000个样本，4个类别


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_name}.png', dpi=300)
  plt.savefig(f'ROC_{sel_name}_{clf_n


特征选择            分类器             Precision Recall   F1       AUC       
CHI2            LR              0.955    0.953    0.953    0.994     
CHI2            RF              0.945    0.943    0.943    0.991     
CHI2            SVM             0.959    0.957    0.957    0.995     
None            LR              0.962    0.961    0.961    0.995     
None            RF              0.946    0.945    0.945    0.992     
None            SVM             0.966    0.965    0.965    0.996     

0. 体育
1. 娱乐
2. 家居
3. 房产
