# 前一步已经确认随机森林效果最优
改进

In [1]:
# 导入必要的库
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB  # 导入朴素贝叶斯算法
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import time


# 加载数据函数，加载指定文件夹内的数据文件，并按顺序合并
def load_te_data(folder_path):
    data = []  # 存储数据
    labels = []  # 存储标签
    files = sorted(os.listdir(folder_path))  # 保证文件顺序一致
    for file in files:
        if file.endswith('.dat'):  # 只处理以 '.dat' 结尾的文件
            file_path = os.path.join(folder_path, file)
            # 读取文件，假设是空格分隔的数据
            df = pd.read_csv(file_path, sep='\\s+', header=None)
            # 提取标签（假设标签位于文件名中）
            label = int(file[1:3])  # 从文件名中提取标签，例如 'd00.dat' 提取 '00'
            data.append(df)  # 添加数据
            labels.append(np.full((df.shape[0],), label))  # 添加对应的标签
    # 合并所有数据和标签
    return pd.concat(data, axis=0), np.concatenate(labels, axis=0)

# 加载训练集和测试集数据
train_data, train_labels = load_te_data('TE_train')
test_data, test_labels = load_te_data('TE_test')

# 将标签添加到数据中，方便后续操作
train_data['Label'] = train_labels
test_data['Label'] = test_labels

# 确保训练集和测试集特征对齐，去除标签列进行对比
common_columns = train_data.columns.intersection(test_data.columns).drop('Label')
train_data = train_data[common_columns.to_list() + ['Label']]
test_data = test_data[common_columns.to_list() + ['Label']]

# 数据标准化：为了后续模型训练，确保特征具有相同的尺度
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data.drop('Label', axis=1))  # 训练集特征标准化
X_test = scaler.transform(test_data.drop('Label', axis=1))  # 测试集特征标准化
y_train = train_data['Label'].values  # 训练集标签
y_test = test_data['Label'].values  # 测试集标签

# 输出训练集和测试集的大小
print(f"训练集大小: {X_train.shape}, 测试集大小: {X_test.shape}")
# 定义函数检查类别分布
def check_class_distribution(y_train, y_test):
    unique_train, counts_train = np.unique(y_train, return_counts=True)  # 训练集类别分布
    unique_test, counts_test = np.unique(y_test, return_counts=True)  # 测试集类别分布
    print("训练集类别分布:", dict(zip(unique_train, counts_train)))
    print("测试集类别分布:", dict(zip(unique_test, counts_test)))

# 检查训练集和测试集的类别分布
check_class_distribution(y_train, y_test)

# 如果训练集类别不平衡，使用 SMOTE 技术进行过采样
smote = SMOTE(random_state=42)  # 初始化 SMOTE 对象
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)  # 对训练集进行过采样

# 输出过采样前后的训练集大小
print(f"原始训练集大小: {X_train.shape}, 增强后训练集大小: {X_train_resampled.shape}")


训练集大小: (10132, 52), 测试集大小: (21120, 52)
训练集类别分布: {np.int64(0): np.int64(52), np.int64(1): np.int64(480), np.int64(2): np.int64(480), np.int64(3): np.int64(480), np.int64(4): np.int64(480), np.int64(5): np.int64(480), np.int64(6): np.int64(480), np.int64(7): np.int64(480), np.int64(8): np.int64(480), np.int64(9): np.int64(480), np.int64(10): np.int64(480), np.int64(11): np.int64(480), np.int64(12): np.int64(480), np.int64(13): np.int64(480), np.int64(14): np.int64(480), np.int64(15): np.int64(480), np.int64(16): np.int64(480), np.int64(17): np.int64(480), np.int64(18): np.int64(480), np.int64(19): np.int64(480), np.int64(20): np.int64(480), np.int64(21): np.int64(480)}
测试集类别分布: {np.int64(0): np.int64(960), np.int64(1): np.int64(960), np.int64(2): np.int64(960), np.int64(3): np.int64(960), np.int64(4): np.int64(960), np.int64(5): np.int64(960), np.int64(6): np.int64(960), np.int64(7): np.int64(960), np.int64(8): np.int64(960), np.int64(9): np.int64(960), np.int64(10): np.int64(960), np.in

In [2]:
# 训练并评估模型的函数
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name, track_loss=False):
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    # 预测
    y_pred = model.predict(X_test)
    
    # 计算指标
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    # 混淆矩阵
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # 保存分类报告到txt文件
    report = classification_report(y_test, y_pred, zero_division=0)
    with open(f"test_{model_name}_classification_report.txt", "w") as f:
        f.write(f"classification_report ({model_name}):\n")
        f.write(report)  # 保存报告内容


    # 返回结果
    return {
        "model_name": model_name,
        "train_time": train_time,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "conf_matrix": conf_matrix,
    }

# 初始化模型（加权分类器处理类别不平衡）
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# 训练并评估模型
results = []
for model, name in zip([rf], ["Random Forest"]):
    result = train_and_evaluate(model, X_train, y_train, X_test, y_test, name)
    results.append(result)
    

## 修改开始

In [3]:
# 训练一个随机森林来评估特征重要性
rf.fit(X_train_resampled, y_train_resampled)

# 获取特征重要性
feature_importances = rf.feature_importances_

# 选择重要性较高的特征（例如选择前50%最重要的特征）
threshold = np.percentile(feature_importances, 50)
selected_features = np.where(feature_importances >= threshold)[0]

# 只保留选定的特征
X_train_selected = X_train_resampled[:, selected_features]
X_test_selected = X_test[:, selected_features]

print(f"选择的特征数: {X_train_selected.shape[1]}")


选择的特征数: 26


In [4]:
import time
import numpy as np
from sklearn.decomposition import PCA  # 导入主成分分析（PCA）降维工具
from sklearn.feature_selection import RFE  # 导入递归特征消除（RFE）工具
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# 训练并评估模型的函数
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    start_time = time.time()
    # 训练模型
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    # 使用训练好的模型进行预测
    y_pred = model.predict(X_test)
    # 准确率
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} 准确率: {accuracy:.4f}")
    
    return {
        "model_name": model_name,
        "train_time": train_time,
        "accuracy": accuracy
    }

# 初始化随机森林模型
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# 1. 原始数据（使用SMOTE）
results_original = []
result_original = train_and_evaluate(rf, X_train_resampled, y_train_resampled, X_test, y_test, "Random Forest with SMOTE")
results_original.append(result_original)

# 2. PCA降维，保留95%的方差信息
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_resampled)
X_test_pca = pca.transform(X_test)

# 训练并评估降维后的随机森林模型
results_pca = []
result_pca = train_and_evaluate(rf, X_train_pca, y_train_resampled, X_test_pca, y_test, "Random Forest with PCA")
results_pca.append(result_pca)

# 3. 基于特征重要性的特征选择
rf_feature_selector = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_feature_selector.fit(X_train_resampled, y_train_resampled)  # 训练随机森林模型
# 获取每个特征的特征重要性分数
importances = rf_feature_selector.feature_importances_
# 按特征重要性分数对特征进行排序，从最重要到最不重要
indices = np.argsort(importances)[::-1]
# 选择前20个最重要的特征
selected_features = indices[:20]
# 提取出这些最重要的特征
X_train_selected = X_train_resampled[:, selected_features]
X_test_selected = X_test[:, selected_features]

# 训练并评估仅使用最重要特征的随机森林模型
results_selected = []
result_selected = train_and_evaluate(rf, X_train_selected, y_train_resampled, X_test_selected, y_test, "Random Forest with Feature Selection")
results_selected.append(result_selected) 

# 4. 使用RFE进行特征选择
# 初始化一个线性SVM模型，用于RFE特征选择
svm = SVC(kernel='linear', random_state=42)
# 初始化RFE进行特征选择，选择20个最优特征
rfe = RFE(svm, n_features_to_select=20)
# 训练RFE选择最重要的特征
rfe.fit(X_train_resampled, y_train_resampled)
X_train_rfe = X_train_resampled[:, rfe.support_]  # 选出训练集中的重要特征
X_test_rfe = X_test[:, rfe.support_]  # 选出测试集中的重要特征
# 训练并评估使用RFE选出的特征的随机森林模型
results_rfe = []
result_rfe = train_and_evaluate(rf, X_train_rfe, y_train_resampled, X_test_rfe, y_test, "Random Forest with RFE")
results_rfe.append(result_rfe)


Random Forest with SMOTE 准确率: 0.5345
Random Forest with PCA 准确率: 0.1758
Random Forest with Feature Selection 准确率: 0.5006
Random Forest with RFE 准确率: 0.4961


In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import time

# 加载数据函数，加载指定文件夹内的数据文件，并按顺序合并
def load_te_data(folder_path):
    data = []  # 存储所有数据的列表
    labels = []  # 存储所有标签的列表
    files = sorted(os.listdir(folder_path))  # 获取文件夹中的所有文件，并按字母顺序排序
    for file in files:
        if file.endswith('.dat'):  # 只处理以 '.dat' 结尾的文件
            file_path = os.path.join(folder_path, file)  # 获取文件的完整路径
            # 读取文件，假设文件内容是空格分隔的数值数据
            df = pd.read_csv(file_path, sep='\\s+', header=None)  # 使用 pandas 读取数据
            # 从文件名中提取标签（假设标签位于文件名的第2和第3个字符位置）
            label = int(file[1:3])  # 例如 'd00.dat' 提取 '00' 作为标签
            data.append(df)  # 将读取的数据添加到 data 列表
            labels.append(np.full((df.shape[0],), label))  # 将对应的标签填充为与数据行数相同的数组，并添加到 labels 列表
    # 合并所有的数据和标签，返回一个 DataFrame 和一个包含标签的 numpy 数组
    return pd.concat(data, axis=0), np.concatenate(labels, axis=0)

# 加载训练集和测试集数据
train_data, train_labels = load_te_data('TE_train')  # 加载训练数据
test_data, test_labels = load_te_data('TE_test')  # 加载测试数据

# 将标签添加到数据中，方便后续操作
train_data['Label'] = train_labels  # 将标签列添加到训练集数据中
test_data['Label'] = test_labels  # 将标签列添加到测试集数据中

# 确保训练集和测试集特征对齐，去除标签列进行对比
common_columns = train_data.columns.intersection(test_data.columns).drop('Label')  # 找出两者公共的特征列（去掉标签列）
# 重新调整训练集和测试集的数据，确保特征列一致
train_data = train_data[common_columns.to_list() + ['Label']]  # 选择公共特征列和标签列
test_data = test_data[common_columns.to_list() + ['Label']]  # 选择公共特征列和标签列

# 数据标准化：为了后续模型训练，确保特征具有相同的尺度
scaler = StandardScaler()  # 初始化标准化处理器
X_train = scaler.fit_transform(train_data.drop('Label', axis=1))  # 训练集特征标准化（去除标签列）
X_test = scaler.transform(test_data.drop('Label', axis=1))  # 测试集特征标准化（去除标签列）
y_train = train_data['Label'].values  # 获取训练集标签
y_test = test_data['Label'].values  # 获取测试集标签

# SMOTE处理类别不平衡：对训练集进行过采样，平衡各类别样本数
smote = SMOTE(random_state=42)  # 初始化 SMOTE 对象
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)  # 进行过采样，返回新的训练集数据和标签

# 训练并评估模型的函数
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    start_time = time.time() 
    model.fit(X_train, y_train) 
    train_time = time.time() - start_time 
    # 进行预测
    y_pred = model.predict(X_test)
    # 准确率
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} 准确率: {accuracy:.4f}")
    return {
        "model_name": model_name,
        "train_time": train_time,
        "accuracy": accuracy
    }

# 初始化基础分类器
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)  # 随机森林
svc = SVC(kernel='linear', probability=True, random_state=42)  # 支持向量机
log_reg = LogisticRegression(class_weight='balanced', random_state=42)  # 逻辑回归
xgb_model = xgb.XGBClassifier(n_estimators=100, scale_pos_weight=10, use_label_encoder=False, eval_metric='mlogloss', random_state=42)  # XGBoost

# 初始化集成学习模型（Soft Voting）
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),  # 随机森林
    ('svc', svc),  # 支持向量机
    ('log_reg', log_reg),  # 逻辑回归
    ('xgb', xgb_model)  # XGBoost
], voting='soft')  # 使用软投票策略

# 训练并评估集成模型
result_voting = train_and_evaluate(voting_clf, X_train_resampled, y_train_resampled, X_test, y_test, "Voting Classifier (RF + SVC + Logistic Regression + XGBoost)")

print(f"\n最终集成学习模型的准确率: {result_voting['accuracy']:.4f}")


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



Voting Classifier (RF + SVC + Logistic Regression + XGBoost) 准确率: 0.5843

最终集成学习模型的准确率: 0.5843
