In [3]:
#构建模型

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer #我们使用TF-IDF

df = pd.read_csv("E:/作业/杂物/web日志分析/final_dataset_400k.csv")

y = df["label"]

path = df["path"].fillna("")
status = df["status"].fillna(0)
method = df["method"].fillna("")
size = df["size"].fillna(0)
referer = df["referer"].fillna("")
user_agent = df["user_agent"].fillna("")

vec_path = TfidfVectorizer(max_features=5000)
vec_ua = TfidfVectorizer(max_features=5000)
vec_referer = TfidfVectorizer(max_features=5000)

x_path = vec_path.fit_transform(path)
x_ua = vec_ua.fit_transform(user_agent)
x_referer =vec_referer.fit_transform(referer)

ohe = OneHotEncoder(handle_unknown='ignore')  # One-Hot 编码
x_method = ohe.fit_transform(method.values.reshape(-1, 1))

x_status= status.values.reshape(-1, 1)   # 数值特征转成二维
x_size= size.values.reshape(-1, 1)


x = hstack([x_path, x_ua, x_referer, x_method, x_status, x_size])  # 合并所有特征必须使用稀疏矩阵拼接

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# 模型3:XGBoost

import xgboost as xgb
import numpy as np
params={
    'objective':'binary:logistic', #目标函数，我们明显是二分类用 binary:logistic
    'eval_metric':'logloss', #评估指标，这里用 logloss
    'eta': 0.05, #学习率，值越小越稳越慢默认0.1
    'max_depth': 8, #每棵树的深度，越深越复杂一般4-6复杂可到8
    'subsample': 1.0, #样本采样比例，防止过拟合
    'colsample_bytree': 0.8, #特征采样比例
}  #不是特别理解好像就是不同的参数,感觉很难记住

y_np = np.array(y_train)
dtrain = xgb.DMatrix(X_train, label=y_np) 
bst = xgb.train(params, dtrain, num_boost_round=100)
dtest = xgb.DMatrix(X_test)
y_pred_prob = bst.predict(dtest)   # 输出是概率
y_pred = (y_pred_prob > 0.5).astype(int)
print("XGBoost：")
print(classification_report(y_test, y_pred))


✅ 清理完成！新文件已保存为：final_dataset_clean.csv


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
import pandas as pd
import numpy as np

print("对未标注数据进行预测...")  # 和前边一模一样的操作只不过没有y而已
df = pd.read_csv("E:/作业/杂物/web日志分析/final_dataset_clean.csv")

path = df["path"].fillna("")
status = df["status"].fillna(0)
method = df["method"].fillna("")
size = df["size"].fillna(0)
referer = df["referer"].fillna("")
user_agent = df["user_agent"].fillna("")

x_path = vec_path.transform(path)
x_ua = vec_ua.transform(user_agent)
x_referer = vec_referer.transform(referer)

x_method = ohe.transform(method.values.reshape(-1, 1))

x_status = status.values.reshape(-1, 1)
x_size = size.values.reshape(-1, 1)

X_unlabeled = hstack([x_path, x_ua, x_referer, x_method, x_status, x_size])# 合并所有特征成稀疏矩阵
dtest = xgb.DMatrix(X_unlabeled)
probs = bst.predict(dtest) # 开始预测

import numpy as np
import pandas as pd
from scipy.sparse import vstack

high_confidence_idx = np.where((probs > 0.95) | (probs < 0.05))[0]  # 选出置信度高的索引

pseudo_labels_high_conf = (probs[high_confidence_idx] > 0.5).astype(int)  # 置信度对应的伪标签（概率>0.5为1，否则0）

X_pseudo = X_unlabeled[high_confidence_idx]  # 从无标签特征中选出高置信度样本

y_pseudo = pseudo_labels_high_conf  # 把伪标签加入标签集合

X_train_new = vstack([X_train, X_pseudo])
y_train_new = np.concatenate([y_train, y_pseudo])  # 将伪标签样本加入训练集（用稀疏矩阵垂直堆叠）

import xgboost as xgb

dtrain_new = xgb.DMatrix(X_train_new, label=y_train_new)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
}

bst_new = xgb.train(params, dtrain_new, num_boost_round=100)

max_iters = 3
X_train_iter = X_train
y_train_iter = y_train
X_unlabeled_iter = X_unlabeled

for i in range(max_iters):
    print(f"开始第{i+1}轮半监督训练")
    dtrain_iter = xgb.DMatrix(X_train_iter, label=y_train_iter)
    bst_iter = xgb.train(params, dtrain_iter, num_boost_round=100)
    
    dtest_iter = xgb.DMatrix(X_unlabeled_iter)
    probs_iter = bst_iter.predict(dtest_iter)
    
    high_conf_idx = np.where((probs_iter > 0.95) | (probs_iter < 0.05))[0]
    if len(high_conf_idx) == 0:
        print("无更多高置信度样本，停止迭代")
        break
    
    pseudo_labels_iter = (probs_iter[high_conf_idx] > 0.5).astype(int)
    X_pseudo_iter = X_unlabeled_iter[high_conf_idx]
    
    # 加入训练集
    X_train_iter = vstack([X_train_iter, X_pseudo_iter])
    y_train_iter = np.concatenate([y_train_iter, pseudo_labels_iter])
    
    # 从无标签数据中剔除已用样本
    mask = np.ones(X_unlabeled_iter.shape[0], dtype=bool)
    mask[high_conf_idx] = False
    X_unlabeled_iter = X_unlabeled_iter[mask]
    
    print(f"第{i+1}轮加入了{len(high_conf_idx)}个伪标签样本")

In [None]:
import xgboost as xgb

dtrain_new = xgb.DMatrix(X_train_new, label=y_train_new)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
}

bst_new = xgb.train(params, dtrain_new, num_boost_round=100)

In [None]:
max_iters = 3
X_train_iter = X_train
y_train_iter = y_train
X_unlabeled_iter = X_unlabeled

for i in range(max_iters):
    print(f"开始第{i+1}轮半监督训练")
    dtrain_iter = xgb.DMatrix(X_train_iter, label=y_train_iter)
    bst_iter = xgb.train(params, dtrain_iter, num_boost_round=100)
    
    dtest_iter = xgb.DMatrix(X_unlabeled_iter)
    probs_iter = bst_iter.predict(dtest_iter)
    
    high_conf_idx = np.where((probs_iter > 0.95) | (probs_iter < 0.05))[0]
    if len(high_conf_idx) == 0:
        print("无更多高置信度样本，停止迭代")
        break
    
    pseudo_labels_iter = (probs_iter[high_conf_idx] > 0.5).astype(int)
    X_pseudo_iter = X_unlabeled_iter[high_conf_idx]
    
    # 加入训练集
    X_train_iter = vstack([X_train_iter, X_pseudo_iter])
    y_train_iter = np.concatenate([y_train_iter, pseudo_labels_iter])
    
    # 从无标签数据中剔除已用样本
    mask = np.ones(X_unlabeled_iter.shape[0], dtype=bool)
    mask[high_conf_idx] = False
    X_unlabeled_iter = X_unlabeled_iter[mask]
    
    print(f"第{i+1}轮加入了{len(high_conf_idx)}个伪标签样本")

In [5]:
# 构建模型
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, vstack
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import numpy as np

# 读取训练数据
df = pd.read_csv("E:/作业/杂物/web日志分析/final_dataset_400k.csv")

y = df["label"]

# 数据预处理
path = df["path"].fillna("")
status = df["status"].fillna(0)
method = df["method"].fillna("")
size = df["size"].fillna(0)
referer = df["referer"].fillna("")
user_agent = df["user_agent"].fillna("")

# 文本特征向量化
vec_path = TfidfVectorizer(max_features=5000)
vec_ua = TfidfVectorizer(max_features=5000)
vec_referer = TfidfVectorizer(max_features=5000)

x_path = vec_path.fit_transform(path)
x_ua = vec_ua.fit_transform(user_agent)
x_referer = vec_referer.fit_transform(referer)

# 类别特征编码
ohe = OneHotEncoder(handle_unknown='ignore')
x_method = ohe.fit_transform(method.values.reshape(-1, 1))

# 数值特征
x_status = status.values.reshape(-1, 1)
x_size = size.values.reshape(-1, 1)

# 合并所有特征并转换为CSR格式（支持索引操作）
x = hstack([x_path, x_ua, x_referer, x_method, x_status, x_size]).tocsr()

# 分割训练测试集
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# XGBoost参数设置
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
}

# 训练初始模型
print("训练初始XGBoost模型...")
y_np = np.array(y_train)
dtrain = xgb.DMatrix(X_train, label=y_np)
bst = xgb.train(params, dtrain, num_boost_round=100)

# 在测试集上评估初始模型
dtest = xgb.DMatrix(X_test)
y_pred_prob = bst.predict(dtest)
y_pred = (y_pred_prob > 0.5).astype(int)
print("初始XGBoost模型性能：")
print(classification_report(y_test, y_pred))

# 读取无标签数据
print("读取无标签数据...")
df_unlabeled = pd.read_csv("E:/作业/杂物/web日志分析/final_dataset_clean.csv")

# 对无标签数据进行相同的预处理
path_unlabeled = df_unlabeled["path"].fillna("")
status_unlabeled = df_unlabeled["status"].fillna(0)
method_unlabeled = df_unlabeled["method"].fillna("")
size_unlabeled = df_unlabeled["size"].fillna(0)
referer_unlabeled = df_unlabeled["referer"].fillna("")
user_agent_unlabeled = df_unlabeled["user_agent"].fillna("")

# 使用已训练的向量化器进行转换
x_path_unlabeled = vec_path.transform(path_unlabeled)
x_ua_unlabeled = vec_ua.transform(user_agent_unlabeled)
x_referer_unlabeled = vec_referer.transform(referer_unlabeled)

x_method_unlabeled = ohe.transform(method_unlabeled.values.reshape(-1, 1))

x_status_unlabeled = status_unlabeled.values.reshape(-1, 1)
x_size_unlabeled = size_unlabeled.values.reshape(-1, 1)

# 合并无标签数据的所有特征并转换为CSR格式
X_unlabeled = hstack([x_path_unlabeled, x_ua_unlabeled, x_referer_unlabeled, 
                     x_method_unlabeled, x_status_unlabeled, x_size_unlabeled]).tocsr()

# 半监督学习迭代过程
print("开始半监督学习...")
max_iters = 5  # 增加迭代次数
confidence_threshold = 0.9  # 可以调整置信度阈值

X_train_iter = X_train.copy().tocsr()  # 确保是CSR格式
y_train_iter = y_train.copy()
X_unlabeled_iter = X_unlabeled.copy().tocsr()  # 确保是CSR格式

total_pseudo_samples = 0

for i in range(max_iters):
    print(f"\n开始第{i+1}轮半监督训练")
    print(f"当前训练集大小: {X_train_iter.shape[0]}")
    print(f"当前无标签数据大小: {X_unlabeled_iter.shape[0]}")
    
    # 训练当前模型
    dtrain_iter = xgb.DMatrix(X_train_iter, label=y_train_iter)
    bst_iter = xgb.train(params, dtrain_iter, num_boost_round=100)
    
    # 对无标签数据进行预测
    dtest_iter = xgb.DMatrix(X_unlabeled_iter)
    probs_iter = bst_iter.predict(dtest_iter)
    
    # 选择高置信度样本
    high_conf_idx = np.where((probs_iter > confidence_threshold) | 
                           (probs_iter < (1 - confidence_threshold)))[0]
    
    if len(high_conf_idx) == 0:
        print("无更多高置信度样本，停止迭代")
        break
    
    # 生成伪标签
    pseudo_labels_iter = (probs_iter[high_conf_idx] > 0.5).astype(int)
    X_pseudo_iter = X_unlabeled_iter[high_conf_idx]
    
    # 将伪标签样本加入训练集
    X_train_iter = vstack([X_train_iter, X_pseudo_iter])
    y_train_iter = np.concatenate([y_train_iter, pseudo_labels_iter])
    
    # 从无标签数据中移除已使用的样本
    mask = np.ones(X_unlabeled_iter.shape[0], dtype=bool)
    mask[high_conf_idx] = False
    X_unlabeled_iter = X_unlabeled_iter[mask]
    
    total_pseudo_samples += len(high_conf_idx)
    print(f"第{i+1}轮加入了{len(high_conf_idx)}个伪标签样本")
    print(f"其中正样本: {np.sum(pseudo_labels_iter)}, 负样本: {len(pseudo_labels_iter) - np.sum(pseudo_labels_iter)}")

print(f"\n半监督学习完成，共加入{total_pseudo_samples}个伪标签样本")

# 训练最终模型
print("\n训练最终模型...")
dtrain_final = xgb.DMatrix(X_train_iter, label=y_train_iter)
bst_final = xgb.train(params, dtrain_final, num_boost_round=100)

# 在测试集上评估最终模型
dtest_final = xgb.DMatrix(X_test)
y_pred_prob_final = bst_final.predict(dtest_final)
y_pred_final = (y_pred_prob_final > 0.5).astype(int)

print("最终模型性能：")
print(classification_report(y_test, y_pred_final))

# 比较初始模型和最终模型的性能
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("\n性能对比:")
print("指标\t\t初始模型\t最终模型")
print(f"准确率\t\t{accuracy_score(y_test, y_pred):.4f}\t\t{accuracy_score(y_test, y_pred_final):.4f}")
print(f"精确率\t\t{precision_score(y_test, y_pred):.4f}\t\t{precision_score(y_test, y_pred_final):.4f}")
print(f"召回率\t\t{recall_score(y_test, y_pred):.4f}\t\t{recall_score(y_test, y_pred_final):.4f}")
print(f"F1分数\t\t{f1_score(y_test, y_pred):.4f}\t\t{f1_score(y_test, y_pred_final):.4f}")


import pickle
import joblib
import xgboost as xgb
import pandas as pd
import numpy as np
import os
from datetime import datetime

# 在训练完成后保存模型和预处理器
print("保存模型和预处理器...")

# 设置你想要的保存路径
SAVE_PATH = "E:/作业/杂物/web日志分析/models"

# 确保保存目录存在
os.makedirs(SAVE_PATH, exist_ok=True)

# 保存XGBoost模型（推荐）
model_filename = os.path.join(SAVE_PATH, f'semi_supervised_xgb_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json')
bst_final.save_model(model_filename)
print(f"XGBoost模型已保存为: {model_filename}")

训练初始XGBoost模型...
初始XGBoost模型性能：
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     40134
           1       1.00      0.99      0.99     39866

    accuracy                           0.99     80000
   macro avg       0.99      0.99      0.99     80000
weighted avg       0.99      0.99      0.99     80000

读取无标签数据...
开始半监督学习...

开始第1轮半监督训练
当前训练集大小: 320000
当前无标签数据大小: 400000
第1轮加入了386833个伪标签样本
其中正样本: 193441, 负样本: 193392

开始第2轮半监督训练
当前训练集大小: 706833
当前无标签数据大小: 13167
第2轮加入了4196个伪标签样本
其中正样本: 689, 负样本: 3507

开始第3轮半监督训练
当前训练集大小: 711029
当前无标签数据大小: 8971
第3轮加入了699个伪标签样本
其中正样本: 126, 负样本: 573

开始第4轮半监督训练
当前训练集大小: 711728
当前无标签数据大小: 8272
第4轮加入了185个伪标签样本
其中正样本: 61, 负样本: 124

开始第5轮半监督训练
当前训练集大小: 711913
当前无标签数据大小: 8087
第5轮加入了146个伪标签样本
其中正样本: 43, 负样本: 103

半监督学习完成，共加入392059个伪标签样本

训练最终模型...
最终模型性能：
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     40134
           1       0.99      0.99      0.99     3

NameError: name 'os' is not defined

In [6]:
import pickle
import joblib
import xgboost as xgb
import pandas as pd
import numpy as np
import os
from datetime import datetime

# 在训练完成后保存模型和预处理器
print("保存模型和预处理器...")

# 设置你想要的保存路径
SAVE_PATH = "E:/作业/杂物/web日志分析/models"

# 确保保存目录存在
os.makedirs(SAVE_PATH, exist_ok=True)

# 保存XGBoost模型（推荐）
model_filename = os.path.join(SAVE_PATH, f'semi_supervised_xgb_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json')
bst_final.save_model(model_filename)
print(f"XGBoost模型已保存为: {model_filename}")

保存模型和预处理器...
XGBoost模型已保存为: E:/作业/杂物/web日志分析/models\semi_supervised_xgb_model_20250805_155259.json
