In [3]:
import random

input_path = "E:/作业/杂物/web日志分析/log01.csv"
output_path = "E:/作业/杂物/web日志分析/unlabeled_logs_sampled.csv"

sample_size = 400000  # 你要抽的行数
reservoir = []
with open(input_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i < sample_size:
            reservoir.append(line)
        else:
            j = random.randint(0, i)
            if j < sample_size:
                reservoir[j] = line

with open(output_path, 'w', encoding='utf-8') as out:
    out.writelines(reservoir)

print(f"✅ 已从原始日志中随机抽取 {sample_size} 行，保存到 {output_path}")



✅ 已从原始日志中随机抽取 400000 行，保存到 E:/作业/杂物/web日志分析/unlabeled_logs_sampled.csv


In [12]:
#构建模型

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer #我们使用TF-IDF

df = pd.read_csv("E:/作业/杂物/web日志分析/final_dataset_400k.csv")

y = df["label"]

path = df["path"].fillna("")
status = df["status"].fillna(0)
method = df["method"].fillna("")
size = df["size"].fillna(0)
referer = df["referer"].fillna("")
user_agent = df["user_agent"].fillna("")

vec_path = TfidfVectorizer(max_features=5000)
vec_ua = TfidfVectorizer(max_features=5000)
vec_referer = TfidfVectorizer(max_features=5000)

x_path = vec_path.fit_transform(path)
x_ua = vec_ua.fit_transform(user_agent)
x_referer =vec_referer.fit_transform(referer)

ohe = OneHotEncoder(handle_unknown='ignore')  # One-Hot 编码
x_method = ohe.fit_transform(method.values.reshape(-1, 1))

x_status= status.values.reshape(-1, 1)   # 数值特征转成二维
x_size= size.values.reshape(-1, 1)


x = hstack([x_path, x_ua, x_referer, x_method, x_status, x_size])  # 合并所有特征必须使用稀疏矩阵拼接

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# 模型3:XGBoost

import xgboost as xgb
import numpy as np
params={
    'objective':'binary:logistic', #目标函数，我们明显是二分类用 binary:logistic
    'eval_metric':'logloss', #评估指标，这里用 logloss
    'eta': 0.05, #学习率，值越小越稳越慢默认0.1
    'max_depth': 8, #每棵树的深度，越深越复杂一般4-6复杂可到8
    'subsample': 1.0, #样本采样比例，防止过拟合
    'colsample_bytree': 0.8, #特征采样比例
}  #不是特别理解好像就是不同的参数,感觉很难记住

y_np = np.array(y_train)
dtrain = xgb.DMatrix(X_train, label=y_np) 
bst = xgb.train(params, dtrain, num_boost_round=100)
dtest = xgb.DMatrix(X_test)
y_pred_prob = bst.predict(dtest)   # 输出是概率
y_pred = (y_pred_prob > 0.5).astype(int)
print("XGBoost：")
print(classification_report(y_test, y_pred))


XGBoost：
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     40134
           1       1.00      0.99      0.99     39866

    accuracy                           0.99     80000
   macro avg       0.99      0.99      0.99     80000
weighted avg       0.99      0.99      0.99     80000



In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
import pandas as pd
import numpy as np

print("对未标注数据进行预测...")  # 和前边一模一样的操作只不过没有y而已
df = pd.read_csv("E:/作业/杂物/web日志分析/unlabeled_logs_sampled_with_header.csv")

path = df["path"].fillna("")
status = df["status"].fillna(0)
method = df["method"].fillna("")
size = df["size"].fillna(0)
referer = df["referer"].fillna("")
user_agent = df["user_agent"].fillna("")

x_path = vec_path.transform(path)
x_ua = vec_ua.transform(user_agent)
x_referer = vec_referer.transform(referer)

x_method = ohe.transform(method.values.reshape(-1, 1))

x_status = status.values.reshape(-1, 1)
x_size = size.values.reshape(-1, 1)

X_unlabeled = hstack([x_path, x_ua, x_referer, x_method, x_status, x_size])# 合并所有特征成稀疏矩阵
dtest = xgb.DMatrix(X_unlabeled)
probs = bst.predict(dtest) # 开始预测

pseudo_labels = (probs > 0.5).astype(int)  # 概率>0.5为1，否则0

df["label1"] = pseudo_labels  # 把伪标签加入原始数据集

print("预测完成")
# ...你已有的代码...

df["label1"] = pseudo_labels  # 把伪标签加入内存中DataFrame
print("预测完成")

# 保存到新文件
df.to_csv("E:/作业/杂物/web日志分析/unlabeled_logs_sampled_with_label1.csv", index=False)
print("已保存带伪标签的新数据集！")


对未标注数据进行预测...
预测完成
预测完成
已保存带伪标签的新数据集！


In [6]:
import pandas as pd

cols = ['ip','time','method',"path", "status", "size", "referer", "user_agent"]

df = pd.read_csv("E:/作业/杂物/web日志分析/unlabeled_logs_sampled.csv", header=None, names=cols)

# 重新写回CSV，写入列名，覆盖原文件或者写新文件
df.to_csv("E:/作业/杂物/web日志分析/unlabeled_logs_sampled_with_header.csv", index=False)



In [16]:
import pandas as pd

df = pd.read_csv("E:/作业/杂物/web日志分析/unlabeled_logs_sampled_with_label1.csv")  # 读取数据

df['score'] = 0
df['label'] = 0  # 默认都是正常的，后续高分的我们再打成 1

for i, row in df.iterrows():  #和上边的评分规则一模一样
    score = 0
    path = str(row['path']) if pd.notna(row['path']) else ""
    status = str(row['status']) if pd.notna(row['status']) else ""
    user_agent = str(row['user_agent']) if pd.notna(row['user_agent']) else ""

    if len(path) > 100:
        score += 2
    dangerous_words = ['script', 'union', 'select', 'drop', 'admin', '..', 'passwd']
    for word in dangerous_words:
        if word.lower() in path.lower():
            score += 3
    if status.startswith('4') or status.startswith('5'):
        score += 1
    if len(user_agent) < 20:
        score += 2
    special_chars = ['<', '>', '%', '&', ';']
    for char in special_chars:
        if char in path:
            score += 1

    df.at[i, 'score'] = score
    df.at[i, 'label'] = 1 if score >= 3 else 0  # 可疑打为1，正常为0


df.to_csv("E:/作业/杂物/web日志分析/unlabeled_logs_sampled_with_label1and2.csv", index=False, encoding='utf-8-sig')  # 保存为新的带伪标签的数据集

print("打标签完成")

打标签完成


In [17]:
import pandas as pd

df = pd.read_csv("E:/作业/杂物/web日志分析/unlabeled_logs_sampled_with_label1and2.csv")

# 计算两个标签不一样的样本数
diff_count = (df['label1'] != df['label']).sum()

# 总样本数
total = len(df)

# 不一样的比例
diff_ratio = diff_count / total

print(f"标签不一致的数量：{diff_count}")
print(f"总样本数：{total}")
print(f"标签不一致的比例：{diff_ratio:.4f}（{diff_ratio*100:.2f}%）")


标签不一致的数量：1766
总样本数：400000
标签不一致的比例：0.0044（0.44%）


In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import vstack


high_confidence_idx = np.where((probs > 0.95) | (probs < 0.05))[0]  # 选出置信度高的索引


pseudo_labels_high_conf = (probs[high_confidence_idx] > 0.5).astype(int)  # 置信度对应的伪标签（概率>0.5为1，否则0）


X_pseudo = X_unlabeled[high_confidence_idx]  # 从无标签特征中选出高置信度样本


y_pseudo = pseudo_labels_high_conf  # 把伪标签加入标签集合


X_train_new = vstack([X_train, X_pseudo])
y_train_new = np.concatenate([y_train, y_pseudo])  # 将伪标签样本加入训练集（用稀疏矩阵垂直堆叠）


In [None]:
import xgboost as xgb

dtrain_new = xgb.DMatrix(X_train_new, label=y_train_new)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
}

bst_new = xgb.train(params, dtrain_new, num_boost_round=100)


In [None]:
max_iters = 3
X_train_iter = X_train
y_train_iter = y_train
X_unlabeled_iter = X_unlabeled

for i in range(max_iters):
    print(f"开始第{i+1}轮半监督训练")
    dtrain_iter = xgb.DMatrix(X_train_iter, label=y_train_iter)
    bst_iter = xgb.train(params, dtrain_iter, num_boost_round=100)
    
    dtest_iter = xgb.DMatrix(X_unlabeled_iter)
    probs_iter = bst_iter.predict(dtest_iter)
    
    high_conf_idx = np.where((probs_iter > 0.95) | (probs_iter < 0.05))[0]
    if len(high_conf_idx) == 0:
        print("无更多高置信度样本，停止迭代")
        break
    
    pseudo_labels_iter = (probs_iter[high_conf_idx] > 0.5).astype(int)
    X_pseudo_iter = X_unlabeled_iter[high_conf_idx]
    
    # 加入训练集
    X_train_iter = vstack([X_train_iter, X_pseudo_iter])
    y_train_iter = np.concatenate([y_train_iter, pseudo_labels_iter])
    
    # 从无标签数据中剔除已用样本
    mask = np.ones(X_unlabeled_iter.shape[0], dtype=bool)
    mask[high_conf_idx] = False
    X_unlabeled_iter = X_unlabeled_iter[mask]
    
    print(f"第{i+1}轮加入了{len(high_conf_idx)}个伪标签样本")
