In [10]:
import os
import json
import pandas as pd
import logging
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
import gradio as gr
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib


df = pd.read_csv("data/test_login_logs.csv", encoding="utf-8")



  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def preprocess_data(df):
        """
        数据清洗与特征提取
        """
        logging.info("开始数据清洗和特征提取...")

        # 检查空值并填充
        if df.isnull().sum().sum() > 0:
            logging.warning(f"原始数据中存在空值，进行填充处理。\n空值统计:\n{df.isnull().sum()}")
            df.fillna(value={"登录时间": pd.NaT, "登录结果": "unknown", "用户ID": "unknown_user", "登录地址": "0.0.0.0"}, inplace=True)

        # 转换登录时间
        df["登录时间"] = pd.to_datetime(df["登录时间"], errors="coerce")
        if df["登录时间"].isnull().any():
            logging.warning("部分登录时间无法转换为时间格式，将填充为最早时间。")
            df["登录时间"].fillna(df["登录时间"].min(), inplace=True)

        # 将登录结果转换为数值
        df["登录结果"] = df["登录结果"].apply(lambda x: 1 if x == "success" else 0 if x == "failure" else -1)

        # 计算登录失败次数
        df["登录失败次数"] = df.groupby("用户ID")["登录结果"].transform(lambda x: (x == 0).sum())

        # 计算登录成功率
        df["登录成功率"] = df.groupby("用户ID")["登录结果"].transform(lambda x: (x == 1).mean())

        # 计算时间范围（分钟）
        df["时间范围分钟"] = df.groupby("用户ID")["登录时间"].transform(
            lambda x: (x.max() - x.min()).total_seconds() / 60 if len(x.dropna()) > 1 else 0
        )

        # 每分钟失败比例
        df["每分钟失败比例"] = df["登录失败次数"] / (df["时间范围分钟"].replace(0, 1))  # 避免除以零

        # 按用户ID和登录时间排序
        df = df.sort_values(by=["用户ID", "登录时间"])

        # 标记失败登录
        df["失败登录"] = (df["登录结果"] == 0).astype(int)

        # 计算时间间隔（秒）
        df["时间间隔"] = df.groupby("用户ID")["登录时间"].diff().dt.total_seconds().fillna(0)

        # 标记连续失败组（时间间隔大于60秒为新组）
        df["失败组"] = (
            (df["失败登录"] == 1) & ((df["时间间隔"] > 60) | (df["时间间隔"].isna()))
        ).cumsum()

        # 统计每组失败次数
        df["连续失败次数"] = df.groupby(["用户ID", "失败组"])["失败登录"].transform("sum")

        # 添加连续失败3次特征
        df["连续失败3次"] = (df["连续失败次数"] >= 3).astype(int)

        # 对用户ID进行编码
        encoder = LabelEncoder()
        df["用户编码"] = encoder.fit_transform(df["用户ID"])

        # 检查非数值列是否仍有异常值
        non_numeric_columns = df.select_dtypes(include=["object"]).columns
        for col in non_numeric_columns:
            unique_values = df[col].unique()
            logging.info(f"列 {col} 的唯一值：{unique_values[:5]} (仅显示前5个)")

        # 确保所有列无缺失值
        if df.isnull().sum().sum() > 0:
            logging.error("数据清洗后仍存在缺失值，请检查数据处理流程。")
            raise ValueError("数据清洗后仍存在缺失值。")

        logging.info("数据清洗完成。")
        logging.info(f"清洗后的数据预览：\n{df.head()}")
        return df

In [12]:
pre_df = preprocess_data(df)
pre_df

Unnamed: 0,用户ID,登录时间,登录地址,登录资源,登录结果,登录失败次数,登录成功率,时间范围分钟,每分钟失败比例,失败登录,时间间隔,失败组,连续失败次数,连续失败3次,用户编码
10325,unknown_user,2023-01-25 00:00:00,192.168.1.5,server2,0,26,0.480000,49.0,0.530612,1,0.0,0,26,1,0
10326,unknown_user,2023-01-25 00:01:00,198.51.100.27,server2,1,26,0.480000,49.0,0.530612,0,60.0,0,26,1,0
10327,unknown_user,2023-01-25 00:02:00,10.0.0.20,server1,1,26,0.480000,49.0,0.530612,0,60.0,0,26,1,0
10328,unknown_user,2023-01-25 00:03:00,172.16.0.40,server1,0,26,0.480000,49.0,0.530612,1,60.0,0,26,1,0
10329,unknown_user,2023-01-25 00:04:00,172.16.0.44,server2,0,26,0.480000,49.0,0.530612,1,60.0,0,26,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8389,user99,2023-01-06 19:49:00,203.0.113.47,server2,1,15,0.732143,9646.0,0.001555,0,2880.0,2032,1,0,200
8418,user99,2023-01-06 20:18:00,203.0.113.15,server3,0,15,0.732143,9646.0,0.001555,1,1740.0,2033,1,0,200
8421,user99,2023-01-06 20:21:00,192.168.1.24,server2,1,15,0.732143,9646.0,0.001555,0,180.0,2033,1,0,200
8788,user99,2023-01-07 02:28:00,203.0.113.8,server1,1,15,0.732143,9646.0,0.001555,0,22020.0,2033,1,0,200


In [13]:
# 提取特征
features = ["登录失败次数", "登录成功率", "时间范围分钟", "每分钟失败比例", "连续失败3次"]
X_test = pre_df[features]

In [16]:
model = joblib.load("anomaly_detection_model.pkl")
pre_df["异常类型编码"] = model.predict(X_test)
pre_df["异常类型编码"] = pre_df["异常类型编码"].round().astype(int)



In [17]:
pre_df

Unnamed: 0,用户ID,登录时间,登录地址,登录资源,登录结果,登录失败次数,登录成功率,时间范围分钟,每分钟失败比例,失败登录,时间间隔,失败组,连续失败次数,连续失败3次,用户编码,异常类型编码
10325,unknown_user,2023-01-25 00:00:00,192.168.1.5,server2,0,26,0.480000,49.0,0.530612,1,0.0,0,26,1,0,1
10326,unknown_user,2023-01-25 00:01:00,198.51.100.27,server2,1,26,0.480000,49.0,0.530612,0,60.0,0,26,1,0,1
10327,unknown_user,2023-01-25 00:02:00,10.0.0.20,server1,1,26,0.480000,49.0,0.530612,0,60.0,0,26,1,0,1
10328,unknown_user,2023-01-25 00:03:00,172.16.0.40,server1,0,26,0.480000,49.0,0.530612,1,60.0,0,26,1,0,1
10329,unknown_user,2023-01-25 00:04:00,172.16.0.44,server2,0,26,0.480000,49.0,0.530612,1,60.0,0,26,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8389,user99,2023-01-06 19:49:00,203.0.113.47,server2,1,15,0.732143,9646.0,0.001555,0,2880.0,2032,1,0,200,3
8418,user99,2023-01-06 20:18:00,203.0.113.15,server3,0,15,0.732143,9646.0,0.001555,1,1740.0,2033,1,0,200,3
8421,user99,2023-01-06 20:21:00,192.168.1.24,server2,1,15,0.732143,9646.0,0.001555,0,180.0,2033,1,0,200,3
8788,user99,2023-01-07 02:28:00,203.0.113.8,server1,1,15,0.732143,9646.0,0.001555,0,22020.0,2033,1,0,200,3


In [18]:
anomalies_map="""
{
    "1": "频繁登录失败, 未知用户登录",
    "2": "频繁登录失败, 非正常源地址, 未知用户登录",
    "3": "",
    "4": "非正常源地址",
    "5": "频繁登录失败",
    "6": "频繁登录失败, 非正常源地址"
}

"""

In [19]:
pre_df["异常类型"] = pre_df["异常类型编码"].map(anomalies_map)

TypeError: 'str' object is not callable

In [None]:
pre_df

In [2]:
df

Unnamed: 0,用户ID,登录时间,登录地址,登录资源,登录结果
0,user20,2023-01-01 00:00:00,192.168.1.10,server2,success
1,user103,2023-01-01 00:01:00,192.168.1.11,server2,success
2,user99,2023-01-01 00:02:00,10.0.0.7,server1,success
3,user169,2023-01-01 00:03:00,203.0.113.20,server1,success
4,user159,2023-01-01 00:04:00,172.16.0.22,server2,failure
...,...,...,...,...,...
10370,unknown_user,2023-01-25 00:45:00,203.0.113.5,server1,failure
10371,unknown_user,2023-01-25 00:46:00,192.168.1.30,server3,success
10372,unknown_user,2023-01-25 00:47:00,10.0.0.38,server3,failure
10373,unknown_user,2023-01-25 00:48:00,198.51.100.24,server1,success
