In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from datetime import timedelta
import ipaddress
import logging
from matplotlib import font_manager as fm

# 设置中文字体
font_path = "conf/simsun.ttc"
my_font = fm.FontProperties(fname=font_path)

# ------------------------------
# 设置日志
# ------------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# ------------------------------
# 工具函数
# ------------------------------
def ip_in_subnet(ip, subnet):
    """
    检查 IP 是否属于指定网段
    """
    try:
        return ipaddress.ip_address(ip) in ipaddress.ip_network(subnet)
    except ValueError:
        return False

# ------------------------------
# 异常规则模块
# ------------------------------
class AnomalyRules:
    def __init__(self):
        """
        初始化异常规则管理
        """
        self.rules = []

    def add_rule(self, name, func, description=""):
        """
        添加新的异常规则
        """
        if not callable(func):
            raise ValueError(f"规则 {name} 的逻辑必须是一个可调用对象（函数）。")
        self.rules.append({"name": name, "func": func, "description": description})
        logging.info(f"规则已添加: {name} - {description}")

    def apply_rules(self, df):
        """
        应用所有规则并生成 '是否异常' 标记
        """
        results = {}
        for rule in self.rules:
            logging.info(f"正在应用规则: {rule['name']} ({rule['description']})")
            results[rule["name"]] = rule["func"](df)

        # 合并规则结果
        anomaly_flags = pd.DataFrame(results)
        df["规则异常"] = anomaly_flags.any(axis=1).astype(int)
        df["异常类型"] = anomaly_flags.apply(
            lambda row: ", ".join([name for name, val in row.items() if val]), axis=1
        )

        # 统计每个规则匹配的数量
        for rule_name, matches in results.items():
            match_count = matches.sum()
            match_ratio = match_count / len(df)
            logging.info(f"规则 '{rule_name}' 匹配数量: {match_count} ({match_ratio:.2%})")

        logging.info("规则应用完成")
        return df

In [2]:
# ------------------------------
# 数据加载与生成
# ------------------------------
def load_login_data(file_path="data/user_login_data.csv"):
    """
    加载指定文件中的用户登录数据，如果文件不存在则生成测试数据并返回
    """
    import os
    if os.path.exists(file_path):
        # 文件存在时，读取数据
        try:
            df = pd.read_csv(file_path, encoding="gbk")
            logging.info(f"成功加载文件: {file_path}")
        except Exception as e:
            logging.error(f"无法加载文件 {file_path}: {e}")
            raise
    else:
        # 文件不存在时，生成模拟测试数据
        logging.warning(f"文件 {file_path} 不存在，生成测试数据")
        user_ids = [f"user{i}" for i in range(1, 101)]
        data = {
            "用户ID": np.random.choice(user_ids, size=5000),
            "登录时间": pd.date_range("2024-11-01", periods=5000, freq="T"),
            "登录地址": np.random.choice(["192.168.1.1", "10.16.0.1", "172.16.0.1", "10.17.0.1", "203.0.113.1"], size=5000),
            "登录资源": np.random.choice(["server1", "server2"], size=5000),
            "是否登录成功": np.random.choice(["success", "failure"], size=5000, p=[0.7, 0.3]),
        }
        df = pd.DataFrame(data)

        # 添加模拟异常
        df.loc[np.random.choice(df.index, 100), "用户ID"] = "unknown_user"
        df.loc[np.random.choice(df.index, 100), "登录地址"] = "203.0.113.255"

        logging.info("模拟登录数据生成完成")
    
    return df

In [16]:
df = load_login_data()
#df.info()
df["登录失败次数"] = df.groupby('用户ID')['是否登录成功'].transform(lambda x: (x == 0).sum())
print(df["登录失败次数"])

  "登录时间": pd.date_range("2024-11-01", periods=5000, freq="T"),
2024-11-23 08:56:45,081 - INFO - 模拟登录数据生成完成


0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    0
4997    0
4998    0
4999    0
Name: 登录失败次数, Length: 5000, dtype: int64


In [17]:
# ------------------------------
# 数据预处理
# ------------------------------
def preprocess_data(df):
    """
    数据清洗和特征提取
    """
    df["登录时间"] = pd.to_datetime(df["登录时间"], errors="coerce")

    # 添加登录是否成功的标志
    df["登录成功"] = df["是否登录成功"].apply(lambda x: 1 if x == "success" else 0)

    # 计算每个用户的登录失败次数、成功次数和成功率
    df["登录失败次数"] = df.groupby("用户ID")["登录成功"].transform(lambda x: (x == 0).sum())
    df["登录成功次数"] = df.groupby("用户ID")["登录成功"].transform(lambda x: (x == 1).sum())
    df["总登录次数"] = df.groupby("用户ID")["登录成功"].transform("count")
    df["登录成功率"] = df["登录成功次数"] / df["总登录次数"]

    # 计算每个用户的平均登录间隔
    df.sort_values(by=["用户ID", "登录时间"], inplace=True)
    df["时间差"] = df.groupby("用户ID")["登录时间"].diff().dt.total_seconds()
    df["平均登录间隔"] = df.groupby("用户ID")["时间差"].transform("mean").fillna(0)

    # 编码用户ID和地址
    encoder_user = LabelEncoder()
    df["用户编码"] = encoder_user.fit_transform(df["用户ID"])
    encoder_address = LabelEncoder()
    df["地址编码"] = encoder_address.fit_transform(df["登录地址"])

    logging.info("数据预处理完成")
    return df

df = preprocess_data(df)
df.info()
df.head(100)

2024-11-23 09:01:18,670 - INFO - 数据预处理完成


<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 42 to 4993
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   用户ID    5000 non-null   object        
 1   登录时间    5000 non-null   datetime64[ns]
 2   登录地址    5000 non-null   object        
 3   登录资源    5000 non-null   object        
 4   是否登录成功  5000 non-null   object        
 5   登录失败次数  5000 non-null   int64         
 6   登录成功    5000 non-null   int64         
 7   登录成功次数  5000 non-null   int64         
 8   总登录次数   5000 non-null   int64         
 9   登录成功率   5000 non-null   float64       
 10  时间差     4899 non-null   float64       
 11  平均登录间隔  5000 non-null   float64       
 12  用户编码    5000 non-null   int64         
 13  地址编码    5000 non-null   int64         
dtypes: datetime64[ns](1), float64(3), int64(6), object(4)
memory usage: 585.9+ KB


Unnamed: 0,用户ID,登录时间,登录地址,登录资源,是否登录成功,登录失败次数,登录成功,登录成功次数,总登录次数,登录成功率,时间差,平均登录间隔,用户编码,地址编码
42,unknown_user,2024-11-01 00:42:00,10.17.0.1,server1,failure,34,0,65,99,0.656566,,2990.816327,0,1
80,unknown_user,2024-11-01 01:20:00,10.16.0.1,server1,success,34,1,65,99,0.656566,2280.0,2990.816327,0,0
94,unknown_user,2024-11-01 01:34:00,203.0.113.1,server1,failure,34,0,65,99,0.656566,840.0,2990.816327,0,4
124,unknown_user,2024-11-01 02:04:00,192.168.1.1,server1,success,34,1,65,99,0.656566,1800.0,2990.816327,0,3
147,unknown_user,2024-11-01 02:27:00,10.16.0.1,server1,success,34,1,65,99,0.656566,1380.0,2990.816327,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4808,unknown_user,2024-11-04 08:08:00,10.16.0.1,server1,success,34,1,65,99,0.656566,7080.0,2990.816327,0,0
4812,unknown_user,2024-11-04 08:12:00,10.16.0.1,server1,success,34,1,65,99,0.656566,240.0,2990.816327,0,0
4855,unknown_user,2024-11-04 08:55:00,10.17.0.1,server1,success,34,1,65,99,0.656566,2580.0,2990.816327,0,1
4927,unknown_user,2024-11-04 10:07:00,172.16.0.1,server2,success,34,1,65,99,0.656566,4320.0,2990.816327,0,2


In [18]:
# ------------------------------
# 异常登录检测模型构建
# ------------------------------
def build_detection_model(df):
    """
    使用Isolation Forest构建异常检测模型
    """
    features = ["登录失败次数", "登录成功率", "平均登录间隔"]
    X = df[features]

    # 训练Isolation Forest模型
    model = IsolationForest(contamination=0.05, random_state=42)
    model.fit(X)

    # 预测异常分数
    df["模型异常分数"] = model.decision_function(X)
    df["模型异常"] = model.predict(X)
    # -1表示异常，1表示正常

    logging.info("异常检测模型构建完成")
    return model, df

In [19]:
model, df = build_detection_model(df)

2024-11-23 09:02:28,818 - INFO - 异常检测模型构建完成


In [20]:
df.info()
df.head(100)

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 42 to 4993
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   用户ID    5000 non-null   object        
 1   登录时间    5000 non-null   datetime64[ns]
 2   登录地址    5000 non-null   object        
 3   登录资源    5000 non-null   object        
 4   是否登录成功  5000 non-null   object        
 5   登录失败次数  5000 non-null   int64         
 6   登录成功    5000 non-null   int64         
 7   登录成功次数  5000 non-null   int64         
 8   总登录次数   5000 non-null   int64         
 9   登录成功率   5000 non-null   float64       
 10  时间差     4899 non-null   float64       
 11  平均登录间隔  5000 non-null   float64       
 12  用户编码    5000 non-null   int64         
 13  地址编码    5000 non-null   int64         
 14  模型异常分数  5000 non-null   float64       
 15  模型异常    5000 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(7), object(4)
memory usage: 664.1+ KB


Unnamed: 0,用户ID,登录时间,登录地址,登录资源,是否登录成功,登录失败次数,登录成功,登录成功次数,总登录次数,登录成功率,时间差,平均登录间隔,用户编码,地址编码,模型异常分数,模型异常
42,unknown_user,2024-11-01 00:42:00,10.17.0.1,server1,failure,34,0,65,99,0.656566,,2990.816327,0,1,-0.074919,-1
80,unknown_user,2024-11-01 01:20:00,10.16.0.1,server1,success,34,1,65,99,0.656566,2280.0,2990.816327,0,0,-0.074919,-1
94,unknown_user,2024-11-01 01:34:00,203.0.113.1,server1,failure,34,0,65,99,0.656566,840.0,2990.816327,0,4,-0.074919,-1
124,unknown_user,2024-11-01 02:04:00,192.168.1.1,server1,success,34,1,65,99,0.656566,1800.0,2990.816327,0,3,-0.074919,-1
147,unknown_user,2024-11-01 02:27:00,10.16.0.1,server1,success,34,1,65,99,0.656566,1380.0,2990.816327,0,0,-0.074919,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4808,unknown_user,2024-11-04 08:08:00,10.16.0.1,server1,success,34,1,65,99,0.656566,7080.0,2990.816327,0,0,-0.074919,-1
4812,unknown_user,2024-11-04 08:12:00,10.16.0.1,server1,success,34,1,65,99,0.656566,240.0,2990.816327,0,0,-0.074919,-1
4855,unknown_user,2024-11-04 08:55:00,10.17.0.1,server1,success,34,1,65,99,0.656566,2580.0,2990.816327,0,1,-0.074919,-1
4927,unknown_user,2024-11-04 10:07:00,172.16.0.1,server2,success,34,1,65,99,0.656566,4320.0,2990.816327,0,2,-0.074919,-1
