In [None]:
import pandas as pd

def load_data(data_dir=r'D:\dataset\cmu-cert r4.2\demo'):
    """Read the six demo CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory that contains the CSV files. The default is the location
        that used to be hard-coded in this function, so ``load_data()``
        without arguments reads the same files as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(user_activity_df, email_activity_df, file_activity_df,
        http_activity_df, department_info_df, psychometric_df)``, read from
        ``device.csv``, ``email.csv``, ``file.csv``, ``http.csv``,
        ``DepartmentInfo.csv`` and ``psychometric.csv`` in that order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when one of the files is missing.
    """
    import os  # function-scope import keeps this cell runnable on its own

    # Order matters: callers unpack the result positionally.
    file_names = (
        'device.csv',
        'email.csv',
        'file.csv',
        'http.csv',
        'DepartmentInfo.csv',
        'psychometric.csv',
    )
    return tuple(pd.read_csv(os.path.join(data_dir, name)) for name in file_names)

def clean_data(dfs):
    """Clean every DataFrame in ``dfs`` in place and return ``dfs``.

    For each frame, in this order:

    1. column names are lower-cased,
    2. the literal string ``'None'`` is replaced by ``pd.NA``,
    3. every row that contains a missing value is dropped,
    4. a ``date`` column, if present, is converted with ``pd.to_datetime``.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to clean. They are modified in place; code that keeps a
        reference to the same container sees the cleaned frames.

    Returns
    -------
    The same ``dfs`` object that was passed in.
    """
    for df in dfs:
        # Lower-case the column names first so that the 'date' lookup below
        # also matches columns spelled 'Date' or 'DATE' in the raw file.
        df.columns = [str(column).lower() for column in df.columns]
        # Treat the literal string 'None' as a missing value, then drop
        # every row that has any missing value.
        df.replace('None', pd.NA, inplace=True)
        df.dropna(inplace=True)
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])
    return dfs

# Load the raw CSV files and clean them.
# clean_data modifies the frames in place and returns the same container, so
# `dfs` refers to the cleaned frames as well; later cells pass `dfs` on directly.
dfs = load_data()
(user_activity_df, email_activity_df, file_activity_df, http_activity_df, department_info_df, psychometric_df) = clean_data(dfs)


In [None]:
def find_inconsistent_user_ids(dfs):
    """List, per frame, the user IDs that are absent from the department table.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Six frames in the order ``(user_activity_df, email_activity_df,
        file_activity_df, http_activity_df, department_info_df,
        psychometric_df)``. The department frame (second to last) must have
        a ``user_id`` column; every other frame must have either a
        ``user_id`` or a ``user`` column.

    Returns
    -------
    dict[str, list]
        Maps each checked frame's name to the unique user IDs found in that
        frame but not in ``department_info_df['user_id']``.

    Raises
    ------
    KeyError
        If a frame has none of the expected user columns.
    """
    department_info_df = dfs[-2]
    known_user_ids = set(department_info_df['user_id'])

    # Check every frame except the department table itself. The previous
    # dfs[:-1] slice paired the 'psychometric_df' label with the department
    # frame, so the psychometric data was never actually checked.
    frame_names = ['user_activity_df', 'email_activity_df', 'file_activity_df',
                   'http_activity_df', 'psychometric_df']
    frames_to_check = list(dfs[:-2]) + [dfs[-1]]

    inconsistent_user_ids = {}
    for df_name, df in zip(frame_names, frames_to_check):
        df_key = 'user_id' if 'user_id' in df.columns else 'user'
        frame_ids = df[df_key]
        unknown_ids = frame_ids[~frame_ids.isin(known_user_ids)]
        inconsistent_user_ids[df_name] = unknown_ids.unique().tolist()
    return inconsistent_user_ids

# Find user IDs that do not appear in the department table.
inconsistent_user_ids = find_inconsistent_user_ids(dfs)


In [None]:
def extract_triples(dfs):
    """Convert the six frames into ``(source, relation, target)`` triples.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Exactly six frames in the order ``(user_activity_df,
        email_activity_df, file_activity_df, http_activity_df,
        department_info_df, psychometric_df)``.

    Returns
    -------
    list of tuple
        Triples in frame order, then row order, then rule order. A frame is
        skipped when its key column (``user`` or ``user_id``) is missing. A
        column missing from a processed frame contributes the placeholder
        ``'Unknown'``. A triple is omitted when its source or target value
        is missing according to ``pd.isna``.
    """
    (device_df, email_df, file_df, http_df, dept_df, psych_df) = dfs
    placeholder = 'Unknown'

    # Each rule is (source column, relation label, target column).
    device_rules = [
        ('user', '进行活动', 'activity'),
        ('user', '使用设备', 'pc'),
    ]
    # The timestamp triple is only produced when the frame has a date column.
    if 'date' in device_df.columns:
        device_rules.append(('activity', '发生在', 'date'))

    email_rules = [
        ('user', '发送邮件到', 'to'),
        ('user', '使用设备', 'pc'),
        ('id', '邮件大小', 'size'),
        ('id', '附件数量', 'attachments'),
        ('id', '邮件内容', 'content'),
    ]
    file_rules = [
        ('user', '访问文件', 'filename'),
        ('filename', '文件内容', 'content'),
    ]
    http_rules = [
        ('user', '访问网址', 'url'),
        ('url', '网址内容', 'content'),
    ]
    dept_rules = [
        ('user_id', '属于部门', 'department'),
        ('user_id', '角色是', 'role'),
        ('user_id', '在业务单元', 'business_unit'),
        ('user_id', '在功能单元', 'functional_unit'),
    ]
    psych_rules = [
        ('user_id', '开放性得分', 'o'),
        ('user_id', '尽责性得分', 'c'),
        ('user_id', '外向性得分', 'e'),
        ('user_id', '宜人性得分', 'a'),
        ('user_id', '神经质得分', 'n'),
    ]

    # (frame, key column that must exist for the frame to be processed, rules)
    plan = [
        (device_df, 'user', device_rules),
        (email_df, 'user', email_rules),
        (file_df, 'user', file_rules),
        (http_df, 'user', http_rules),
        (dept_df, 'user_id', dept_rules),
        (psych_df, 'user_id', psych_rules),
    ]

    collected = []
    for frame, key_column, rules in plan:
        if key_column not in frame.columns:
            continue
        for _, record in frame.iterrows():
            for source_column, relation, target_column in rules:
                source = record.get(source_column, placeholder)
                target = record.get(target_column, placeholder)
                if pd.isna(source) or pd.isna(target):
                    continue
                collected.append((source, relation, target))
    return collected

# Extract (source, relation, target) triples from the frames in `dfs`.
triples = extract_triples(dfs)


In [None]:
import networkx as nx

def build_graph(triples):
    """Build a directed graph from ``(source, relation, target)`` triples.

    Each triple becomes an edge ``source -> target`` carrying the relation
    in the ``label`` attribute and a ``weight`` of ``1.0``. The graph is a
    plain ``DiGraph``, so a later triple for the same node pair replaces the
    attributes set by an earlier one.

    Parameters
    ----------
    triples : iterable of sequence
        Items indexed as ``[0]`` source, ``[1]`` relation, ``[2]`` target.

    Returns
    -------
    networkx.DiGraph
    """
    graph = nx.DiGraph()
    # Every weight is the float 1.0 from the start, so no separate pass is
    # needed to coerce weights to float afterwards.
    graph.add_edges_from(
        (triple[0], triple[2], {'label': triple[1], 'weight': 1.0})
        for triple in triples
    )
    return graph

def filter_graph(G, min_degree=5):
    """Return a copy of ``G`` restricted to its well-connected nodes.

    Nodes whose degree in ``G`` is strictly greater than ``min_degree`` are
    kept. Nodes that end up with no edges inside that subgraph are then
    removed as well.

    Parameters
    ----------
    G : networkx graph
        Graph to filter. It is not modified.
    min_degree : int, optional
        Exclusive lower bound on the node degree. The default of ``5``
        matches the threshold that used to be hard-coded.

    Returns
    -------
    networkx.DiGraph
        A new, mutable graph holding the retained nodes and edges.
    """
    well_connected = {node for node in G if G.degree(node) > min_degree}
    # nx.subgraph returns a read-only view; copy it into a DiGraph so that
    # nodes can be removed from the result.
    filtered_G = nx.DiGraph(nx.subgraph(G, well_connected))
    filtered_G.remove_nodes_from(list(nx.isolates(filtered_G)))
    return filtered_G

# Build the directed graph from the triples, then filter it by node degree.
G = build_graph(triples)
filtered_G = filter_graph(G)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from networkx.algorithms.community import greedy_modularity_communities

def detect_communities(G):
    """Run greedy modularity community detection on ``G``.

    The ``weight`` edge attribute is passed as the edge weight.

    Returns
    -------
    list
        The communities produced by ``greedy_modularity_communities``,
        materialised as a list.
    """
    detected = greedy_modularity_communities(G, weight='weight')
    return list(detected)

def visualize_graph(G, communities, font_path='C:/Windows/Fonts/simhei.ttf', seed=42):
    """Draw ``G`` with nodes coloured by community and edges labelled by relation.

    Parameters
    ----------
    G : networkx graph
        Graph to draw. Edge labels are read from the ``label`` attribute.
    communities : iterable of iterable
        Groups of nodes. A node's colour is the index of its group.
    font_path : str, optional
        Font file used for the labels and the title. The node labels and the
        title contain Chinese text, so the font needs those glyphs. If the
        file does not exist, a warning is issued and matplotlib's default
        font is used instead of raising.
    seed : int or None, optional
        Seed for the spring layout so that re-running the cell reproduces
        the same picture. Pass ``None`` for a different layout on each call.

    Side effects
    ------------
    Sets ``plt.rcParams['font.family']`` globally and shows a new figure.
    """
    import os
    import warnings

    community_map = {node: i for i, community in enumerate(communities) for node in community}

    if os.path.isfile(font_path):
        # Register the file with the font manager so that lookups by family
        # name (rcParams and the font_family arguments below) resolve to it.
        fm.fontManager.addfont(font_path)
        font_prop = fm.FontProperties(fname=font_path)
    else:
        warnings.warn(
            "Font file %r not found; falling back to matplotlib's default font "
            "(Chinese text may not render)." % (font_path,)
        )
        font_prop = fm.FontProperties()
    font_name = font_prop.get_name()
    plt.rcParams['font.family'] = font_name

    plt.figure(figsize=(15, 10))
    pos = nx.spring_layout(G, k=0.3, seed=seed)
    # Nodes that belong to no community get colour index -1 instead of
    # raising KeyError.
    node_color = [community_map.get(node, -1) for node in G.nodes()]
    nx.draw(G, pos, with_labels=True, node_size=700, node_color=node_color, cmap=plt.cm.rainbow,
            font_size=10, font_weight="bold", arrowsize=15, font_family=font_name)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8, font_family=font_name)
    plt.title("三元组网络图 - 社区检测", fontproperties=font_prop)
    plt.show()

# Detect communities in the filtered graph and plot them.
communities = detect_communities(filtered_G)
visualize_graph(filtered_G, communities)
