In [4]:
import hashlib
import pandas as pd
import numpy as np
import random
import uuid

In [3]:

def hash_email(email):
    """Return the hexadecimal SHA-256 digest of an e-mail address string."""
    hasher = hashlib.sha256()
    hasher.update(email.encode())
    return hasher.hexdigest()

def preprocess_user_data(department_info_path, psychometric_path, output_path):
    """Merge the department and psychometric CSVs into one user table and save it.

    Reads both CSVs, trims whitespace from text fields, left-joins the
    psychometric scores onto the department rows by ``employee_name``, fills
    missing scores with 0, adds ``user_type``, ``join_date`` and ``last_active``
    columns, derives ``user_id`` as the SHA-256 hash of each e-mail and writes
    the result to ``<output_path>/processed_user_data.csv``.

    Args:
        department_info_path: Path of the department-info CSV. It must contain
            ``employee_name`` (join key) and ``email`` (hashed into ``user_id``).
        psychometric_path: Path of the psychometric CSV. Its columns are renamed
            positionally, so it must have exactly seven columns.
        output_path: Directory that receives ``processed_user_data.csv``. It is
            created if it does not exist.

    Returns:
        None. The merged table is written to disk and a confirmation is printed.
    """
    import os  # local import: only needed to prepare the output directory

    # Load the raw inputs
    department_info = pd.read_csv(department_info_path)
    psychometric = pd.read_csv(psychometric_path)

    # Cast the text fields that are present to stripped strings.
    # NOTE: astype(str) turns missing values into the literal string 'nan', so
    # rows without an e-mail all end up with the same hashed user_id below.
    string_fields = ['employee_name', 'email', 'role', 'department', 'team', 'supervisor', 'business_unit']
    for field in string_fields:
        if field in department_info.columns:
            department_info[field] = department_info[field].astype(str).str.strip()

    department_info['user_type'] = 'Employee'  # assume every user is an employee

    # Psychometric data: rename columns positionally, then strip every string cell.
    psychometric.columns = ['employee_name', 'user_id', 'o_score', 'c_score', 'e_score', 'a_score', 'n_score']
    # Per-column Series.map replaces DataFrame.applymap, which is deprecated in
    # recent pandas releases; non-string cells pass through unchanged.
    for column in psychometric.columns:
        psychometric[column] = psychometric[column].map(
            lambda cell: cell.strip() if isinstance(cell, str) else cell
        )

    # Left join keeps every department row; employees without scores get 0.
    merged_data = pd.merge(department_info, psychometric, on='employee_name', how='left')
    score_defaults = {'o_score': 0, 'c_score': 0, 'e_score': 0, 'a_score': 0, 'n_score': 0}
    merged_data = merged_data.fillna(score_defaults)

    # Date fields
    merged_data['join_date'] = pd.to_datetime('2021-01-01')  # fixed placeholder date
    merged_data['last_active'] = pd.to_datetime('now')

    # user_id is the hash of the e-mail (this overwrites the psychometric user_id).
    # Series.map avoids a slow row-wise DataFrame.apply and still yields a Series
    # when the frame has no rows, where apply(axis=1) returns a DataFrame instead.
    merged_data['user_id'] = merged_data['email'].map(hash_email)

    # Make sure the target directory exists, then export.
    output_dir = os.fspath(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_file = output_dir + '/processed_user_data.csv'
    merged_data.to_csv(output_file, index=False)
    print(f"Data processed and saved to {output_file}")

# Input CSVs and the output directory for the user-preprocessing step
department_info_path, psychometric_path, output_path = (
    'D:/Pycharm_pro/demo/pythonProject/data/DepartmentInfo.csv',
    'D:/Pycharm_pro/demo/pythonProject/data/psychometric.csv',
    'D:/Pycharm_pro/demo/pythonProject/pro_data',
)

# Run the preprocessing
preprocess_user_data(
    department_info_path=department_info_path,
    psychometric_path=psychometric_path,
    output_path=output_path,
)


Data processed and saved to D:/Pycharm_pro/demo/pythonProject/pro_data/processed_user_data.csv


In [8]:
# Locations of the processed user table and the raw department table
user_data_path, department_data_path = (
    'D:/Pycharm_pro/demo/pythonProject/pro_data/processed_user_data.csv',
    'D:/Pycharm_pro/demo/pythonProject/data/DepartmentInfo.csv',
)

# Read both tables (users first, then departments)
user_df, department_df = (
    pd.read_csv(csv_path) for csv_path in (user_data_path, department_data_path)
)

# Distinct user ids found in the processed user table
user_ids = user_df.loc[:, 'user_id'].unique()

# 生成用户关系
def generate_user_relations(user_ids, num_relations=100, seed=None):
    """Build a table of random pairwise relations between users.

    Each relation links two distinct entries of ``user_ids`` and carries a
    random UUID, a random type and a random strength in [0, 1).

    Args:
        user_ids: Array-like of user identifiers to sample from.
        num_relations: Number of relations to generate.
        seed: Optional seed for reproducible pairs, types and strengths. When
            ``None`` (the default) the global ``np.random`` state is used, as
            before. ``relation_id`` values come from ``uuid.uuid4()`` and are
            random regardless of the seed.

    Returns:
        pandas.DataFrame with the columns ``relation_id``, ``user_id_1``,
        ``user_id_2``, ``relation_type`` and ``strength``. The frame is empty
        (but still has these columns) when ``num_relations`` is 0 or fewer
        than two users are available.
    """
    columns = ['relation_id', 'user_id_1', 'user_id_2', 'relation_type', 'strength']

    # A relation needs two distinct users; sampling without replacement from a
    # smaller population would raise inside numpy, so return an empty table.
    if len(user_ids) < 2:
        return pd.DataFrame(columns=columns)

    # RandomState exposes the same choice/uniform API as the np.random module,
    # so the sampling code below is identical for both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)
    relation_types = ['colleague', 'supervisor', 'subordinate']

    relations = []
    for _ in range(num_relations):
        users_pair = rng.choice(user_ids, size=2, replace=False)
        relations.append({
            'relation_id': str(uuid.uuid4()),
            'user_id_1': users_pair[0],
            'user_id_2': users_pair[1],
            'relation_type': rng.choice(relation_types),
            'strength': rng.uniform(0, 1),  # random relation strength
        })

    # Passing the column list keeps the schema stable even with zero rows.
    return pd.DataFrame(relations, columns=columns)

# Build the relation table for the loaded user ids and persist it as CSV
user_relations_df = generate_user_relations(user_ids=user_ids)
user_relations_df.to_csv(
    path_or_buf='D:/Pycharm_pro/demo/pythonProject/pro_data/user_relations.csv',
    index=False,
)


In [31]:
import pandas as pd
import uuid

# File paths
device_path = 'D:/Pycharm_pro/demo/pythonProject/data/device.csv'
email_path = 'D:/Pycharm_pro/demo/pythonProject/data/email.csv'
file_path = 'D:/Pycharm_pro/demo/pythonProject/data/file.csv'
http_path = 'D:/Pycharm_pro/demo/pythonProject/data/http.csv'
logon_path = 'D:/Pycharm_pro/demo/pythonProject/data/logon.csv'
output_path = 'D:/Pycharm_pro/demo/pythonProject/pro_data/activity_data.csv'

# Assumed date-time format of the raw logs.
# NOTE(review): this format is not applied below; the 'date' column is parsed
# with pandas' own format inference. Confirm it against the raw files before
# enforcing it.
date_format = "%Y/%m/%d %H:%M"


def _load_activity_csv(path):
    """Read one raw log CSV, renaming 'pc' -> 'pc_id' and 'user' -> 'user_id' when present."""
    return pd.read_csv(path).rename(columns={'pc': 'pc_id', 'user': 'user_id'})


# Load each log exactly once, with normalised column names
device_df = _load_activity_csv(device_path)
email_df = _load_activity_csv(email_path)
file_df = _load_activity_csv(file_path)
http_df = _load_activity_csv(http_path)
logon_df = _load_activity_csv(logon_path)

# Build a pc_id -> user_id mapping from the device log and use it to update
# user_id in the other logs; rows whose pc_id is not in the mapping keep their
# own user_id. If a pc_id occurs several times, the last device row wins.
pc_to_user = device_df.set_index('pc_id')['user_id'].to_dict()
for df in [email_df, file_df, http_df, logon_df]:
    df['user_id'] = df['pc_id'].map(pc_to_user).fillna(df['user_id'])

# Make sure every DataFrame has the required columns
default_columns = ['user_id', 'pc_id', 'device_type', 'location']
defaults = {'user_id': 'unknown', 'pc_id': 'unknown', 'device_type': 'unknown', 'location': 'unknown'}

data_frames = [device_df, email_df, file_df, http_df, logon_df]
for df in data_frames:
    for col, default in defaults.items():
        if col not in df.columns:
            df[col] = default  # add the missing column, filled with its default

# Tag each DataFrame with its activity type
device_df['activity_type'] = 'device'
email_df['activity_type'] = 'email'
file_df['activity_type'] = 'file'
http_df['activity_type'] = 'http'
logon_df['activity_type'] = 'logon'

# Combine all logs into one table
all_data = pd.concat(data_frames, ignore_index=True)

# Add the remaining required fields
all_data['activity_id'] = [str(uuid.uuid4()) for _ in range(len(all_data))]
all_data['datetime'] = pd.to_datetime(all_data['date'])  # assumes every log has a 'date' column
all_data['action_details'] = 'Details about the action'
all_data['risk_level'] = 'Low'
all_data['correlated_id'] = all_data['activity_id']
all_data['target_user_id'] = all_data['user_id']

# Select the final columns in output order
activity_columns = [
    'activity_id', 'datetime', 'user_id', 'pc_id', 'device_type',
    'location', 'activity_type', 'action_details', 'risk_level',
    'correlated_id', 'target_user_id'
]
activity_df = all_data[activity_columns]

# Save to CSV
activity_df.to_csv(output_path, index=False)
print("Activity data processed and saved successfully.")


Activity data processed and saved successfully.


In [49]:
import pandas as pd
import uuid

# Input log files, keyed by the activity type recorded for their rows
file_paths = {
    'device': 'D:/Pycharm_pro/demo/pythonProject/data/device.csv',
    'email': 'D:/Pycharm_pro/demo/pythonProject/data/email.csv',
    'file': 'D:/Pycharm_pro/demo/pythonProject/data/file.csv',
    'http': 'D:/Pycharm_pro/demo/pythonProject/data/http.csv',
    'logon': 'D:/Pycharm_pro/demo/pythonProject/data/logon.csv'
}
output_path = 'D:/Pycharm_pro/demo/pythonProject/pro_data/activity.csv'

# Columns that must exist in every frame
required_columns = ['user_id', 'pc_id', 'device_type', 'location', 'activity_type']

# Load and normalise each log
data_frames = []
for key, path in file_paths.items():
    # Rename 'pc' -> 'pc_id' and 'user' -> 'user_id' where those columns exist
    df = pd.read_csv(path).rename(columns={'pc': 'pc_id', 'user': 'user_id'})

    # Add any missing required column, filled with 'unknown'
    for column in required_columns:
        if column not in df.columns:
            df[column] = 'unknown'

    df['activity_type'] = key
    df['datetime'] = pd.to_datetime(df['date'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

    # errors='coerce' silently turns dates that do not match the format into
    # NaT, so report how many non-empty dates were lost.
    unparsed = int((df['datetime'].isna() & df['date'].notna()).sum())
    if unparsed:
        print(f"Warning: {unparsed} of {len(df)} '{key}' rows have a date that does not "
              f"match '%d/%m/%Y %H:%M:%S'; their datetime is NaT.")

    data_frames.append(df)

# Combine all frames
all_data = pd.concat(data_frames, ignore_index=True)

# Add the remaining required fields
all_data['activity_id'] = [str(uuid.uuid4()) for _ in range(len(all_data))]
all_data['action_details'] = 'Details about the action'
all_data['risk_level'] = 'Low'
all_data['correlated_id'] = all_data['activity_id']
all_data['target_user_id'] = all_data['user_id']

# Columns of the final frame, in output order
activity_columns = [
    'activity_id', 'datetime', 'user_id', 'pc_id', 'device_type',
    'location', 'activity_type', 'action_details', 'risk_level',
    'correlated_id', 'target_user_id'
]
try:
    activity_df = all_data[activity_columns]
    activity_df.to_csv(output_path, index=False)
    print("Activity data processed and saved successfully.")
except Exception as e:
    print(f"Failed to save data: {e}")
    # Re-raise so the cell is marked as failed instead of looking successful
    raise


Activity data processed and saved successfully.
