In [1]:
import pandas as pd
import os

# 读取csv文件
file_path = 'data/clean_data_actors.csv'
df = pd.read_csv(file_path)

# 获取每份的大小
num_splits = 20
split_size = len(df) // num_splits

# 创建输出文件夹
output_dir = 'data/splits'
os.makedirs(output_dir, exist_ok=True)

# 将数据分成20份并保存
for i in range(num_splits):
    start_idx = i * split_size
    end_idx = (i + 1) * split_size if i != num_splits - 1 else len(df)

    split_df = df.iloc[start_idx:end_idx]
    output_file = os.path.join(output_dir, f'clean_data_actors_part_{i+1}.csv')

    # 保存每一份数据
    split_df.to_csv(output_file, index=False)

print(f'Successfully split the file into {num_splits} parts.')


Successfully split the file into 20 parts.


In [None]:
import os
import pandas as pd

# 定义需要合并的文件夹路径
folders = ['results', 'results_2', 'results_3', 'results_4', 'results_5']

# 用于存储所有数据的列表
all_data = []

# 遍历所有文件夹
for folder in folders:
    # 获取该文件夹中的所有CSV文件
    folder_path = os.path.join('rabbit/data', folder)
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

    # 读取每个CSV文件并添加到all_data列表中
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        all_data.append(df)

# 将所有数据合并为一个DataFrame
merged_df = pd.concat(all_data, ignore_index=True)

# 保存合并后的数据
output_file = 'rabbit/data/merged_results.csv'
merged_df.to_csv(output_file, index=False)

print(f'All CSV files merged into {output_file}')


In [None]:
import pandas as pd
import os

# 读取 filtered_clean_data_actors.csv 文件
file_path = 'rabbit/data/filtered_clean_data_actors.csv'
df = pd.read_csv(file_path)

# 获取每份的大小
num_splits = 10
split_size = len(df) // num_splits

# 创建输出文件夹
output_dir = 'rabbit/data/splits'
os.makedirs(output_dir, exist_ok=True)

# 将数据分成10份并保存
for i in range(num_splits):
    start_idx = i * split_size
    end_idx = (i + 1) * split_size if i != num_splits - 1 else len(df)

    split_df = df.iloc[start_idx:end_idx]
    output_file = os.path.join(output_dir, f'filtered_clean_data_actors_part_{i+1}.csv')

    # 保存每一份数据
    split_df.to_csv(output_file, index=False)

print(f'Successfully split the file into {num_splits} parts.')


In [6]:
import os
import pandas as pd

# 定义需要合并的文件夹路径
folders = ['results', 'results_2', 'results_3', 'results_4', 'results_5', 'result_3-4', 'result_4-5']

# 用于存储所有数据的列表
all_data = []

# 遍历所有文件夹
for folder in folders:
    # 获取该文件夹中的所有CSV文件
    folder_path = os.path.join(folder)
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

    # 读取每个CSV文件并添加到all_data列表中
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        all_data.append(df)

# 将所有数据合并为一个DataFrame
merged_df = pd.concat(all_data, ignore_index=True)

# 保存合并后的数据
output_file = 'data/merged_results.csv'
merged_df.to_csv(output_file, index=False)

print(f'All CSV files merged into {output_file}')


All CSV files merged into data/merged_results.csv


In [7]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 读取 merged_results.csv 和 merged_output.csv 文件
merged_results_path = 'data/merged_results.csv'
merged_output_path = 'data/merged_output.csv'

merged_results_df = pd.read_csv(merged_results_path)
merged_output_df = pd.read_csv(merged_output_path)

# 合并两个数据集，基于 merged_results 的 contributor 和 merged_output 的 clean_login 字段
merged_df = pd.merge(merged_output_df, merged_results_df, left_on='clean_login', right_on='contributor', how='inner')

# 保留 'data_label' 和 'type' 列中值为 'Bot', 'Human', 'Unknown' 的数据
valid_labels = ['Bot', 'Human', 'Unknown']
filtered_df = merged_df[(merged_df['data_label'].isin(valid_labels)) & (merged_df['type'].isin(valid_labels))]

# 映射标签为数值，'Bot' -> 1, 'Human' -> 0，忽略 'Unknown'
label_mapping = {'Bot': 1, 'Human': 0}
filtered_df['data_label'] = filtered_df['data_label'].map(label_mapping)
filtered_df['type'] = filtered_df['type'].map(label_mapping)

# 删除 'Unknown' 值
filtered_df = filtered_df.dropna(subset=['data_label', 'type'])

# 获取预测值和真实值
y_true = filtered_df['data_label']
y_pred = filtered_df['type']

# 计算 Accuracy, Precision, Recall, F1 Score, AUC
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary')
recall = recall_score(y_true, y_pred, average='binary')
f1 = f1_score(y_true, y_pred, average='binary')
auc = roc_auc_score(y_true, y_pred)

# 打印结果
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC: {auc}")


Accuracy: 0.9341473596326445
Precision: 0.2536407766990291
Recall: 0.9288888888888889
F1 Score: 0.3984747378455672
AUC: 0.9315813472979232


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['data_label'] = filtered_df['data_label'].map(label_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['type'] = filtered_df['type'].map(label_mapping)
