## Count the number of repositories under each metric
issues_closed: 71726 files
change_requests_reviews: 59783 files
participants: 73837 files
attention: 73427 files
issue_resolution_duration: 71665 files
technical_fork: 73390 files
issue_response_time: 72213 files
activity: 73842 files
issue_age: 72871 files
change_request_age: 74382 files
stars: 73255 files
issues_new: 72214 files
change_request_resolution_duration: 73474 files
new_contributors: 71181 files
change_requests_accepted: 71662 files
change_request_response_time: 73105 files
bus_factor: 73757 files
code_change_lines_add: 73734 files
code_change_lines_remove: 73727 files
code_change_lines_sum: 73734 files
change_requests: 73736 files
inactive_contributors: 71438 files
issue_comments: 73781 files


In [None]:
import os

def count_files_in_folders(directory):
    """Count the regular files directly inside each immediate subfolder.

    Args:
        directory: Path of the parent directory whose subfolders are inspected.

    Returns:
        A dict mapping each subfolder name to the number of regular files
        found directly in it. Nested directories are not descended into, and
        entries of ``directory`` that are not directories are ignored.
    """
    def _regular_file_total(path):
        # Only plain files count; nested directories are skipped.
        return sum(
            1
            for entry in os.listdir(path)
            if os.path.isfile(os.path.join(path, entry))
        )

    candidates = (
        (entry, os.path.join(directory, entry))
        for entry in os.listdir(directory)
    )
    return {
        name: _regular_file_total(path)
        for name, path in candidates
        if os.path.isdir(path)
    }

def main():
    """Print the per-folder file counts of the raw repository data directory."""
    data_dir = 'data/repo_data'

    # Guard clause: there is nothing to count when the directory is absent.
    if not os.path.exists(data_dir):
        print(f"Directory {data_dir} does not exist.")
        return

    for folder, count in count_files_in_folders(data_dir).items():
        print(f"{folder}: {count} files")

if __name__ == '__main__':
    main()

### Convert JSON format to CSV format

In [None]:
import os
import json
import pandas as pd

def _is_year_month_key(key):
    """Return True when ``key`` is a string of the exact form yyyy-MM (e.g. "2021-03")."""
    return (
        isinstance(key, str)
        and len(key) == 7
        and key[4] == '-'
        and key[:4].isdigit()
        and key[5:].isdigit()
    )


def extract_data_from_json(repo_id, json_file_path, is_avg_structure=False):
    """Extract the monthly (yyyy-MM) values of one repository from a JSON file.

    Args:
        repo_id: Identifier stored under the 'repo_id' key of the result.
        json_file_path: Path of the JSON file to read.
        is_avg_structure: When True, the monthly values are read from the
            object stored under the top-level 'avg' key instead of from the
            top-level object itself.

    Returns:
        A dict mapping each yyyy-MM key to its value, plus a 'repo_id' entry,
        or None (after printing the reason) when the file is empty, cannot be
        decoded, has an unexpected structure, or contains no yyyy-MM keys.
    """
    # A zero-byte file would only yield a decode error; report it explicitly.
    if os.path.getsize(json_file_path) == 0:
        print(f"Empty file: {json_file_path}")
        return None

    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Error decoding JSON in file: {json_file_path}")
        return None

    if is_avg_structure:
        # "avg" structure: the monthly values live under the 'avg' key.
        if not isinstance(data, dict) or 'avg' not in data:
            print(f"No 'avg' key found in {json_file_path}")
            return None
        monthly_source = data['avg']
    else:
        # Otherwise the monthly values are at the root of the document.
        monthly_source = data

    # A list or scalar has no .items(); treat it as a file without usable data
    # instead of crashing the whole merge.
    if not isinstance(monthly_source, dict):
        print(f"Unexpected JSON structure in {json_file_path}")
        return None

    filtered_data = {
        k: v for k, v in monthly_source.items() if _is_year_month_key(k)
    }

    if not filtered_data:
        print(f"No valid yyyy-MM data found in {json_file_path}")
        return None

    filtered_data['repo_id'] = repo_id

    return filtered_data

def process_and_merge_files(input_dir, is_avg_structure=False):
    """Merge the monthly data of every JSON file in a folder into one DataFrame.

    Args:
        input_dir: Folder containing one ``<repo_id>.json`` file per repository.
        is_avg_structure: Passed through to ``extract_data_from_json``.

    Returns:
        A DataFrame with one row per file that yielded data: a 'repo_id'
        column followed by the remaining columns in sorted order. An empty
        DataFrame is returned when no file yielded data.
    """
    records = []
    json_names = (name for name in os.listdir(input_dir) if name.endswith('.json'))
    for json_name in json_names:
        # The file name without its extension is the repository id.
        repo_id, _extension = os.path.splitext(json_name)
        record = extract_data_from_json(
            repo_id, os.path.join(input_dir, json_name), is_avg_structure
        )
        if record:
            records.append(record)

    merged = pd.DataFrame(records)
    if merged.empty:
        return merged

    # Put 'repo_id' first and order the remaining columns by name.
    month_columns = [column for column in merged.columns if column != 'repo_id']
    month_columns.sort()
    return merged[['repo_id', *month_columns]]

def process_all_folders(base_dir, output_dir):
    """Merge the JSON files of every metric subfolder of ``base_dir`` into CSV files.

    For each subfolder ``<name>`` a file ``<name>_merged.csv`` is written to
    ``output_dir`` when the folder yields any data.

    Args:
        base_dir: Directory whose immediate subfolders each hold the JSON
            files of one metric.
        output_dir: Directory receiving the merged CSV files; created when
            it does not exist yet.
    """
    # Metrics whose monthly values are stored under the top-level 'avg' key.
    avg_structure_folders = {
        "change_request_age", "change_request_resolution_duration",
        "change_request_response_time", "issue_age", "issue_response_time",
        "issue_resolution_duration",
    }

    os.makedirs(output_dir, exist_ok=True)

    for folder in os.listdir(base_dir):
        input_dir = os.path.join(base_dir, folder)

        # os.listdir also returns plain files (e.g. .DS_Store); listing one as
        # if it were a folder would raise NotADirectoryError.
        if not os.path.isdir(input_dir):
            print(f"Skipping non-directory entry: {input_dir}")
            continue

        output_file = os.path.join(output_dir, f"{folder}_merged.csv")

        # Process and merge the JSON files in this folder.
        merged_df = process_and_merge_files(input_dir, folder in avg_structure_folders)

        if merged_df.empty:
            print(f"No data to merge in folder: {folder}")
            continue

        merged_df.to_csv(output_file, index=False)
        print(f"Merged data saved to {output_file}")

# Entry point
def main():
    """Convert every metric folder of the input directory into a merged CSV file."""
    base_dir = 'data/repo_data_final'
    output_dir = 'data/repo_analysis_final'

    # exist_ok=True replaces the separate exists() check, which left a window
    # between the check and the creation of the directory.
    os.makedirs(output_dir, exist_ok=True)

    # Process all JSON files in the folders and merge them into CSV files.
    process_all_folders(base_dir, output_dir)

if __name__ == '__main__':
    main()


### Count the rows of each metric file.
File: code_change_lines_sum_merged.csv, Rows: 72923
File: code_change_lines_add_merged.csv, Rows: 73306
File: technical_fork_merged.csv, Rows: 72787
File: issue_resolution_duration_merged.csv, Rows: 71399
File: bus_factor_merged.csv, Rows: 73213
File: change_requests_reviews_merged.csv, Rows: 59613
File: issue_age_merged.csv, Rows: 70541
File: participants_merged.csv, Rows: 73288
File: change_request_response_time_merged.csv, Rows: 72838
File: stars_merged.csv, Rows: 72570
File: change_request_age_merged.csv, Rows: 70840
File: issue_response_time_merged.csv, Rows: 71958
File: change_requests_merged.csv, Rows: 73350
File: new_contributors_merged.csv, Rows: 70939
File: code_change_lines_remove_merged.csv, Rows: 73207
File: attention_merged.csv, Rows: 72703
File: issues_closed_merged.csv, Rows: 71355
File: change_request_resolution_duration_merged.csv, Rows: 73184
File: inactive_contributors_merged.csv, Rows: 64250
File: activity_merged.csv, Rows: 73816
File: issue_comments_merged.csv, Rows: 73302
File: issues_new_merged.csv, Rows: 71798
File: change_requests_accepted_merged.csv, Rows: 71330


In [None]:
import os
import pandas as pd

def count_rows_in_csv_files(directory):
    """Print the number of data rows of every CSV file in ``directory``.

    The count is the row count of the DataFrame returned by ``pd.read_csv``,
    so the header line is not included. A file that cannot be read is
    reported with its error message and the remaining files are still
    processed.

    Args:
        directory: Folder whose ``*.csv`` files are inspected.
    """
    csv_names = [entry for entry in os.listdir(directory) if entry.endswith('.csv')]
    for csv_name in csv_names:
        csv_path = os.path.join(directory, csv_name)
        try:
            frame = pd.read_csv(csv_path)
            # Number of data rows; the header is not part of the index.
            n_rows = len(frame.index)
            print(f"File: {csv_name}, Rows: {n_rows}")
        except Exception as e:
            print(f"Error reading {csv_name}: {e}")

# Directory that holds the merged per-metric CSV files
directory = 'data/repo_analysis_final'

# Print the row count of every CSV file in that directory
count_rows_in_csv_files(directory)


## Merge two sets of tag data and take the intersection.

In [None]:
import pandas as pd
import os

# Input folders with the category files of the two labelling results to
# intersect (the original note refers to them as bot_v2 and bot_v3).
folders = ['data/bot_label_result_1/category', 'data/bot_label_result_2/category']
output_folder = 'data/final/actor_category'

# Category files to compare; each name is looked up in both input folders
files = [
    "CI_CD_Bots.csv",
    "Code_Review_Bots.csv",
    "Code_Security_Review_Bots.csv",
    "Collaboration_and_Communication_Bots.csv",
    "Configuration_Management_Bots.csv",
    "Documentation_Generation_Bots.csv",
    "Open_Source_Compliance_Inspection_Bots.csv",
    "Periodic_Report_Bots.csv",
    "Workflow_Control_Bots.csv"
]

def load_actor_ids(filepath):
    """Read a CSV file and return the distinct values of its 'actor_id' column as a set."""
    frame = pd.read_csv(filepath)
    actor_column = frame['actor_id']
    return set(actor_column)

def merge_intersections(folders, files, output_folder):
    """Write, for each file name, the actor_ids present in both input folders.

    Args:
        folders: Sequence whose first two entries are the folders to compare.
        files: File names that are looked up in both folders.
        output_folder: Directory receiving one CSV (single 'actor_id' column)
            per file name; created when it does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    first_folder, second_folder = folders[0], folders[1]

    for file_name in files:
        first_path = os.path.join(first_folder, file_name)
        second_path = os.path.join(second_folder, file_name)

        # Report a missing input instead of skipping the pair silently, so a
        # missing output file can be traced back to its cause.
        missing = [p for p in (first_path, second_path) if not os.path.exists(p)]
        if missing:
            print(f"Skipping {file_name}: missing input file(s): {', '.join(missing)}")
            continue

        shared_actors = load_actor_ids(first_path).intersection(
            load_actor_ids(second_path)
        )

        # Set iteration order is arbitrary; sort so repeated runs write
        # identical files.
        try:
            ordered_actors = sorted(shared_actors)
        except TypeError:
            # Values of mixed types cannot be compared with each other;
            # order them by their text form instead.
            ordered_actors = sorted(shared_actors, key=str)

        output_file = os.path.join(output_folder, file_name)
        pd.DataFrame(ordered_actors, columns=['actor_id']).to_csv(output_file, index=False)
        print(f"Saved merged actor_id to {output_file}")

# Run the merge: write the intersected actor_ids of every listed category file
merge_intersections(folders, files, output_folder)


## Count the final number of labels.

In [1]:
import pandas as pd
import os

# Folder with the intersected category files and the file names to count
# NOTE(review): this list has no "Configuration_Management_Bots.csv", which the
# file list of the merge step includes - confirm the omission is intentional.
folder_path = 'data/final/actor_category'
files = [
    "CI_CD_Bots.csv",
    "Code_Review_Bots.csv",
    "Code_Security_Review_Bots.csv",
    "Collaboration_and_Communication_Bots.csv",
    "Documentation_Generation_Bots.csv",
    "Open_Source_Compliance_Inspection_Bots.csv",
    "Periodic_Report_Bots.csv",
    "Workflow_Control_Bots.csv"
]
# Maps each file name to its number of unique actor_id values
actor_counts = {}

# Count the unique actor_id values of every listed file; files that do not
# exist are left out of the result without a message.
for file in files:
    file_path = os.path.join(folder_path, file)
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        actor_counts[file] = df['actor_id'].nunique()

# Print the resulting counts
print(actor_counts)


{'CI_CD_Bots.csv': 551, 'Code_Review_Bots.csv': 406, 'Code_Security_Review_Bots.csv': 44, 'Collaboration_and_Communication_Bots.csv': 77, 'Documentation_Generation_Bots.csv': 52, 'Open_Source_Compliance_Inspection_Bots.csv': 37, 'Periodic_Report_Bots.csv': 22, 'Workflow_Control_Bots.csv': 36}
