In [1]:
import os
import json

# Merge every JSON file in `folder_path` into one list, renumber the records,
# and write the result to `output_file`.

# Folder containing the JSON files
folder_path = "./"  # Update to your folder path
output_file = "Fraud24kRaw.json"

# The output is itself a ".json" file and, with the settings above, is written
# into the very folder being scanned. Resolve its absolute path once so it can
# be skipped below; otherwise re-running this cell would merge the previous
# result back into itself and duplicate every record.
output_abspath = os.path.abspath(output_file)

# Initialize an empty list to store data from all JSON files
combined_data = []

# Iterate in sorted order: os.listdir() returns names in arbitrary order, which
# would make the sequential IDs assigned below differ from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):  # Process only JSON files
        continue

    file_path = os.path.join(folder_path, filename)
    if os.path.abspath(file_path) == output_abspath:
        continue  # Never feed a previous run's output back in as input

    with open(file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)  # Load the content of the JSON file

    # A file may hold either a list of records or a single record
    if isinstance(data, list):
        combined_data.extend(data)
    else:
        combined_data.append(data)

# Reorder index (assign new unique IDs sequentially, starting from 0).
# This overwrites any 'id' the source records already carried.
for idx, item in enumerate(combined_data):
    item['id'] = idx

# Save the combined data to a single JSON file
with open(output_file, "w", encoding="utf-8") as out_file:
    json.dump(combined_data, out_file, indent=4, ensure_ascii=False)

print(f"All JSON files have been combined and reindexed into {output_file}")


All JSON files have been combined and reindexed into Fraud24kRaw.json


In [28]:
from tabulate import tabulate  # Install this package: pip install tabulate
import os
import json
import pandas as pd


SEPARATOR = "**********************************************************"


def show_table(title, table, headers):
    """Print a title line, the table rendered as a grid, then the separator."""
    print(title)
    print(tabulate(table, headers=headers, tablefmt='grid'))
    print(SEPARATOR)


# Load the dataset and wrap it in a DataFrame for the statistics below
with open("./FP-base-origin/FP-base-English.json", "r", encoding="utf-8") as file:
    data = json.load(file)
df = pd.DataFrame(data)

# Overall dataset size, framed by separator lines
total_items = len(df)
print(SEPARATOR)
print(f"Total number of items in the dataset: {total_items}")
print(SEPARATOR)

# Number of rows per 'data_type' (this section also gets a leading separator)
data_type_counts = df['data_type'].value_counts()
print(SEPARATOR)
show_table(
    "Counts for each 'data_type':",
    data_type_counts.reset_index(),
    ['Data Type', 'Count'],
)

# Number of rows per ('category', 'subcategory') pair
subcategory_counts = df.groupby(['category', 'subcategory']).size().reset_index(name='Count')
show_table(
    "Counts for each 'subcategory' under each 'category':",
    subcategory_counts,
    ['Category', 'Subcategory', 'Count'],
)

# Number of rows per 'category'
category_counts = df['category'].value_counts()
show_table(
    "Counts for each 'category':",
    category_counts.reset_index(),
    ['Category', 'Count'],
)

# Number of rows per 'language', reported only when that column exists
if 'language' in df.columns:
    language_counts = df['language'].value_counts()
    show_table(
        "Counts for each 'language':",
        language_counts.reset_index(),
        ['Language', 'Count'],
    )



**********************************************************
Total number of items in the dataset: 1071
**********************************************************
**********************************************************
Counts for each 'data_type':
+----+-------------+---------+
|    | Data Type   |   Count |
|  0 | message     |     685 |
+----+-------------+---------+
|  1 | email       |     236 |
+----+-------------+---------+
|  2 | job posting |     150 |
+----+-------------+---------+
**********************************************************
Counts for each 'subcategory' under each 'category':
+----+--------------------+------------------------------------------------------------------+---------+
|    | Category           | Subcategory                                                      |   Count |
|  0 | fake job posting   | fake job posting                                                 |     150 |
+----+--------------------+-------------------------------------------------

In [8]:
import json
import glob
import uuid

# Add default keys (currently just "language") to every record of the matched
# JSON file(s) and write the result back to `output_path`.

# File pattern to match JSON files
file_pattern = "./FP-levelup-full/FP-levelup-English.json"

# Destination file. It is the same path as the input above, so the source file
# is updated in place.
output_path = "./FP-levelup-full/FP-levelup-English.json"

# Keys to add with their default values
fixed_keys = {
    "language": "English"
}

# List to store the modified data
combined_data = []

# Process each file
for filename in glob.glob(file_pattern):
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    for item in data:
        # Add the fixed keys, leaving any value the record already has untouched
        for key, value in fixed_keys.items():
            item.setdefault(key, value)

        # Append the modified item to the combined data
        combined_data.append(item)

# Save the modified and combined data. Everything was read into memory above,
# so overwriting the input path here is safe.
with open(output_path, "w", encoding="utf-8") as out_file:
    json.dump(combined_data, out_file, ensure_ascii=False, indent=4)

# Report the path that was actually written
print(f"JSON files have been processed and saved to '{output_path}'!")


JSON files have been processed and saved to 'combined_and_modified_data.json'!


In [10]:
import json
import os

def split_json(input_file, output_prefix="output", num_splits=10):
    """Split a JSON file holding a list into ``num_splits`` smaller JSON files.

    Records keep their original order. When the record count is not evenly
    divisible, the first ``total % num_splits`` files each receive one extra
    record. Output files are named ``{output_prefix}_{i}.json`` with ``i``
    running from 1 to ``num_splits``; a line is printed for each file written.

    Args:
        input_file: Path of the JSON file to read; its top-level value must be
            a list.
        output_prefix: Path prefix for the output files. Any directory part is
            created if it does not exist yet.
        num_splits: Number of output files to produce; must be at least 1.

    Raises:
        ValueError: If ``num_splits`` is less than 1, or if the JSON top-level
            value is not a list.
    """
    # Reject invalid split counts up front instead of failing later with a
    # ZeroDivisionError (0) or silently writing nothing (negative values).
    if num_splits < 1:
        raise ValueError(f"num_splits must be at least 1, got {num_splits}")

    # Read the JSON data
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Make sure the data is a list
    if not isinstance(data, list):
        raise ValueError("JSON 数据必须是一个列表")

    # Create the destination directory so the first write cannot fail with
    # FileNotFoundError when the prefix points into a folder that is missing.
    output_dir = os.path.dirname(output_prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    total = len(data)
    chunk_size = total // num_splits  # Base number of records per file
    remainder = total % num_splits  # Leftover records when not evenly divisible

    start = 0
    for i in range(num_splits):
        # The first `remainder` files take one extra record each
        end = start + chunk_size + (1 if i < remainder else 0)
        split_data = data[start:end]
        start = end  # Next chunk begins where this one ended

        output_file = f"{output_prefix}_{i+1}.json"
        with open(output_file, "w", encoding="utf-8") as f_out:
            json.dump(split_data, f_out, ensure_ascii=False, indent=4)

        print(f"Saved {output_file} with {len(split_data)} records.")

# Example: split the FP-levelup-Chinese.json dataset using the default number of splits
split_json(
    input_file="./FP-levelup-full/FP-levelup-Chinese.json",
    output_prefix="./FP-levelup-split/FP-levelup-Chinese",
)


Saved ./FP-levelup-split/FP-levelup-Chinese_1.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_2.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_3.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_4.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_5.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_6.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_7.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_8.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_9.json with 107 records.
Saved ./FP-levelup-split/FP-levelup-Chinese_10.json with 107 records.
