In [1]:
import os
import json

# Folder containing the JSON files
folder_path = "./"  # Update to your folder path
output_file = "Fraud24kRaw.json"

# Absolute path of the output file, used to keep a previous run's result out
# of the inputs (with the default paths the output lands in the scanned folder).
output_abs_path = os.path.abspath(output_file)

# Accumulates the records found in every input file
combined_data = []

# Walk the folder in sorted order so the sequential IDs assigned below are
# reproducible; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):  # Process only JSON files
        continue

    file_path = os.path.join(folder_path, filename)
    if os.path.abspath(file_path) == output_abs_path:
        # Skip our own output: re-running the cell would otherwise merge the
        # previously combined file back in and duplicate every record.
        continue

    with open(file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)  # Load the content of the JSON file

    # A file may hold either a list of records or a single object
    if isinstance(data, list):
        combined_data.extend(data)
    else:
        combined_data.append(data)

# Reorder index (assign new unique IDs sequentially, starting from 0)
for idx, item in enumerate(combined_data):
    item['id'] = idx

# Save the combined data to a single JSON file
with open(output_file, "w", encoding="utf-8") as out_file:
    json.dump(combined_data, out_file, indent=4, ensure_ascii=False)

print(f"All JSON files have been combined and reindexed into {output_file}")


All JSON files have been combined and reindexed into Fraud24kRaw.json


In [3]:
from tabulate import tabulate  # Install this package: pip install tabulate
import os
import json
import pandas as pd


# Load the Chinese FP-base records and wrap them in a DataFrame for counting.
with open("FP-base-Chinese.json", "r", encoding="utf-8") as source:
    data = json.load(source)
df = pd.DataFrame(data)

# Horizontal rule printed between the sections of the report.
section_rule = "**********************************************************"


def _print_count_table(title, table, headers):
    """Print ``title``, then ``table`` as a grid with ``headers``, then the section rule."""
    print(title)
    print(tabulate(table, headers=headers, tablefmt='grid'))
    print(section_rule)


# Overall number of records.
total_items = len(df)
print(section_rule)
print(f"Total number of items in the dataset: {total_items}")
print(section_rule)

# Records per 'data_type' (this section is additionally preceded by a rule).
data_type_counts = df['data_type'].value_counts()
print(section_rule)
_print_count_table(
    "Counts for each 'data_type':",
    data_type_counts.reset_index(),
    ['Data Type', 'Count'],
)

# Records per ('category', 'subcategory') pair.
subcategory_counts = df.groupby(['category', 'subcategory']).size().reset_index(name='Count')
_print_count_table(
    "Counts for each 'subcategory' under each 'category':",
    subcategory_counts,
    ['Category', 'Subcategory', 'Count'],
)

# Records per 'category'.
category_counts = df['category'].value_counts()
_print_count_table(
    "Counts for each 'category':",
    category_counts.reset_index(),
    ['Category', 'Count'],
)

# Records per 'language'; reported only when the column is present.
if 'language' in df.columns:
    language_counts = df['language'].value_counts()
    _print_count_table(
        "Counts for each 'language':",
        language_counts.reset_index(),
        ['Language', 'Count'],
    )



ImportError: C extension: None not built. If you want to import pandas from the source directory, you may need to run 'python setup.py build_ext' to build the C extensions first.

In [2]:
import json


# Read one JSON document from disk
def load_json(file_path):
    """Open ``file_path`` as UTF-8 text and return the decoded JSON value."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# Write a JSON-serialisable value to disk
def save_json(data, file_path):
    """Serialise ``data`` to ``file_path`` as UTF-8 JSON.

    The file is opened in write mode (replacing any existing content) and the
    output uses a 4-space indent with non-ASCII characters written verbatim.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

# Merge two JSON files and renumber the 'id' key
def combine_json(file1, file2, output_file):
    """Concatenate the contents of two JSON files and save the result.

    ``file1`` and ``file2`` are loaded with ``load_json`` and joined with ``+``
    (contents of ``file1`` first). Every item that already has an ``'id'`` key
    gets it overwritten with its 1-based position in the merged sequence; items
    without the key are left as they are. The merged data is then written to
    ``output_file`` via ``save_json``.
    """
    merged = load_json(file1) + load_json(file2)

    for position, record in enumerate(merged, start=1):
        if 'id' not in record:
            continue
        record['id'] = position

    save_json(merged, output_file)


# Inputs to merge and the destination file.
# NOTE(review): the destination is the same path as the second input, so each
# re-run of this cell merges the first file into the already-merged result again.
file1_path = 'unique_job_postings_more.json'
file2_path = 'FP-base/FP-base-full.json'
output_file_path = 'FP-base/FP-base-full.json'

# Merge the two inputs and write the result
combine_json(file1=file1_path, file2=file2_path, output_file=output_file_path)

print(f"Combined data saved to {output_file_path}")


Combined data saved to FP-base/FP-base-full.json


In [1]:
import json

# Read the baseline JSON file
with open("baseline/baseline.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Partition on the "language" field: entries whose value is "Chinese" go to one
# list; every other entry (including entries without the field) goes to the other
chinese_data = [entry for entry in data if entry.get("language") == "Chinese"]
english_data = [entry for entry in data if entry.get("language") != "Chinese"]

# Renumber each list independently: entries that already carry an 'id' receive
# their 1-based position within their own list
for bucket in (chinese_data, english_data):
    for position, record in enumerate(bucket, start=1):
        if 'id' in record:
            record['id'] = position

# Save the two partitions
for target_path, bucket in (
    ("FP-base-Chinese.json", chinese_data),
    ("FP-base-English.json", english_data),
):
    with open(target_path, "w", encoding="utf-8") as f:
        json.dump(bucket, f, ensure_ascii=False, indent=4)

print("文件拆分完成！")


文件拆分完成！


In [None]:
import json

# Read the Chinese FP-base records into memory
with open('FP-base/FP-base-Chinese.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)

def update_item(item):
    """Set ``data_type`` to ``"job posting"`` on fake-job-posting records.

    If ``item["category"]`` equals ``"fake job posting"``, the item's
    ``"data_type"`` key is set to ``"job posting"``; any other item is left
    unchanged. The dict is modified in place and the same object is returned.
    """
    if item.get("category") != "fake job posting":
        return item
    item["data_type"] = "job posting"
    return item

# Update each item in the data list
for item in data:
    update_item(item)

# Save the updated data back to the same JSON file it was loaded from
with open('FP-base/FP-base-Chinese.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

# The message names the value actually written ("job posting"); it previously
# said 'job post', which is not what update_item stores.
print("Items with category 'fake job posting' have been updated to have data type 'job posting'.")


Items with category 'fake job posting' have been updated to have data type 'job post'.


In [6]:
import json

# Read a single JSON file using explicit UTF-8 decoding
def load_json(file_path):
    """Return the decoded JSON content of the file at ``file_path``."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.loads(handle.read())

# Combine the JSON data
def combine_json(files):
    """Load every path in ``files`` and merge the contents into one flat list.

    Each file is read with ``load_json``. A file whose top-level value is a
    list contributes its elements; any other top-level value (for example a
    single object) is added as one record.

    NOTE: an earlier cell of this notebook defines a function with the same
    name but a three-argument signature; whichever cell ran last wins.

    Args:
        files: iterable of JSON file paths, merged in iteration order.

    Returns:
        A new list holding the records of all files.
    """
    combined_data = []
    for file in files:
        data = load_json(file)
        if isinstance(data, list):
            combined_data.extend(data)
        else:
            # extend() on a dict would silently add its keys (and on a string
            # its characters) instead of the record itself.
            combined_data.append(data)
    return combined_data

# Write the merged JSON data to a new UTF-8 encoded file
def save_combined_json(data, output_file):
    """Dump ``data`` to ``output_file`` as JSON.

    The file is opened for writing as UTF-8; output is indented by 4 spaces and
    non-ASCII characters are written as-is rather than escaped.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)

# Input parts, merged in the order listed
json_files = [
    'FP-base/part1_modified.json',
    'FP-base/part2_modified.json',
    'FP-base/part3.json',
]

combined_data = combine_json(json_files)

# Renumber records that already carry an 'id' with their 1-based position in
# the merged list; records without the key are left without one
for position, record in enumerate(combined_data, start=1):
    if 'id' not in record:
        continue
    record['id'] = position

# Write the merged list to a new file
save_combined_json(combined_data, output_file='FP-base-Chinese.json')

print("Files combined successfully!")




Files combined successfully!


In [1]:
import json

def remove_network_relationships(input_file, output_file=None, category="network friendship"):
    """Remove every record whose ``"category"`` equals ``category`` from a JSON file.

    Supported top-level structures:

    * list: dict elements with a matching ``"category"`` are dropped;
    * dict: entries whose value is a dict with a matching ``"category"`` are dropped.

    Elements or values that are not dicts are always kept. Any other top-level
    structure prints a message and nothing is written.

    Args:
        input_file: path of the JSON file to read.
        output_file: path to write the filtered data to; when ``None`` the
            input file is overwritten.
        category: category value to filter out (defaults to
            ``"network friendship"``, the value this function originally
            hard-coded).

    Returns:
        None.
    """
    # Read the JSON file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def _is_excluded(entry):
        # Only dicts can carry a category; the isinstance guard keeps a stray
        # string/number/null in the data from raising AttributeError.
        return isinstance(entry, dict) and entry.get("category") == category

    # Filter out the entries that belong to the excluded category
    if isinstance(data, list):
        filtered_data = [item for item in data if not _is_excluded(item)]
    elif isinstance(data, dict):
        filtered_data = {k: v for k, v in data.items() if not _is_excluded(v)}
    else:
        print("Unsupported JSON structure.")
        return

    # Without an explicit output_file, overwrite the original file
    if output_file is None:
        output_file = input_file

    # Write the filtered data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, indent=4, ensure_ascii=False)

    print(f"Filtered JSON saved to: {output_file}")

# Example usage: no output path is passed, so the input file is filtered in place
input_json_file = "./FP-base/FP-base-Chinese.json"  # path of the JSON file to filter
remove_network_relationships(input_file=input_json_file)


Filtered JSON saved to: ./FP-base/FP-base-Chinese.json


In [4]:
import json
def split_file(input_file, num_parts=10):
    """Split a list-based JSON file into ``num_parts`` consecutive part files.

    The records are divided into contiguous slices whose sizes differ by at
    most one, so every record ends up in exactly one part (plain floor division
    would drop the last ``len(data) % num_parts`` records). When there are
    fewer records than parts, the trailing parts are written as empty lists.

    Part files are written next to the input as ``<stem>_part<k>.json`` with
    ``k`` running from 1 to ``num_parts``, where ``<stem>`` is ``input_file``
    without its extension.

    Args:
        input_file: path of the JSON file to split; its top-level value must
            be a list, otherwise a message is printed and nothing is written.
        num_parts: number of part files to produce (at least 1).

    Raises:
        ValueError: if ``num_parts`` is smaller than 1.
    """
    import os  # local import: this cell only imports json

    if num_parts < 1:
        raise ValueError("num_parts must be at least 1")

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, list):
        print("Only list-based JSON structures are supported for splitting.")
        return

    # The first `remainder` parts take one extra record each
    base_size, remainder = divmod(len(data), num_parts)
    # splitext only strips an extension from the final path component, so a
    # dot in a directory name (e.g. "./data") is not mistaken for one
    stem = os.path.splitext(input_file)[0]

    start = 0
    for i in range(num_parts):
        stop = start + base_size + (1 if i < remainder else 0)
        part_data = data[start:stop]
        start = stop

        part_filename = f"{stem}_part{i + 1}.json"
        with open(part_filename, 'w', encoding='utf-8') as f:
            json.dump(part_data, f, indent=4, ensure_ascii=False)
        print(f"Saved: {part_filename}")

# Example usage: split the Chinese FP-base file using the default number of parts
input_json_file = "./FP-base/FP-base-Chinese.json"  # path of the JSON file to split
split_file(input_file=input_json_file)


Saved: ./FP-base/FP-base-Chinese_part1.json
Saved: ./FP-base/FP-base-Chinese_part2.json
Saved: ./FP-base/FP-base-Chinese_part3.json
Saved: ./FP-base/FP-base-Chinese_part4.json
Saved: ./FP-base/FP-base-Chinese_part5.json
Saved: ./FP-base/FP-base-Chinese_part6.json
Saved: ./FP-base/FP-base-Chinese_part7.json
Saved: ./FP-base/FP-base-Chinese_part8.json
Saved: ./FP-base/FP-base-Chinese_part9.json
Saved: ./FP-base/FP-base-Chinese_part10.json
