In [2]:
import os
import json

# Folder containing the JSON files
folder_path = "./"  # Update to your folder path
output_file = "Fraud24kRaw.json"

# Resolve the output location once so it can be recognised in the listing below.
# With the default settings the output is written into the folder being scanned,
# so without this check a second run would merge the previous output back in
# and duplicate every record.
output_abspath = os.path.abspath(output_file)

# Accumulates the records from every input file
combined_data = []

# os.listdir() returns names in arbitrary order; sorting makes the merge order,
# and therefore the sequential IDs assigned below, reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):  # Process only JSON files
        continue

    file_path = os.path.join(folder_path, filename)

    # Skip directories that happen to end in ".json" and the output file itself.
    if not os.path.isfile(file_path) or os.path.abspath(file_path) == output_abspath:
        continue

    with open(file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)  # Load the content of the JSON file

    # A file may hold either a list of records or a single record
    if isinstance(data, list):
        combined_data.extend(data)
    else:
        combined_data.append(data)

# Reassign IDs so they are sequential starting from 0.
# Every record must support item assignment (i.e. be a JSON object);
# anything else raises TypeError here.
for idx, item in enumerate(combined_data):
    item['id'] = idx

# Save the combined data to a single JSON file (ensure_ascii=False keeps
# non-ASCII text readable instead of \u-escaping it)
with open(output_file, "w", encoding="utf-8") as out_file:
    json.dump(combined_data, out_file, indent=4, ensure_ascii=False)

print(f"All JSON files have been combined and reindexed into {output_file}")


All JSON files have been combined and reindexed into Fraud24kRaw.json


In [3]:
import pandas as pd
from tabulate import tabulate  # Install this package: pip install tabulate

# Read the sampled records and wrap them in a DataFrame for counting.
with open("sampled_scam_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)
df = pd.DataFrame(data)

# Horizontal rule printed between report sections.
separator = "**********************************************************"


def _print_count_table(title, table, headers):
    """Print a section title, a grid-formatted table, and a closing separator."""
    print(title)
    print(tabulate(table, headers=headers, tablefmt='grid'))
    print(separator)


# Overall size of the dataset
total_items = df.shape[0]
print(separator)
print(f"Total number of items in the dataset: {total_items}")
print(separator)

# Records per 'data_type'
data_type_counts = df['data_type'].value_counts()
print(separator)
_print_count_table(
    "Counts for each 'data_type':",
    data_type_counts.reset_index(),
    ['Data Type', 'Count'],
)

# Records per ('category', 'subcategory') pair
subcategory_counts = (
    df.groupby(['category', 'subcategory'])
    .size()
    .reset_index(name='Count')
)
_print_count_table(
    "Counts for each 'subcategory' under each 'category':",
    subcategory_counts,
    ['Category', 'Subcategory', 'Count'],
)

# Records per 'category'
category_counts = df['category'].value_counts()
_print_count_table(
    "Counts for each 'category':",
    category_counts.reset_index(),
    ['Category', 'Count'],
)

# Records per 'language' -- only reported when the column is present
if 'language' in df:
    language_counts = df['language'].value_counts()
    _print_count_table(
        "Counts for each 'language':",
        language_counts.reset_index(),
        ['Language', 'Count'],
    )



**********************************************************
Total number of items in the dataset: 239
**********************************************************
**********************************************************
Counts for each 'data_type':
+----+-------------+---------+
|    | Data Type   |   Count |
|  0 | dialogue    |     239 |
+----+-------------+---------+
**********************************************************
Counts for each 'subcategory' under each 'category':
+----+--------------------+------------------------------------------------------------------+---------+
|    | Category           | Subcategory                                                      |   Count |
|  0 | fraudulent service | e-commerce logistics and shopping                                |      50 |
+----+--------------------+------------------------------------------------------------------+---------+
|  1 | fraudulent service | investment and financial management                              |  

In [5]:
import json

# Read the full raw dataset
with open("Fraud24kRaw_full.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Subcategories that are removed from the dataset entirely
excluded_subcategories = {"ssn", "refund", "support", "reward"}

# Subcategory labels that are replaced: old label -> new label
subcategory_renames = {
    "fraud": "fraud email",
    "phishing": "phishing email",
}

# First pass: keep every record whose subcategory is not excluded
filtered_data = []
for record in data:
    if record.get("subcategory") in excluded_subcategories:
        continue
    filtered_data.append(record)

# Second pass: relabel the kept records in place; records whose subcategory
# has no entry in the rename table are left untouched
for item in filtered_data:
    new_label = subcategory_renames.get(item.get("subcategory"))
    if new_label is not None:
        item["subcategory"] = new_label

# Write the result to a new file; ensure_ascii=False keeps Chinese characters
# as-is instead of \u-escaping them
with open("Fraud24kFiltered.json", "w", encoding="utf-8") as file:
    json.dump(filtered_data, file, indent=4, ensure_ascii=False)


In [2]:
import json
import random

# Read a UTF-8 encoded JSON file and return its parsed content
def load_json(file_path):
    """Parse the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        The decoded JSON value (whatever ``json`` produces for the document).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
        return json.loads(text)

# Write a JSON-serialisable value to a UTF-8 encoded file
def save_json(data, file_path):
    """Serialise ``data`` as JSON into ``file_path``.

    The file is opened in write mode, so existing content is replaced. Output
    is indented by four spaces and non-ASCII characters are written verbatim
    rather than escaped.

    Args:
        data: The value to serialise.
        file_path: Destination path.
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        json.dump(
            data,
            handle,
            indent=4,
            ensure_ascii=False,
        )

# Sample items for each subcategory, skipping anything already present in the fraud file
def sample_items(scams_file, fraud_file, output_file, n_per_subcategory=50, seed=None):
    """Draw a random sample of scam records per subcategory and save it as JSON.

    A scam record is eligible only if its ``raw_data`` value does not occur as
    a string ``raw_data`` in the fraud file. Duplicates *within* the scam file
    are not removed.

    Args:
        scams_file: Path of the JSON file holding the scam records. Each record
            must provide the keys ``'subcategory'`` and ``'raw_data'``.
        fraud_file: Path of the JSON file whose ``raw_data`` values are excluded.
            Entries that are not dicts, or whose ``raw_data`` is not a string,
            are ignored.
        output_file: Path the sampled records are written to.
        n_per_subcategory: Maximum number of records drawn per subcategory.
            A subcategory with fewer eligible records contributes all of them.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` (the default) the module-level
            ``random`` state is used.

    Raises:
        KeyError: If a scam record lacks ``'subcategory'`` or ``'raw_data'``.
    """
    scam_data = load_json(scams_file)
    fraud_data = load_json(fraud_file)

    # A private generator keeps a seeded run from disturbing (or being
    # disturbed by) the global random state.
    rng = random if seed is None else random.Random(seed)

    # Collect the raw_data strings of the fraud file for O(1) membership tests
    fraud_raw_data = {
        item['raw_data']
        for item in fraud_data
        if isinstance(item, dict) and isinstance(item.get('raw_data'), str)
    }

    # Group scam data by subcategory (dict order = order of first appearance)
    subcategory_groups = {}
    for item in scam_data:
        subcategory_groups.setdefault(item['subcategory'], []).append(item)

    # Sample up to n_per_subcategory eligible items from each group
    sampled_items = []
    for items in subcategory_groups.values():
        unique_items = [item for item in items if item['raw_data'] not in fraud_raw_data]
        sample_size = min(n_per_subcategory, len(unique_items))
        sampled_items.extend(rng.sample(unique_items, sample_size))

    # Save the sampled data to the output file
    save_json(sampled_items, output_file)

# Input and output locations for the sampling step
scam_file_path = "original_40k/scam_dialogue_chinese.json"
fraud_file_path = "Fraud24kFiltered.json"
output_file_path = "sampled_scam_data.json"

# Draw the per-subcategory sample and write it to disk
sample_items(
    scams_file=scam_file_path,
    fraud_file=fraud_file_path,
    output_file=output_file_path,
)

print(f"Sampled items saved to {output_file_path}")


Sampled items saved to sampled_scam_data.json
