In [None]:
# import os
# import json
# import random
# import glob

# def create_dataset_json_multisite(root_dir, output_folder):
#     # Get a list of all image files in the root directory
#     subject_list = glob.glob(os.path.join(root_dir, '*.png'))  # Update the file extension if necessary
    
#     # Group hospitals
#     hospital_groups = {
#         '18': 'hospital18.json',
#         '43': 'hospital43.json',
#         '55': 'hospital55.json'
#     }
    
#     for hospital_ids, output_filename in hospital_groups.items():
#         # Filter images for the current hospital group
#         filtered_images = [file for file in subject_list if any(f"_hospital_{hospital_id}_" in file for hospital_id in hospital_ids.split(','))]
        
#         # Shuffle the images and split them into train and val sets (70-30 split)
#         random.shuffle(filtered_images)
#         num_validation = int(len(filtered_images) * 0.3)
#         train_data = filtered_images[:-num_validation]
#         val_data = filtered_images[-num_validation:]
        
#         # Create the dataset dictionary
#         dataset = {"train": [], "val": []}
#         for file in train_data:
#             dataset["train"].append({
#                 "image_path": file,
#                 "hospital_id": hospital_ids,
#                 "label": os.path.basename(file).split('_')[4].split('.')[0]  # Extract label information, adjust the index accordingly
#             })
#         for file in val_data:
#             dataset["val"].append({
#                 "image_path": file,
#                 "hospital_id": hospital_ids,
#                 "label": os.path.basename(file).split('_')[4].split('.')[0]  # Extract label information, adjust the index accordingly
#             })

#         # Write the dataset to JSON file
#         with open(os.path.join(output_folder, output_filename), 'w') as json_file:
#             json.dump(dataset, json_file, indent=4)
            
#         print(f"JSON file '{output_filename}' created for hospital IDs: {hospital_ids}")

# # Example usage
# root_directory = "/local/data1/honzh073/data/8bit_down224"
# output_folder = "/local/data1/honzh073/local_repository/FL/code/5_create_json_files"

# create_dataset_json_multisite(root_directory, output_folder)


# Create train/val JSON files (split by patient)

In [38]:
import os
import json
import random
import pandas as pd
from sklearn.model_selection import train_test_split

def create_dataset_json(csv_path, output_folder, output_filename=None,
                        test_size=0.3, random_state=3):
    """Split a CSV of images into train/val sets by patient and save them as JSON.

    The split is performed on the unique ``PatientID`` values rather than on
    individual rows, so every image of a given patient lands in exactly one
    of the two sets.

    Args:
        csv_path: Path to a CSV file with at least the columns ``HospitalID``,
            ``PatientID``, ``ImageID``, ``ImagePath`` and ``Label``.
        output_folder: Directory the JSON file is written to. It is created
            if it does not exist yet.
        output_filename: Name of the JSON file. When ``None`` (default) it is
            derived as ``<csv file stem>_<random_state>.json`` (for example
            ``hospital22.csv`` with ``random_state=3`` gives
            ``hospital22_3.json``), so different input CSVs no longer
            overwrite the same output file.
        test_size: Share of patients assigned to the validation set; passed
            through to ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` to make the split
            reproducible.

    Returns:
        None. The function writes the JSON file and prints a confirmation.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    record_columns = ["HospitalID", "PatientID", "ImageID", "ImagePath", "Label"]

    df = pd.read_csv(csv_path)

    # Split the patients (not the rows) so that no patient contributes
    # images to both the train and the val set.
    patient_ids = df['PatientID'].unique()
    train_patient_ids, val_patient_ids = train_test_split(
        patient_ids, test_size=test_size, random_state=random_state
    )

    def records_for(selected_patient_ids):
        # Keep the CSV row order and emit one dict per image row.
        belongs_to_split = df['PatientID'].isin(selected_patient_ids)
        return df.loc[belongs_to_split, record_columns].to_dict(orient="records")

    dataset = {
        "train": records_for(train_patient_ids),
        "val": records_for(val_patient_ids),
    }

    if output_filename is None:
        csv_stem = os.path.splitext(os.path.basename(csv_path))[0]
        output_filename = f"{csv_stem}_{random_state}.json"

    # An empty output_folder means "current directory"; os.makedirs("") would fail.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    output_path = os.path.join(output_folder, output_filename)
    with open(output_path, 'w') as json_file:
        json.dump(dataset, json_file, indent=4)

    print(f"JSON file '{output_filename}' created for training and testing sets.")

# Example usage: build the train/val JSON from the hospital 22 CSV.
# NOTE(review): both paths are absolute and machine-specific — adjust them
# before running this cell in another environment.
csv_path = "/local/data1/honzh/local_repository/FL/code/0_stats_pyradiomics/single_hospital_csv/hospital22.csv"
output_folder = "/local/data1/honzh/local_repository/FL/code/1_create_data/data2"

# Writes the JSON file into output_folder and prints a confirmation line.
create_dataset_json(csv_path, output_folder)


JSON file 'hospital22_3.json' created for training and testing sets.


# Statistics of the train/val splits

In [39]:
import json
import pandas as pd

def print_statistics(json_file_path, dataset_type):
    """Print patient and label statistics for one split of a dataset JSON file.

    Args:
        json_file_path: Path to a JSON file whose top-level keys are split
            names (e.g. ``"train"``, ``"val"``) mapping to lists of records
            with ``PatientID`` and ``Label`` fields.
        dataset_type: Key of the split to summarise.

    Prints the number of distinct patients and the count and share of images
    labelled ``AFF`` and ``NFF``. An empty split is reported with zero counts
    and 0.00% ratios instead of raising.

    Raises:
        KeyError: If ``dataset_type`` is not a key of the JSON file.
    """
    # Load the JSON file.
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    # Turn the records of the requested split into a DataFrame.
    df = pd.DataFrame(data[dataset_type])
    total_images = len(df)

    if total_images == 0:
        # An empty record list yields a DataFrame without columns, so column
        # access would raise KeyError and the ratios would divide by zero.
        num_patients = 0
        num_aff_images = 0
        num_nff_images = 0
        ratio_aff = 0.0
        ratio_nff = 0.0
    else:
        # Number of distinct patients in the split.
        num_patients = df['PatientID'].nunique()

        # Number of images per label.
        num_aff_images = int((df['Label'] == 'AFF').sum())
        num_nff_images = int((df['Label'] == 'NFF').sum())

        # Share of each label among all images of the split.
        ratio_aff = num_aff_images / total_images
        ratio_nff = num_nff_images / total_images

    # Print the counts together with their ratios.
    print(f"{dataset_type.capitalize()}集合:")
    print(f"患者ID数量: {num_patients}")
    print(f"AFF标签的图片数量: {num_aff_images} ({ratio_aff:.2%})")
    print(f"NFF标签的图片数量: {num_nff_images} ({ratio_nff:.2%})")
    print()

# Example usage: report the statistics of both splits stored in the JSON file.
# NOTE(review): the path is absolute and machine-specific — adjust it before
# running this cell in another environment.
json_file_path = "/local/data1/honzh/local_repository/FL/code/1_create_data/data2/hospital22_3.json"
print_statistics(json_file_path, 'train')
print_statistics(json_file_path, 'val')


Train集合:
患者ID数量: 17
AFF标签的图片数量: 30 (44.12%)
NFF标签的图片数量: 38 (55.88%)

Val集合:
患者ID数量: 8
AFF标签的图片数量: 14 (48.28%)
NFF标签的图片数量: 15 (51.72%)



# Stratified sampling

In [32]:
# import os
# import json
# import random
# import pandas as pd
# from sklearn.model_selection import train_test_split

# def create_dataset_json(csv_path, output_folder):
#     # Read CSV file
#     df = pd.read_csv(csv_path)

#     # Split data into train and test sets with stratified sampling
#     random_state = 2
#     train_data, val_data = train_test_split(df, test_size=0.3, random_state=random_state, stratify=df['Label'])

#     # Create the dataset dictionary
#     dataset = {"train": [], "val": []}
#     for _, row in train_data.iterrows():
#         dataset["train"].append({
#             "HospitalID": row['HospitalID'],
#             "PatientID": row['PatientID'],
#             "ImageID": row['ImageID'],
#             "ImagePath": row['ImagePath'],
#             "Label": row['Label']
#         })
#     for _, row in val_data.iterrows():
#         dataset["val"].append({
#             "HospitalID": row['HospitalID'],
#             "PatientID": row['PatientID'],
#             "ImageID": row['ImageID'],
#             "ImagePath": row['ImagePath'],
#             "Label": row['Label']
#         })

#     # Write the dataset to a JSON file
#     output_filename = "hospital18_3.json"
#     output_path = os.path.join(output_folder, output_filename)
#     with open(output_path, 'w') as json_file:
#         json.dump(dataset, json_file, indent=4)

#     print(f"JSON file '{output_filename}' created for training and testing sets.")

# # Example usage
# csv_path = "/local/data1/honzh073/data/hospital18.csv"
# output_folder = "/local/data1/honzh073/data"

# create_dataset_json(csv_path, output_folder)


JSON file 'hospital18_3.json' created for training and testing sets.


In [38]:
# import json
# import pandas as pd

# def print_statistics(json_file_path, dataset_type):
#     # 读取JSON文件
#     with open(json_file_path, 'r') as file:
#         data = json.load(file)

#     # 将JSON数据转换为DataFrame
#     df = pd.DataFrame(data[dataset_type])

#     # 统计患者ID数量
#     num_patients = df['PatientID'].nunique()

#     # 统计AFF和NFF标签的图片数量
#     num_aff_images = df[df['Label'] == 'AFF'].shape[0]
#     num_nff_images = df[df['Label'] == 'NFF'].shape[0]

#     # 计算比例
#     ratio_aff = num_aff_images / len(df)
#     ratio_nff = num_nff_images / len(df)

#     # 打印统计信息和比例
#     print(f"{dataset_type.capitalize()}集合:")
#     print(f"患者ID数量: {num_patients}")
#     print(f"AFF标签的图片数量: {num_aff_images} ({ratio_aff:.2%})")
#     print(f"NFF标签的图片数量: {num_nff_images} ({ratio_nff:.2%})")
#     print()

# # Example usage
# json_file_path = "/local/data1/honzh073/data/hospital18_2.json"
# print_statistics(json_file_path, 'train')
# print_statistics(json_file_path, 'val')


Train集合:
患者ID数量: 32
AFF标签的图片数量: 30 (34.09%)
NFF标签的图片数量: 58 (65.91%)

Val集合:
患者ID数量: 25
AFF标签的图片数量: 13 (34.21%)
NFF标签的图片数量: 25 (65.79%)

