In [2]:
import os
import csv

def create_csv(image_folder, csv_file_path):
    """Index the image files of ``image_folder`` into a CSV file.

    Each file name is split on ``'_'`` and the metadata is taken by position:
    field 1 -> PatientID, field 3 -> HospitalID, field 4 -> Label and
    field 6 -> ImageID. Entries are processed in sorted name order.

    Directory entries that are not regular files, or whose name has fewer
    than seven ``'_'``-separated fields, are skipped and reported instead of
    aborting the run with an ``IndexError`` (this also covers the output CSV
    itself when it is written inside ``image_folder``).

    Args:
        image_folder: Directory that contains the image files.
        csv_file_path: Destination of the CSV file; overwritten if it exists.
    """
    fieldnames = ['HospitalID', 'PatientID', 'ImageID', 'ImagePath', 'Label']
    # Highest positional field read below is index 6.
    required_parts = 7
    skipped = []

    with open(csv_file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for filename in sorted(os.listdir(image_folder)):
            image_path = os.path.join(image_folder, filename)
            parts = filename.split('_')

            # Ignore sub-directories and names that do not follow the scheme.
            if not os.path.isfile(image_path) or len(parts) < required_parts:
                skipped.append(filename)
                continue

            # NOTE(review): when field 6 is the last field of the name it still
            # carries the file extension -- confirm whether that is intended.
            writer.writerow({
                'HospitalID': parts[3],
                'PatientID': parts[1],
                'ImageID': parts[6],
                'ImagePath': image_path,
                'Label': parts[4]
            })

    if skipped:
        print(f"Skipped {len(skipped)} entries that do not match the naming scheme "
              f"(showing up to 10): {skipped[:10]}")
    print("CSV file has been created and saved to:", csv_file_path)


# Source directory with the images and destination of the index CSV.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
image_folder = "/local/data1/honzh073/data/8bit_down224"
csv_file_path = "/local/data1/honzh073/local_repository/FL/code/5_create_dataset/all_image.csv"

# Build the index CSV covering every image in the folder.
create_csv(image_folder, csv_file_path)


CSV file has been created and saved to: /local/data1/honzh073/local_repository/FL/code/5_create_dataset/all_image.csv


# Extract the images of a single hospital

In [2]:
import csv
import os
from collections import defaultdict

def filter_hospital_data(input_csv_path, target_hospital_ids):
    """Return the CSV rows that belong to the requested hospitals.

    Rows are grouped by ``PatientID`` (patients in order of first appearance,
    rows of one patient in file order) and returned as one flat list, so all
    images of a patient are contiguous in the result.

    Args:
        input_csv_path: CSV file with at least the columns ``HospitalID`` and
            ``PatientID``.
        target_hospital_ids: A single hospital ID or an iterable of IDs.
            IDs may be given as ``str`` or ``int``; they are compared with the
            CSV values as strings.

    Returns:
        list[dict]: The matching rows as produced by ``csv.DictReader``.
    """
    # A bare string would otherwise turn the membership test below into a
    # substring check (hospital '5' would match target '55').
    if isinstance(target_hospital_ids, (str, int)):
        target_hospital_ids = [target_hospital_ids]
    # DictReader yields strings, so normalise the IDs to strings as well.
    wanted_ids = {str(hospital_id) for hospital_id in target_hospital_ids}

    rows_by_patient = defaultdict(list)

    # newline='' is what the csv module expects for files it parses.
    with open(input_csv_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            if row['HospitalID'] in wanted_ids:
                rows_by_patient[row['PatientID']].append(row)

    return [row for patient_rows in rows_by_patient.values() for row in patient_rows]

# Input: the index CSV of all images; output: folder for the per-hospital CSV.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
input_csv_path = "/local/data1/honzh073/local_repository/FL/code/5_create_dataset/all_image.csv"

output_folder = "/local/data1/honzh073/local_repository/FL/code/5_create_dataset"

# IDs of the hospital(s) to keep; compared against the 'HospitalID' column.
target_hospital_ids = ['55']

# Collect the rows of the selected hospital(s), grouped by patient.
hospital_data = filter_hospital_data(input_csv_path, target_hospital_ids)

# Write hospital 55 data to CSV file
def write_to_csv(file_path, data, fieldnames=None):
    """Write a list of row dictionaries to ``file_path`` as CSV.

    Args:
        file_path: Destination file; overwritten if it exists.
        data: Sequence of dictionaries, one per row.
        fieldnames: Optional column names for the header. When omitted, the
            keys of the first row are used.

    Raises:
        IndexError: If ``data`` is empty and ``fieldnames`` is not given, so
            the header cannot be determined. The check happens before the
            file is opened, so an existing file is not truncated.
    """
    if fieldnames is None:
        if not data:
            # Same exception type the bare data[0] lookup would raise, but
            # raised before the destination is opened in 'w' mode.
            raise IndexError(
                f"cannot infer the CSV header for {file_path!r}: data is empty "
                "and no fieldnames were given"
            )
        fieldnames = list(data[0].keys())

    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

# Save the filtered rows to 'hospital55.csv' inside output_folder.
# NOTE(review): the file name is hard-coded; keep it in sync with
# target_hospital_ids when selecting a different hospital.
write_to_csv(os.path.join(output_folder, 'hospital55.csv'), hospital_data)

print("Saved single hospital csv.")


Saved single hospital csv.


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the per-hospital CSV file.
# NOTE(review): this reads 'hospital18.csv', while the cell above writes
# 'hospital55.csv' -- confirm that the intended file exists.
csv_path = "/local/data1/honzh073/local_repository/FL/code/5_create_dataset/hospital18.csv"
df = pd.read_csv(csv_path)

# Collect the distinct patient IDs.
patient_ids = df['PatientID'].unique()

# Randomly split the patients (not the images) into training and validation
# sets, 70 % / 30 %, with a fixed seed.
train_patient_ids, val_patient_ids = train_test_split(patient_ids, test_size=0.3, random_state=42)

# Select the rows of each subset by patient ID.
train_data = df[df['PatientID'].isin(train_patient_ids)]
val_data = df[df['PatientID'].isin(val_patient_ids)]

# Save the two subsets as CSV files.
train_csv_path = "/local/data1/honzh073/local_repository/FL/code/5_create_dataset/train.csv"
val_csv_path = "/local/data1/honzh073/local_repository/FL/code/5_create_dataset/val.csv"

train_data.to_csv(train_csv_path, index=False)
val_data.to_csv(val_csv_path, index=False)

print(f"已保存训练集CSV文件: {train_csv_path}")
print(f"已保存验证集CSV文件: {val_csv_path}")


已保存训练集CSV文件: /local/data1/honzh073/local_repository/FL/code/5_create_dataset/train.csv
已保存验证集CSV文件: /local/data1/honzh073/local_repository/FL/code/5_create_dataset/val.csv


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import time

def split_csv_by_patient(csv_path='/local/data1/honzh073/data/data.csv',
                         output_folder='/local/data1/honzh073/data',
                         train_ratio=0.7,
                         random_seed=None):
    """Split a CSV file into training and validation sets by patient.

    The distinct values of the ``PatientID`` column are split at random and
    every row follows its patient. The results are written to ``train.csv``
    and ``val.csv`` inside ``output_folder``.

    Args:
        csv_path: CSV file to split; must contain a ``PatientID`` column.
        output_folder: Directory that receives ``train.csv`` and ``val.csv``.
        train_ratio: Fraction of the patients assigned to the training set.
        random_seed: Seed for the split. When ``None`` (the default) the
            current Unix time is used; the seed is printed so that the split
            can be reproduced by passing it back in.
    """
    # Read the CSV file.
    df = pd.read_csv(csv_path)

    # Collect the distinct patient IDs.
    patient_ids = df['PatientID'].unique()

    # Pick a time-based seed only when the caller did not supply one.
    if random_seed is None:
        random_seed = int(time.time())
    random.seed(random_seed)

    # Split the patients at random. The ratio is passed as train_size rather
    # than test_size=1 - train_ratio: 1 - 0.7 evaluates to 0.30000000000000004,
    # and because the test count is rounded up this can move one extra patient
    # into the validation set (e.g. 4 instead of 3 out of 10).
    train_patient_ids, val_patient_ids = train_test_split(
        patient_ids, train_size=train_ratio, random_state=random_seed
    )

    # Select the rows of each subset by patient ID.
    train_data = df[df['PatientID'].isin(train_patient_ids)]
    val_data = df[df['PatientID'].isin(val_patient_ids)]

    # Save the two subsets as CSV files.
    train_csv_path = f"{output_folder}/train.csv"
    val_csv_path = f"{output_folder}/val.csv"

    train_data.to_csv(train_csv_path, index=False)
    val_data.to_csv(val_csv_path, index=False)

    print(f"Random seed used for the split: {random_seed}")
    print(f"已保存训练集CSV文件: {train_csv_path}")
    print(f"已保存验证集CSV文件: {val_csv_path}")

# Example usage
split_csv_by_patient()


已保存训练集CSV文件: /local/data1/honzh073/data/train.csv
已保存验证集CSV文件: /local/data1/honzh073/data/val.csv
