In [2]:
import os
import shutil
import random
from tqdm import tqdm

def split_dataset_by_patient(input_root, output_root, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2, seed=42):
    """
    以「病人ID資料夾」為單位，分割整個預處理後的資料集

    Args:
        input_root: 預處理後的資料集根目錄 (processed_dataset/)
        output_root: 分割後的資料集根目錄 (dataset_split/)
        train_ratio, val_ratio, test_ratio: 比例
        seed: 隨機種子，確保重現結果
    """
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "比例總和必須為 1"
    
    random.seed(seed)
    os.makedirs(output_root, exist_ok=True)
    splits = ['train', 'val', 'test']

    for s in splits:
        os.makedirs(os.path.join(output_root, s), exist_ok=True)

    # 🔹 逐類別處理
    for category in sorted(os.listdir(input_root)):
        category_path = os.path.join(input_root, category)
        if not os.path.isdir(category_path):
            continue

        print(f"\n🔹 分割類別: {category}")

        # 建立對應輸出目錄
        for s in splits:
            os.makedirs(os.path.join(output_root, s, category), exist_ok=True)

        # 收集所有病人資料夾
        patient_dirs = [
            os.path.join(category_path, d)
            for d in os.listdir(category_path)
            if os.path.isdir(os.path.join(category_path, d))
        ]

        random.shuffle(patient_dirs)

        total = len(patient_dirs)
        n_train = round(total * train_ratio)
        n_val = round(total * val_ratio)
        n_test = total - n_train - n_val  # 確保總數正確

        train_patients = patient_dirs[:n_train]
        val_patients = patient_dirs[n_train:n_train + n_val]
        test_patients = patient_dirs[n_train + n_val:]

        # 🔹 定義一個函式幫忙複製整個病人資料夾
        def copy_patient_dirs(patient_list, split_name):
            for patient_dir in tqdm(patient_list, desc=f"{category} → {split_name}"):
                patient_id = os.path.basename(patient_dir)
                dest_dir = os.path.join(output_root, split_name, category, patient_id)
                os.makedirs(dest_dir, exist_ok=True)

                # 複製所有影像
                for file in os.listdir(patient_dir):
                    if file.lower().endswith(('.jpg', '.jpeg', '.png', '.tif')):
                        src = os.path.join(patient_dir, file)
                        dst = os.path.join(dest_dir, file)
                        shutil.copy2(src, dst)

        # 🔹 開始複製
        copy_patient_dirs(train_patients, "train")
        copy_patient_dirs(val_patients, "val")
        copy_patient_dirs(test_patients, "test")

        print(f"✅ 類別 {category} 分割完成：Train={len(train_patients)}, Val={len(val_patients)}, Test={len(test_patients)}")

    print("\n🎉 所有分類皆已完成『以病人ID為單位』的資料分割！")


# ======================
# 🚀 主程式入口
# ======================
if __name__ == "__main__":
    input_root = "../datasets"    # 預處理後的資料夾
    output_root = "dataset_split"       # 分割輸出路徑

    split_dataset_by_patient(
        input_root=input_root,
        output_root=output_root,
        train_ratio=0.7,
        val_ratio=0.1,
        test_ratio=0.2,
        seed=42
    )


🔹 分割類別: Category 0


Category 0 → train: 100%|██████████| 836/836 [00:10<00:00, 81.49it/s] 
Category 0 → val: 100%|██████████| 120/120 [00:06<00:00, 17.66it/s]
Category 0 → test: 100%|██████████| 239/239 [00:02<00:00, 107.74it/s]


✅ 類別 Category 0 分割完成：Train=836, Val=120, Test=239

🔹 分割類別: Category 1


Category 1 → train: 100%|██████████| 349/349 [00:16<00:00, 20.89it/s]
Category 1 → val: 100%|██████████| 50/50 [00:00<00:00, 119.09it/s]
Category 1 → test: 100%|██████████| 100/100 [00:00<00:00, 107.19it/s]


✅ 類別 Category 1 分割完成：Train=349, Val=50, Test=100

🔹 分割類別: Category 2


Category 2 → train: 100%|██████████| 350/350 [00:05<00:00, 59.44it/s] 
Category 2 → val: 100%|██████████| 50/50 [00:02<00:00, 18.30it/s]
Category 2 → test: 100%|██████████| 100/100 [00:09<00:00, 10.72it/s]


✅ 類別 Category 2 分割完成：Train=350, Val=50, Test=100

🔹 分割類別: Category 3


Category 3 → train: 100%|██████████| 349/349 [00:03<00:00, 106.89it/s]
Category 3 → val: 100%|██████████| 50/50 [00:01<00:00, 33.09it/s]
Category 3 → test: 100%|██████████| 100/100 [00:02<00:00, 35.70it/s]


✅ 類別 Category 3 分割完成：Train=349, Val=50, Test=100

🔹 分割類別: Category 4


Category 4 → train: 100%|██████████| 183/183 [00:11<00:00, 15.45it/s] 
Category 4 → val: 100%|██████████| 26/26 [00:00<00:00, 106.42it/s]
Category 4 → test: 100%|██████████| 52/52 [00:00<00:00, 119.62it/s]


✅ 類別 Category 4 分割完成：Train=183, Val=26, Test=52

🔹 分割類別: Category 5


Category 5 → train: 100%|██████████| 27/27 [00:00<00:00, 100.25it/s]
Category 5 → val: 100%|██████████| 4/4 [00:00<00:00, 97.12it/s]
Category 5 → test: 100%|██████████| 8/8 [00:00<00:00, 100.59it/s]


✅ 類別 Category 5 分割完成：Train=27, Val=4, Test=8

🔹 分割類別: Category 6


Category 6 → train: 100%|██████████| 49/49 [00:00<00:00, 112.29it/s]
Category 6 → val: 100%|██████████| 7/7 [00:00<00:00, 123.02it/s]
Category 6 → test: 100%|██████████| 14/14 [00:00<00:00, 116.76it/s]

✅ 類別 Category 6 分割完成：Train=49, Val=7, Test=14

🎉 所有分類皆已完成『以病人ID為單位』的資料分割！





In [6]:
# print img size
# pip install opencv-python
import cv2

img_path = "/home/stoneyew/Desktop/PACS/datasets_ori/Category 1/XA2015110006335/XA2015110006335-2.jpg"
img = cv2.imread(img_path)
print(img.shape)  # (height, width, channels)

(2294, 1914, 3)


In [10]:
import os
import pandas as pd
from collections import defaultdict

def load_patient_descriptions():
    """
    Load patient descriptions from CATEGORY CSV files.
    Returns a dictionary mapping patient_id to description.
    """
    descriptions = {}
    base_datasets_path = "/home/stoneyew/Desktop/PACS/datasets/"
    
    # Try to load descriptions from each category CSV file
    for category_num in range(7):  # Categories 0-6
        csv_path = os.path.join(base_datasets_path, f"Category {category_num}.csv")
        if os.path.exists(csv_path):
            try:
                df = pd.read_csv(csv_path)
                # Assuming the CSV has columns like 'patient_id' and 'description'
                # Adjust column names based on actual CSV structure
                for _, row in df.iterrows():
                    patient_id = str(row.iloc[0])  # Assuming first column is patient ID
                    description = str(row.iloc[-1]) if len(row) > 1 else "No description"  # Last column as description
                    descriptions[patient_id] = description
                print(f"Loaded {len(df)} descriptions from {csv_path}")
            except Exception as e:
                print(f"Warning: Could not load descriptions from {csv_path}: {e}")
        else:
            print(f"Description file not found: {csv_path}")
    
    return descriptions

def generate_multiview_csv_from_folders(base_dir, output_csv):
    """
    直接從資料夾結構掃描影像，生成多視角模型所需的寬格式 CSV 檔案。
    and go to "/home/stoneyew/Desktop/PACS/datasets/CATEGORY #.csv" get description of each patient

    Args:
        base_dir (str): 要掃描的基礎目錄 (例如: train/ 或 val/)。
        output_csv (str): 輸出的寬格式 CSV 檔案路徑。
    """
    # Load patient descriptions
    patient_descriptions = load_patient_descriptions()
    
    # 使用 defaultdict 來讓我們可以方便地為新病人建立字典
    patients_data = defaultdict(dict)

    print(f"正在掃描目錄: {base_dir}")
    # 遍歷第一層：Category 資料夾
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):
            continue

        # 從 'Category X' 中提取標籤數字
        try:
            label = int(category.replace("Category ", ""))
            print(f"處理 Category {label} 資料夾")
            if label == 6:
                print(f"跳過 Category {label} 資料夾")
                continue # 跳過 Category 0 和 6 和 2
            # else:
            #     label -=1  # 調整標籤，將 Category 3,4,5 分別變成 2,3,4
            # elif label == 0 or label == 4 or label == 5:
            #     label = 1
            # else:
            #     label = 0
        except ValueError:
            print(f"警告：跳過無法識別的資料夾名稱 '{category}'")
            continue

        # 遍歷第二層：Patient 資料夾
        for patient_folder in os.listdir(category_path):
            patient_path = os.path.join(category_path, patient_folder)
            if not os.path.isdir(patient_path):
                continue
            
            # 使用相對路徑作為病人的唯一 ID
            patient_id = os.path.join(category, patient_folder)
            
            # 設定該病人的標籤
            patients_data[patient_id]['label'] = label
            
            # Add patient description
            # Extract just the patient folder name for description lookup
            patient_name = patient_folder
            description = patient_descriptions.get(patient_name, "No description available")
            patients_data[patient_id]['description'] = description

            # 遍歷第三層：影像檔案
            for img_file in os.listdir(patient_path):
                if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    # 從檔名中提取視角 (例如: 'L-MLO.jpg' -> 'L-MLO')
                    view = os.path.splitext(img_file)[0]
                    # if view == 'L-CC' :
                    #     view = 'inputs_cc'
                    # elif view == 'R-CC':
                    #     continue
                    # elif view == 'L-MLO':
                    #     view = 'inputs_mlo'
                    # elif view == 'R-MLO':
                    #     continue
                    # 儲存該視角的相對路徑
                    relative_path = os.path.join(patient_id, img_file)
                    patients_data[patient_id][view] = relative_path

    if not patients_data:
        print(f"警告：在 {base_dir} 中沒有找到任何影像資料。")
        return

    # 將字典轉換為 DataFrame
    df = pd.DataFrame.from_dict(patients_data, orient='index')

    # 重新排列欄位，確保順序正確，並包含 description
    required_columns = ['L-CC', 'R-CC', 'L-MLO', 'R-MLO', 'label', 'description']
    # required_columns =['inputs_cc', 'inputs_mlo', 'label', 'description']
    df = df.reindex(columns=required_columns)

    # 移除缺少任何一個視角的病人資料（可選，但建議）
    # Note: We don't drop rows with missing descriptions, only missing image views
    original_count = len(df)
    # Only check for missing image columns, not description
    image_columns = ['L-CC', 'R-CC', 'L-MLO', 'R-MLO']
    df_images_complete = df.dropna(subset=image_columns)
    dropped_count = original_count - len(df_images_complete)
    if dropped_count > 0:
        print(f"移除了 {dropped_count} 位缺少完整四視角影像的病人。")
    
    # Use the filtered dataframe
    df = df_images_complete

    # 儲存為 CSV
    df.to_csv(output_csv, index=False)
    print(f"多視角 CSV 檔案已儲存: {output_csv} (共 {len(df)} 位病人)")
    print("部分資料預覽：")
    print(df.head())


if __name__ == '__main__':
    # --- 請根據您的資料夾結構修改基礎路徑 ---
    base_dir = "dataset_split"

    # 為 train 資料夾產生多視角 CSV
    train_dir = os.path.join(base_dir, "train")
    output_train_csv = os.path.join(base_dir, "train_labels.csv")
    print("\n--- 正在處理 Train 資料夾 ---")
    generate_multiview_csv_from_folders(train_dir, output_train_csv)

    # 為 val 資料夾產生多視角 CSV
    val_dir = os.path.join(base_dir, "val")
    output_val_csv = os.path.join(base_dir, "val_labels.csv")
    print("\n--- 正在處理 Val 資料夾 ---")
    generate_multiview_csv_from_folders(val_dir, output_val_csv)

    # (可選) 為 test 資料夾產生多視角 CSV
    test_dir = os.path.join(base_dir, "test")
    if os.path.exists(test_dir):
        output_test_csv = os.path.join(base_dir, "test_labels.csv")
        print("\n--- 正在處理 Test 資料夾 ---")
        generate_multiview_csv_from_folders(test_dir, output_test_csv)


--- 正在處理 Train 資料夾 ---
Loaded 1197 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 0.csv
Loaded 499 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 1.csv
Loaded 499 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 2.csv
Loaded 499 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 3.csv
Loaded 262 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 4.csv
Loaded 38 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 5.csv
Description file not found: /home/stoneyew/Desktop/PACS/datasets/Category 6.csv
正在掃描目錄: dataset_split/train
處理 Category 0 資料夾
處理 Category 3 資料夾
處理 Category 1 資料夾
處理 Category 6 資料夾
跳過 Category 6 資料夾
處理 Category 4 資料夾
處理 Category 5 資料夾
處理 Category 2 資料夾
多視角 CSV 檔案已儲存: dataset_split/train_labels.csv (共 2094 位病人)
部分資料預覽：
                                                           L-CC  \
Category 0/XA2017040012532  Category 0/XA2017040012532/L-CC.jpg   
Category 0/XA2017020000760  Category 