clean incomplete patient
e.g. image number != 4

In [8]:
import os
import shutil

def clean_incomplete_patients(root_dir, expected_num=4):
    removed = []
    kept = 0
    total = 0

    for category in sorted(os.listdir(root_dir)):
        cat_path = os.path.join(root_dir, category)
        if not os.path.isdir(cat_path):
            continue

        for patient in sorted(os.listdir(cat_path)):
            p_path = os.path.join(cat_path, patient)
            if not os.path.isdir(p_path):
                continue

            imgs = [f for f in os.listdir(p_path)
                    if f.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.bmp'))]
            total += 1

            if len(imgs) < expected_num:
                removed.append((category, patient, len(imgs)))
                shutil.rmtree(p_path)
            if len(imgs) > expected_num:
                print(f"⚠️  Warning: Patient {patient} in category {category} has {len(imgs)} images (expected {expected_num}).")
                print("    Please check manually.")
                kept += 1
            else:
                kept += 1

    print("🩻 Patient integrity cleaning complete.")
    print(f"✅ Kept folders   : {kept}")
    print(f"🗑️  Removed folders: {len(removed)} / {total} total")

    if removed:
        print("\nExamples of removed folders:")
        for i, (cat, pid, n) in enumerate(removed[:10]):
            print(f" {i+1}. [{cat}] {pid} → {n} images (removed)")
    else:
        print("🎉 All folders are complete (4 images each).")

# 使用範例
clean_incomplete_patients("/home/stoneyew/Desktop/PACS/check")

    Please check manually.
    Please check manually.
    Please check manually.
    Please check manually.
🩻 Patient integrity cleaning complete.
✅ Kept folders   : 3070
🗑️  Removed folders: 7 / 3070 total

Examples of removed folders:
 1. [Category 0] XA2018030016333 → 2 images (removed)
 2. [Category 0] XA2018050022778 → 2 images (removed)
 3. [Category 0] XA2018100015145 → 2 images (removed)
 4. [Category 1] XA2016110014744 → 3 images (removed)
 5. [Category 3] XA2017090014017 → 2 images (removed)
 6. [Category 4] XA2017080014795 → 2 images (removed)
 7. [Category 4] XA2017080021329 → 2 images (removed)


# rename image by order

In [11]:
import os
from tqdm import tqdm

def rename_mammogram_images_by_order(root_dir, keep_original=False):
    """
    依照每個病人資料夾內的影像順序重新命名：
    第1張 → R-CC
    第2張 → L-CC
    第3張 → R-MLO
    第4張 → L-MLO

    Args:
        root_dir: 主要資料夾路徑，內含各類別子資料夾
        keep_original: 若為 True，保留原始檔案並複製新檔；否則直接改名
    """

    # 命名順序
    rename_order = ["R-CC", "L-CC", "R-MLO", "L-MLO"]
    valid_exts = ('.jpg', '.jpeg', '.png', '.tif')

    # 遍歷類別與病人資料夾
    for category in sorted(os.listdir(root_dir)):
        category_path = os.path.join(root_dir, category)
        if not os.path.isdir(category_path):
            continue

        for patient_id in sorted(os.listdir(category_path)):
            patient_path = os.path.join(category_path, patient_id)
            if not os.path.isdir(patient_path):
                continue

            # 找出該病人資料夾下所有影像
            image_files = [
                f for f in os.listdir(patient_path)
                if f.lower().endswith(valid_exts)
            ]

            if not image_files:
                continue

            # 依檔名排序
            image_files.sort()

            for idx, file in enumerate(tqdm(image_files, desc=f"Renaming {patient_id}", leave=False)):
                ext = os.path.splitext(file)[1].lower()
                if idx < len(rename_order):
                    new_name = rename_order[idx] + ext # 依順序命名
                else:
                    print(f"⚠️ 注意：{patient_id} 有超過四張影像，超過的部分將命名為 extra_X")
                    new_name = f"extra_{idx+1}{ext}"  # 超過四張的情況

                old_path = os.path.join(patient_path, file)
                new_path = os.path.join(patient_path, new_name)

                if os.path.exists(new_path):
                    continue

                if keep_original:
                    import shutil
                    shutil.copy2(old_path, new_path)
                else:
                    os.rename(old_path, new_path)

    print("\n✅ 所有影像已依指定順序重新命名完成！")


# 🚀 使用範例
if __name__ == "__main__":
    root_dir = "/home/stoneyew/Desktop/PACS/check" 
    rename_mammogram_images_by_order(root_dir, keep_original=False)


                                                               


✅ 所有影像已依指定順序重新命名完成！




#spilt datasets

In [12]:
import os
import shutil
import random
from tqdm import tqdm

def split_dataset_by_patient(input_root, output_root, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2, seed=42):
    """
    以「病人ID資料夾」為單位，分割整個預處理後的資料集

    Args:
        input_root: 預處理後的資料集根目錄 (processed_dataset/)
        output_root: 分割後的資料集根目錄 (dataset_split/)
        train_ratio, val_ratio, test_ratio: 比例
        seed: 隨機種子，確保重現結果
    """
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "比例總和必須為 1"
    
    random.seed(seed)
    os.makedirs(output_root, exist_ok=True)
    splits = ['train', 'val', 'test']

    for s in splits:
        os.makedirs(os.path.join(output_root, s), exist_ok=True)

    # 🔹 逐類別處理
    for category in sorted(os.listdir(input_root)):
        category_path = os.path.join(input_root, category)
        if not os.path.isdir(category_path):
            continue

        print(f"\n🔹 分割類別: {category}")

        # 建立對應輸出目錄
        for s in splits:
            os.makedirs(os.path.join(output_root, s, category), exist_ok=True)

        # 收集所有病人資料夾
        patient_dirs = [
            os.path.join(category_path, d)
            for d in os.listdir(category_path)
            if os.path.isdir(os.path.join(category_path, d))
        ]

        random.shuffle(patient_dirs)

        total = len(patient_dirs)
        n_train = round(total * train_ratio)
        n_val = round(total * val_ratio)
        n_test = total - n_train - n_val  # 確保總數正確

        train_patients = patient_dirs[:n_train]
        val_patients = patient_dirs[n_train:n_train + n_val]
        test_patients = patient_dirs[n_train + n_val:]

        # 🔹 定義一個函式幫忙複製整個病人資料夾
        def copy_patient_dirs(patient_list, split_name):
            for patient_dir in tqdm(patient_list, desc=f"{category} → {split_name}"):
                patient_id = os.path.basename(patient_dir)
                dest_dir = os.path.join(output_root, split_name, category, patient_id)
                os.makedirs(dest_dir, exist_ok=True)

                # 複製所有影像
                for file in os.listdir(patient_dir):
                    if file.lower().endswith(('.jpg', '.jpeg', '.png', '.tif')):
                        src = os.path.join(patient_dir, file)
                        dst = os.path.join(dest_dir, file)
                        shutil.copy2(src, dst)

        # 🔹 開始複製
        copy_patient_dirs(train_patients, "train")
        copy_patient_dirs(val_patients, "val")
        copy_patient_dirs(test_patients, "test")

        print(f"✅ 類別 {category} 分割完成：Train={len(train_patients)}, Val={len(val_patients)}, Test={len(test_patients)}")

    print("\n🎉 所有分類皆已完成『以病人ID為單位』的資料分割！")

if __name__ == "__main__":
    input_root = "/home/stoneyew/Desktop/PACS/check"    # 預處理後的資料夾
    output_root = "dataset_split"       # 分割輸出路徑

    split_dataset_by_patient(
        input_root=input_root,
        output_root=output_root,
        train_ratio=0.7,
        val_ratio=0.1,
        test_ratio=0.2,
        seed=42
    )


🔹 分割類別: Category 0


Category 0 → train: 100%|██████████| 836/836 [00:16<00:00, 51.34it/s]
Category 0 → val: 100%|██████████| 120/120 [00:04<00:00, 29.34it/s]
Category 0 → test: 100%|██████████| 239/239 [00:10<00:00, 23.10it/s]


✅ 類別 Category 0 分割完成：Train=836, Val=120, Test=239

🔹 分割類別: Category 1


Category 1 → train: 100%|██████████| 349/349 [00:15<00:00, 23.11it/s]
Category 1 → val: 100%|██████████| 50/50 [00:00<00:00, 54.63it/s]
Category 1 → test: 100%|██████████| 100/100 [00:01<00:00, 58.53it/s]


✅ 類別 Category 1 分割完成：Train=349, Val=50, Test=100

🔹 分割類別: Category 2


Category 2 → train: 100%|██████████| 350/350 [00:15<00:00, 22.96it/s]
Category 2 → val: 100%|██████████| 50/50 [00:00<00:00, 56.17it/s]
Category 2 → test: 100%|██████████| 100/100 [00:03<00:00, 31.89it/s]


✅ 類別 Category 2 分割完成：Train=350, Val=50, Test=100

🔹 分割類別: Category 3


Category 3 → train: 100%|██████████| 349/349 [00:13<00:00, 25.91it/s]
Category 3 → val: 100%|██████████| 50/50 [00:01<00:00, 25.90it/s]
Category 3 → test: 100%|██████████| 100/100 [00:05<00:00, 17.74it/s]


✅ 類別 Category 3 分割完成：Train=349, Val=50, Test=100

🔹 分割類別: Category 4


Category 4 → train: 100%|██████████| 183/183 [00:06<00:00, 28.63it/s]
Category 4 → val: 100%|██████████| 26/26 [00:00<00:00, 58.63it/s]
Category 4 → test: 100%|██████████| 52/52 [00:00<00:00, 67.30it/s]


✅ 類別 Category 4 分割完成：Train=183, Val=26, Test=52

🔹 分割類別: Category 5


Category 5 → train: 100%|██████████| 27/27 [00:00<00:00, 34.46it/s]
Category 5 → val: 100%|██████████| 4/4 [00:00<00:00, 26.56it/s]
Category 5 → test: 100%|██████████| 8/8 [00:00<00:00, 16.11it/s]


✅ 類別 Category 5 分割完成：Train=27, Val=4, Test=8

🔹 分割類別: Category 6


Category 6 → train: 100%|██████████| 49/49 [00:01<00:00, 28.78it/s]
Category 6 → val: 100%|██████████| 7/7 [00:00<00:00, 34.18it/s]
Category 6 → test: 100%|██████████| 14/14 [00:00<00:00, 32.05it/s]

✅ 類別 Category 6 分割完成：Train=49, Val=7, Test=14

🎉 所有分類皆已完成『以病人ID為單位』的資料分割！





In [None]:
# print img size
# pip install opencv-python
import cv2

img_path = "/home/stoneyew/Desktop/PACS/datasets/Category 1/XA2015110006335/XA2015110006335-2.jpg"
img = cv2.imread(img_path)
print(img.shape)  # (height, width, channels)

(2294, 1914, 3)


In [10]:
import os
import pandas as pd
from collections import defaultdict

def load_patient_descriptions():
    """
    Load patient descriptions from CATEGORY CSV files.
    Returns a dictionary mapping patient_id to description.
    """
    descriptions = {}
    base_datasets_path = "/home/stoneyew/Desktop/PACS/datasets/"
    
    # Try to load descriptions from each category CSV file
    for category_num in range(7):  # Categories 0-6
        csv_path = os.path.join(base_datasets_path, f"Category {category_num}.csv")
        if os.path.exists(csv_path):
            try:
                df = pd.read_csv(csv_path)
                # Assuming the CSV has columns like 'patient_id' and 'description'
                # Adjust column names based on actual CSV structure
                for _, row in df.iterrows():
                    patient_id = str(row.iloc[0])  # Assuming first column is patient ID
                    description = str(row.iloc[-1]) if len(row) > 1 else "No description"  # Last column as description
                    descriptions[patient_id] = description
                print(f"Loaded {len(df)} descriptions from {csv_path}")
            except Exception as e:
                print(f"Warning: Could not load descriptions from {csv_path}: {e}")
        else:
            print(f"Description file not found: {csv_path}")
    
    return descriptions

def generate_multiview_csv_from_folders(base_dir, output_csv):
    """
    直接從資料夾結構掃描影像，生成多視角模型所需的寬格式 CSV 檔案。
    and go to "/home/stoneyew/Desktop/PACS/datasets/CATEGORY #.csv" get description of each patient

    Args:
        base_dir (str): 要掃描的基礎目錄 (例如: train/ 或 val/)。
        output_csv (str): 輸出的寬格式 CSV 檔案路徑。
    """
    # Load patient descriptions
    patient_descriptions = load_patient_descriptions()
    
    # 使用 defaultdict 來讓我們可以方便地為新病人建立字典
    patients_data = defaultdict(dict)

    print(f"正在掃描目錄: {base_dir}")
    # 遍歷第一層：Category 資料夾
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):
            continue

        # 從 'Category X' 中提取標籤數字
        try:
            label = int(category.replace("Category ", ""))
            print(f"處理 Category {label} 資料夾")
            if label == 6:
                print(f"跳過 Category {label} 資料夾")
                continue # 跳過 Category 0 和 6 和 2
            # else:
            #     label -=1  # 調整標籤，將 Category 3,4,5 分別變成 2,3,4
            # elif label == 0 or label == 4 or label == 5:
            #     label = 1
            # else:
            #     label = 0
        except ValueError:
            print(f"警告：跳過無法識別的資料夾名稱 '{category}'")
            continue

        # 遍歷第二層：Patient 資料夾
        for patient_folder in os.listdir(category_path):
            patient_path = os.path.join(category_path, patient_folder)
            if not os.path.isdir(patient_path):
                continue
            
            # 使用相對路徑作為病人的唯一 ID
            patient_id = os.path.join(category, patient_folder)
            
            # 設定該病人的標籤
            patients_data[patient_id]['label'] = label
            
            # Add patient description
            # Extract just the patient folder name for description lookup
            patient_name = patient_folder
            description = patient_descriptions.get(patient_name, "No description available")
            patients_data[patient_id]['description'] = description

            # 遍歷第三層：影像檔案
            for img_file in os.listdir(patient_path):
                if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    # 從檔名中提取視角 (例如: 'L-MLO.jpg' -> 'L-MLO')
                    view = os.path.splitext(img_file)[0]
                    # if view == 'L-CC' :
                    #     view = 'inputs_cc'
                    # elif view == 'R-CC':
                    #     continue
                    # elif view == 'L-MLO':
                    #     view = 'inputs_mlo'
                    # elif view == 'R-MLO':
                    #     continue
                    # 儲存該視角的相對路徑
                    relative_path = os.path.join(patient_id, img_file)
                    patients_data[patient_id][view] = relative_path

    if not patients_data:
        print(f"警告：在 {base_dir} 中沒有找到任何影像資料。")
        return

    # 將字典轉換為 DataFrame
    df = pd.DataFrame.from_dict(patients_data, orient='index')

    # 重新排列欄位，確保順序正確，並包含 description
    required_columns = ['L-CC', 'R-CC', 'L-MLO', 'R-MLO', 'label', 'description']
    # required_columns =['inputs_cc', 'inputs_mlo', 'label', 'description']
    df = df.reindex(columns=required_columns)

    # 移除缺少任何一個視角的病人資料（可選，但建議）
    # Note: We don't drop rows with missing descriptions, only missing image views
    original_count = len(df)
    # Only check for missing image columns, not description
    image_columns = ['L-CC', 'R-CC', 'L-MLO', 'R-MLO']
    df_images_complete = df.dropna(subset=image_columns)
    dropped_count = original_count - len(df_images_complete)
    if dropped_count > 0:
        print(f"移除了 {dropped_count} 位缺少完整四視角影像的病人。")
    
    # Use the filtered dataframe
    df = df_images_complete

    # 儲存為 CSV
    df.to_csv(output_csv, index=False)
    print(f"多視角 CSV 檔案已儲存: {output_csv} (共 {len(df)} 位病人)")
    print("部分資料預覽：")
    print(df.head())


if __name__ == '__main__':
    # --- 請根據您的資料夾結構修改基礎路徑 ---
    base_dir = "dataset_split"

    # 為 train 資料夾產生多視角 CSV
    train_dir = os.path.join(base_dir, "train")
    output_train_csv = os.path.join(base_dir, "train_labels.csv")
    print("\n--- 正在處理 Train 資料夾 ---")
    generate_multiview_csv_from_folders(train_dir, output_train_csv)

    # 為 val 資料夾產生多視角 CSV
    val_dir = os.path.join(base_dir, "val")
    output_val_csv = os.path.join(base_dir, "val_labels.csv")
    print("\n--- 正在處理 Val 資料夾 ---")
    generate_multiview_csv_from_folders(val_dir, output_val_csv)

    # (可選) 為 test 資料夾產生多視角 CSV
    test_dir = os.path.join(base_dir, "test")
    if os.path.exists(test_dir):
        output_test_csv = os.path.join(base_dir, "test_labels.csv")
        print("\n--- 正在處理 Test 資料夾 ---")
        generate_multiview_csv_from_folders(test_dir, output_test_csv)


--- 正在處理 Train 資料夾 ---
Loaded 1197 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 0.csv
Loaded 499 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 1.csv
Loaded 499 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 2.csv
Loaded 499 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 3.csv
Loaded 262 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 4.csv
Loaded 38 descriptions from /home/stoneyew/Desktop/PACS/datasets/Category 5.csv
Description file not found: /home/stoneyew/Desktop/PACS/datasets/Category 6.csv
正在掃描目錄: dataset_split/train
處理 Category 0 資料夾
處理 Category 3 資料夾
處理 Category 1 資料夾
處理 Category 6 資料夾
跳過 Category 6 資料夾
處理 Category 4 資料夾
處理 Category 5 資料夾
處理 Category 2 資料夾
多視角 CSV 檔案已儲存: dataset_split/train_labels.csv (共 2094 位病人)
部分資料預覽：
                                                           L-CC  \
Category 0/XA2017040012532  Category 0/XA2017040012532/L-CC.jpg   
Category 0/XA2017020000760  Category 

In [None]:

import cv2
import numpy as np
import matplotlib.pyplot as plt


def preprocess_and_autocrop(image_path, threshold_offset=-50, padding_percent=0.08, min_area_ratio=0.01):
    """
    預處理並自動裁切乳房X光影像
    
    參數:
        image_path: 輸入影像路徑
        threshold_offset: 閾值偏移量（負值=更寬鬆，預設-20）
        padding_percent: 裁切邊距百分比（預設0.05=5%）
        min_area_ratio: 最小輪廓面積比例（預設0.02=2%）
    
    返回:
        original: 原始灰階影像
        enhanced: CLAHE 增強後的影像
        mask: 偵測到的乳房遮罩
        cropped: 自動裁切後的影像
    """
    
    # 讀取影像
    original = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if original is None:
        raise ValueError(f"無法讀取影像: {image_path}")
    
    # 1. CLAHE 對比度增強
    clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
    enhanced = clahe.apply(original)
    
    # 2. 影像預處理
    blurred = cv2.GaussianBlur(enhanced, (5, 5), 0)
    
    # 3. 二值化 - 使用更低的閾值來包含暗區（如乳頭）
    # 先用 Otsu 得到基準閾值
    otsu_thresh, _ = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # 大幅降低閾值以包含乳頭等暗區
    lower_thresh = max(5, otsu_thresh + threshold_offset)
    _, binary = cv2.threshold(blurred, lower_thresh, 255, cv2.THRESH_BINARY)
    
    # 4. 形態學處理 - 更激進的閉運算
    kernel_large = np.ones((11, 11), np.uint8)
    kernel_small = np.ones((5, 5), np.uint8)
    
    # 更多次閉運算，填補更多區域
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_large, iterations=5)
    # 減少開運算次數，保留更多區域
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel_small, iterations=1)
    
    # 5. 尋找輪廓
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # 6. 過濾並選擇最大輪廓 - 降低最小面積要求
    min_area = original.shape[0] * original.shape[1] * min_area_ratio
    valid_contours = [cnt for cnt in contours if cv2.contourArea(cnt) > min_area]
    
    if len(valid_contours) == 0:
        raise ValueError("未找到有效輪廓")
    
    largest_contour = max(valid_contours, key=cv2.contourArea)
    
    # 7. 計算凸包，包含更完整的區域
    hull = cv2.convexHull(largest_contour)
    
    # 8. 創建遮罩 - 使用凸包確保包含完整乳房區域
    mask = np.zeros_like(original)
    cv2.drawContours(mask, [hull], -1, 255, thickness=cv2.FILLED)
    
    # 9. 額外處理：填補內部孔洞（如乳頭區域）
    # 尋找所有內部輪廓
    contours_all, hierarchy = cv2.findContours(mask.copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    # 填補所有內部孔洞
    for i in range(len(contours_all)):
        if hierarchy[0][i][3] != -1:  # 如果有父輪廓（即內部孔洞）
            cv2.drawContours(mask, contours_all, i, 255, thickness=cv2.FILLED)
    
    # 10. 擴張遮罩邊緣（更溫和的擴張以保留細節）
    kernel_dilate = np.ones((15, 15), np.uint8)
    mask = cv2.dilate(mask, kernel_dilate, iterations=3)
    
    # 10. 羽化邊緣
    mask_blurred = cv2.GaussianBlur(mask, (21, 21), 0)
    
    # 11. 獲取邊界框進行裁切
    x, y, w, h = cv2.boundingRect(hull)
    
    # 添加更大的邊距（基於影像尺寸的百分比）
    padding_x = int(original.shape[1] * padding_percent)
    padding_y = int(original.shape[0] * padding_percent)
    
    x = max(0, x - padding_x)
    y = max(0, y - padding_y)
    w = min(original.shape[1] - x, w + 2 * padding_x)
    h = min(original.shape[0] - y, h + 2 * padding_y)
    
    # 12. 裁切影像和遮罩
    cropped = enhanced[y:y+h, x:x+w]

    return original, enhanced, mask, cropped


def display_results(original, enhanced, mask, cropped):
    """
    顯示處理結果
    """
    plt.figure(figsize=(15, 6))
    
    plt.subplot(1, 4, 1)
    plt.title("Original Image")
    plt.imshow(original, cmap='gray')
    plt.axis('off')
    
    plt.subplot(1, 4, 2)
    plt.title("After CLAHE")
    plt.imshow(enhanced, cmap='gray')
    plt.axis('off')
    
    plt.subplot(1, 4, 3)
    plt.title("Detected Mask")
    plt.imshow(mask, cmap='gray')
    plt.axis('off')
    
    plt.subplot(1, 4, 4)
    plt.title("Auto-cropped Image")
    plt.imshow(cropped, cmap='gray')
    plt.axis('off')
    
    plt.tight_layout()
    plt.show()


# === 使用範例 ===
if __name__ == "__main__":
    # === 測試 ===
    image_path = "datasets/test/Category 0/XA2016010003906/R-CC.jpg"
    
    try:    
        original, enhanced, mask, cropped = preprocess_and_autocrop(
            image_path,
            threshold_offset=-50,    # 更寬鬆的閾值（預設-20）
            padding_percent=0.08,    # 更大的邊距（預設0.05=5%）
            min_area_ratio=0.01      # 更低的最小面積（預設0.02=2%）
        )
        
        # === 顯示結果 ===
        plt.figure(figsize=(15, 6))
        plt.subplot(1, 4, 1); plt.title("Original Image"); plt.imshow(original, cmap='gray'); plt.axis('off')
        plt.subplot(1, 4, 2); plt.title("After CLAHE"); plt.imshow(enhanced, cmap='gray'); plt.axis('off')
        plt.subplot(1, 4, 3); plt.title("Detected Mask"); plt.imshow(mask, cmap='gray'); plt.axis('off')
        plt.subplot(1, 4, 4); plt.title("Auto-cropped Image"); plt.imshow(cropped, cmap='gray'); plt.axis('off')
        plt.tight_layout(); plt.show()
        
        # 或使用封裝好的函數
        # display_results(original, enhanced, mask, cropped)
        
        # 輸出統計資訊
        print(f"原始影像尺寸: {original.shape}")
        print(f"裁切後影像尺寸: {cropped.shape}")
        print(f"有效像素數: {np.sum(mask > 0)}")
        print(f"遮罩覆蓋率: {np.sum(mask > 0) / (mask.shape[0] * mask.shape[1]) * 100:.2f}%")
        
    except Exception as e:
        print(f"錯誤: {e}")

In [12]:
import os
import cv2
import numpy as np

def save_cropped_images(cropped, image_path, output_root):
    """
    保存裁切後的影像，並維持與原始資料夾相同的結構。
    """
    relative_path = os.path.relpath(image_path, start="datasets")
    output_dir = os.path.join(output_root, os.path.dirname(relative_path))
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    cv2.imwrite(os.path.join(output_dir, f"{base_name}.jpg"), cropped)

def process_dataset(input_root, output_root):
    """
    遍歷資料夾中的所有影像，進行預處理並保存結果。
    """
    for root, _, files in os.walk(input_root):
        for file in files:
            if file.lower().endswith(('.jpg', '.png', '.jpeg', '.bmp', '.tiff')):
                image_path = os.path.join(root, file)
                try:
                    original, enhanced, mask, cropped = preprocess_and_autocrop(image_path)
                    save_cropped_images(cropped, image_path, output_root)
                    print(f"處理完成: {image_path}")
                except Exception as e:
                    print(f"處理失敗: {image_path}, 錯誤: {e}")

# === 主程式 ===
if __name__ == "__main__":
    input_root = "datasets"  # 原始資料夾
    output_root = "cropped_datasets"  # 預處理後的資料夾
    process_dataset(input_root, output_root)

處理完成: datasets/test/Category 0/XA2017060007953/L-MLO.jpg
處理完成: datasets/test/Category 0/XA2017060007953/L-CC.jpg
處理完成: datasets/test/Category 0/XA2017060007953/R-MLO.jpg
處理完成: datasets/test/Category 0/XA2017060007953/R-CC.jpg
處理完成: datasets/test/Category 0/XA2017070017242/L-MLO.jpg
處理完成: datasets/test/Category 0/XA2017070017242/L-CC.jpg
處理完成: datasets/test/Category 0/XA2017070017242/R-MLO.jpg
處理完成: datasets/test/Category 0/XA2017070017242/R-CC.jpg
處理完成: datasets/test/Category 0/XA2018060001873/L-MLO.jpg
處理完成: datasets/test/Category 0/XA2018060001873/L-CC.jpg
處理完成: datasets/test/Category 0/XA2018060001873/R-MLO.jpg
處理完成: datasets/test/Category 0/XA2018060001873/R-CC.jpg
處理完成: datasets/test/Category 0/XA2017100011698/L-MLO.jpg
處理完成: datasets/test/Category 0/XA2017100011698/L-CC.jpg
處理完成: datasets/test/Category 0/XA2017100011698/R-MLO.jpg
處理完成: datasets/test/Category 0/XA2017100011698/R-CC.jpg
處理完成: datasets/test/Category 0/XA2016010006769/L-MLO.jpg
處理完成: datasets/test/Category 0/XA201601