In [3]:
import os
from pathlib import Path

from PIL import Image

# Progress bars are optional: use the real tqdm when it is installed,
# otherwise fall back to a stand-in with a compatible call shape.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x, **kwargs):
        """No-op replacement for tqdm: ignore the options and return x unchanged."""
        return x


def process_covid_xray_dataset(
    raw_root,
    processed_root,
    img_size=64,
    valid_exts=(".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"),
):
    """
    Convert every image under raw_root to a square grayscale copy.

    Walks raw_root recursively, converts each matching image to 8-bit
    grayscale (mode "L"), resizes it to img_size x img_size with bilinear
    resampling (the aspect ratio is not preserved), and saves it under
    processed_root at the same relative path, keeping the file name and
    suffix.

    Parameters
    ----------
    raw_root : str or path-like
        Folder holding the original images. Must exist.
    processed_root : str or path-like
        Folder that receives the converted images; created if missing.
        If it lies inside raw_root, its sub-tree is skipped during the walk
        so that a re-run does not pick up earlier outputs.
    img_size : int
        Side length, in pixels, of the square output images. Must be positive.
    valid_exts : str or iterable of str
        File-name suffixes treated as images; matched case-insensitively.

    Raises
    ------
    FileNotFoundError
        If raw_root does not exist.
    ValueError
        If img_size is not positive.

    Notes
    -----
    A file that cannot be opened, converted or saved is reported with a
    warning on stdout and skipped; it does not abort the run.
    """
    raw_root = Path(raw_root)
    processed_root = Path(processed_root)

    if not raw_root.exists():
        raise FileNotFoundError(f"raw_root does not exist: {raw_root}")
    # Fail once, up front, instead of emitting one resize warning per image.
    if img_size <= 0:
        raise ValueError(f"img_size must be a positive integer, got {img_size!r}")

    # str.endswith only accepts a str or a tuple of str, and file names are
    # lower-cased before matching, so normalise whatever the caller passed.
    if isinstance(valid_exts, str):
        valid_exts = (valid_exts,)
    valid_exts = tuple(ext.lower() for ext in valid_exts)

    print(f"Raw dataset root      : {raw_root}")
    print(f"Processed dataset root: {processed_root}")
    processed_root.mkdir(parents=True, exist_ok=True)

    # Only a strict descendant is pruned: if both roots are the same folder,
    # the images are still processed in place as before.
    out_resolved = processed_root.resolve()
    skip_output_tree = raw_root.resolve() in out_resolved.parents

    # Collect all image paths
    image_paths = []
    for root, dirs, files in os.walk(raw_root):
        if skip_output_tree:
            # Editing dirs in place stops os.walk from descending into it.
            dirs[:] = [d for d in dirs if (Path(root) / d).resolve() != out_resolved]
        image_paths.extend(
            Path(root) / fname
            for fname in files
            if fname.lower().endswith(valid_exts)
        )

    print(f"Found {len(image_paths)} image files to process.")
    if not image_paths:
        return

    for img_path in tqdm(image_paths, desc="Processing images"):
        # Mirror the location relative to raw_root under processed_root.
        out_path = processed_root / img_path.relative_to(raw_root)
        out_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            with Image.open(img_path) as img:
                gray = img.convert("L")
                resized = gray.resize((img_size, img_size), resample=Image.BILINEAR)
                # The output format follows the original suffix of out_path.
                resized.save(out_path)
        except Exception as e:
            # Best effort: report the bad file and continue with the rest.
            print(f"Warning: failed to process {img_path}: {e}")


# ------------------------------------------------------------------
# Example usage (Colab or local): paths are resolved against the current working directory
# ------------------------------------------------------------------

if __name__ == "__main__":
    # Input and output folders are both looked up in the current working
    # directory; rename them here if your layout differs.
    RAW_ROOT = os.path.join(os.getcwd(), "Data")
    PROCESSED_ROOT = os.path.join(os.getcwd(), "COVID_XRay_64_gray")

    process_covid_xray_dataset(RAW_ROOT, PROCESSED_ROOT, img_size=64)

Raw dataset root      : /Users/linzhao/Desktop/Semi-synthetic Data/Data
Processed dataset root: /Users/linzhao/Desktop/Semi-synthetic Data/COVID_XRay_64_gray
Found 6432 image files to process.


Processing images: 100%|████████████████████| 6432/6432 [01:05<00:00, 98.86it/s]


In [2]:
import os
import torch
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

# Folder written by the preprocessing step (grayscale images, one sub-folder
# per split and class).
root = os.path.join(os.getcwd(), 'COVID_XRay_64_gray')

# final output: a single .pt file stored next to the processed images
save_path = os.path.join(os.getcwd(), 'COVID_XRay_64_gray', 'processed_covid_images.pt')

# ToTensor turns an 8-bit PIL image into a float32 tensor of shape (C, H, W)
# with values scaled to [0, 1]; no further normalisation is applied here.
to_tensor = transforms.Compose([
    transforms.ToTensor(),
])

# Same suffixes the preprocessing step accepts, so that nothing it wrote is
# silently left out of the tensor.
image_exts = (".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff")

images = []
paths  = []

for split in ["train", "test"]:
    split_path = os.path.join(root, split)
    for subtype in ["COVID19", "NORMAL", "PNEUMONIA"]:
        class_dir = os.path.join(split_path, subtype)
        if not os.path.exists(class_dir):
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order of the saved tensor reproducible across runs and machines.
        for fname in tqdm(sorted(os.listdir(class_dir)), desc=f"Loading {split}/{subtype}"):
            if not fname.lower().endswith(image_exts):
                continue

            img_path = os.path.join(class_dir, fname)

            # The context manager closes the file handle once the pixels have
            # been read; convert("L") is a safeguard in case a file is not
            # single-channel.
            with Image.open(img_path) as pil_img:
                img = to_tensor(pil_img.convert("L"))

            images.append(img)
            paths.append(img_path)

# torch.stack fails with an unhelpful message on an empty list, so report the
# actual problem instead.
if not images:
    raise RuntimeError(f"No images found under {root}; run the preprocessing step first.")

# stack into a single tensor with a leading sample dimension
X_real = torch.stack(images)
print("Final tensor shape:", X_real.shape)

# save the tensor together with the source path of every row
torch.save({"images": X_real, "paths": paths}, save_path)
print("Saved to:", save_path)

Loading train/COVID19: 100%|████████████████| 460/460 [00:00<00:00, 5883.26it/s]
Loading train/NORMAL: 100%|███████████████| 1266/1266 [00:00<00:00, 5934.03it/s]
Loading train/PNEUMONIA: 100%|████████████| 3418/3418 [00:00<00:00, 7027.18it/s]
Loading test/COVID19: 100%|█████████████████| 116/116 [00:00<00:00, 5546.00it/s]
Loading test/NORMAL: 100%|██████████████████| 317/317 [00:00<00:00, 6281.09it/s]
Loading test/PNEUMONIA: 100%|███████████████| 855/855 [00:00<00:00, 7285.39it/s]


Final tensor shape: torch.Size([6432, 1, 64, 64])
Saved to: /Users/linzhao/Desktop/Semi-synthetic Data/processed_covid_images.pt
