In [1]:
import csv, glob, os, re

base = "../outputs/ce_mri_images"
rows = []

for lbl in os.listdir(base):
    p = os.path.join(base, lbl)
    if not os.path.isdir(p): 
        continue
    
    for f in glob.glob(os.path.join(p, "*.png")):
        fname = os.path.basename(f)
        
        # Extract PID from filename (format: pid{PID}_originalname.png)
        # PID can be numeric (e.g., pid100360_1.png) or alphanumeric (e.g., pidMR0402480D_2376.png)
        pid_match = re.match(r'pid([^_]+)_(.+)\.png', fname)
        if pid_match:
            pid = pid_match.group(1)  # Keep as string (can be numeric or alphanumeric)
            orig_name = pid_match.group(2)
        else:
            pid = None
            orig_name = os.path.splitext(fname)[0]
        
        rows.append([fname, lbl, pid, orig_name])

# Save metadata CSV to outputs root (better organization)
metadata_path = "../outputs/metadata.csv"
with open(metadata_path, "w", newline="") as fh:
    writer = csv.writer(fh)
    writer.writerow(["filename", "label", "patient_id", "original_mat_name"])
    writer.writerows(rows)

print(f"Saved metadata to: {metadata_path}")
print(f"Total images: {len(rows)}")
print(f"Unique patients: {len(set(r[2] for r in rows if r[2] is not None))}")
print(f"Images without PID: {sum(1 for r in rows if r[2] is None)}")

# Show a few sample rows
if rows:
    print("\nSample entries:")
    for row in rows[:5]:
        print(f"  {row}")

Saved metadata to: ../outputs/metadata.csv
Total images: 3064
Unique patients: 233
Images without PID: 0

Sample entries:
  ['pid101029_1465.png', '3', '101029', '1465']
  ['pid105187_1561.png', '3', '105187', '1561']
  ['pid101017_1643.png', '3', '101017', '1643']
  ['pid101145_1652.png', '3', '101145', '1652']
  ['pid111075_1233.png', '3', '111075', '1233']
