# AMOS22 Dataset Explorer (Folder Structure & File Inventory)
This notebook scans the AMOS22 dataset directory and summarizes:
- folder/subfolder structure
- file counts by extension and by subfolder
- example filenames
- optional: parses `dataset.json` if present

**Tip:** Start by setting `DATA_ROOT` below to your local AMOS22 path.


In [1]:
# Cell 1 — Setup
from pathlib import Path
import os
import json
from collections import Counter, defaultdict
from datetime import datetime

# TODO: set your local dataset root
DATA_ROOT = Path(r"C:/Users/hyeon/Documents/miniconda_medimg_env/data/amos22")

assert DATA_ROOT.exists(), f"DATA_ROOT does not exist: {DATA_ROOT}"
print("DATA_ROOT:", DATA_ROOT.resolve())
print("Scan time :", datetime.now().isoformat(timespec="seconds"))

DATA_ROOT: C:\Users\hyeon\Documents\miniconda_medimg_env\data\amos22
Scan time : 2026-02-04T09:34:47


In [2]:
# Cell 2 — High-level directory tree (depth-limited)
def iter_tree(root: Path, max_depth: int = 3):
    root = root.resolve()
    out = []
    for p in sorted(root.rglob("*")):
        rel = p.relative_to(root)
        depth = len(rel.parts)
        if depth <= max_depth:
            out.append((rel.as_posix(), "dir" if p.is_dir() else "file"))
    return out

MAX_DEPTH = 4  # increase if needed
tree = iter_tree(DATA_ROOT, max_depth=MAX_DEPTH)

print(f"Items within depth <= {MAX_DEPTH}: {len(tree)}")
for rel, kind in tree[:200]:
    print(f"{kind:4s}  {rel}")
if len(tree) > 200:
    print("... (truncated; increase preview or export to file below)")

Items within depth <= 4: 970
file  dataset.json
dir   imagesTr
file  imagesTr/.DS_Store
file  imagesTr/amos_0001.nii.gz
file  imagesTr/amos_0004.nii.gz
file  imagesTr/amos_0005.nii.gz
file  imagesTr/amos_0006.nii.gz
file  imagesTr/amos_0007.nii.gz
file  imagesTr/amos_0009.nii.gz
file  imagesTr/amos_0010.nii.gz
file  imagesTr/amos_0011.nii.gz
file  imagesTr/amos_0014.nii.gz
file  imagesTr/amos_0015.nii.gz
file  imagesTr/amos_0016.nii.gz
file  imagesTr/amos_0017.nii.gz
file  imagesTr/amos_0019.nii.gz
file  imagesTr/amos_0021.nii.gz
file  imagesTr/amos_0023.nii.gz
file  imagesTr/amos_0024.nii.gz
file  imagesTr/amos_0025.nii.gz
file  imagesTr/amos_0027.nii.gz
file  imagesTr/amos_0030.nii.gz
file  imagesTr/amos_0033.nii.gz
file  imagesTr/amos_0035.nii.gz
file  imagesTr/amos_0036.nii.gz
file  imagesTr/amos_0038.nii.gz
file  imagesTr/amos_0042.nii.gz
file  imagesTr/amos_0043.nii.gz
file  imagesTr/amos_0044.nii.gz
file  imagesTr/amos_0045.nii.gz
file  imagesTr/amos_0047.nii.gz
file  imagesTr/a

In [3]:
# Cell 3 — Full inventory: counts by extension, total sizes, and top folders
def human_bytes(n: int) -> str:
    units = ["B", "KB", "MB", "GB", "TB"]
    f = float(n)
    for u in units:
        if f < 1024 or u == units[-1]:
            return f"{f:.2f} {u}"
        f /= 1024

ext_counter = Counter()
size_counter = Counter()
folder_file_counter = Counter()

all_files = []
for p in DATA_ROOT.rglob("*"):
    if p.is_file():
        ext = "".join(p.suffixes).lower() if p.suffixes else "<no_ext>"
        try:
            sz = p.stat().st_size
        except OSError:
            sz = 0
        ext_counter[ext] += 1
        size_counter[ext] += sz
        # top-level folder group (first component under DATA_ROOT)
        rel = p.relative_to(DATA_ROOT)
        top = rel.parts[0] if rel.parts else "<root>"
        folder_file_counter[top] += 1
        all_files.append(p)

print("Total files:", len(all_files))
print("\nFile types (by extension) — counts:")
for ext, c in ext_counter.most_common(30):
    print(f"{ext:12s}  {c:7d}   ({human_bytes(size_counter[ext])})")
if len(ext_counter) > 30:
    print("... (truncated)")

print("\nTop-level folder file counts:")
for k, v in folder_file_counter.most_common():
    print(f"{k:20s} {v:7d}")

Total files: 964

File types (by extension) — counts:
.nii.gz           960   (22.59 GB)
.json               1   (39.42 KB)
.csv                1   (34.38 KB)
.md                 1   (2.69 KB)
<no_ext>            1   (22.00 KB)

Top-level folder file counts:
imagesTr                 241
imagesTs                 240
labelsTr                 240
imagesVa                 120
labelsVa                 120
dataset.json               1
labeled_data_meta_0000_0599.csv       1
readme.md                  1


In [4]:
# Cell 4 — Detailed subfolder structure summary (directories only)
dir_counts = []
for d in sorted([p for p in DATA_ROOT.rglob("*") if p.is_dir()]):
    rel = d.relative_to(DATA_ROOT)
    # count files directly in this folder (not recursive)
    try:
        direct_files = sum(1 for x in d.iterdir() if x.is_file())
    except OSError:
        direct_files = 0
    dir_counts.append((rel.as_posix(), direct_files))

print("Directories:", len(dir_counts))
for rel, nfiles in dir_counts[:200]:
    print(f"{nfiles:5d} files  {rel}")
if len(dir_counts) > 200:
    print("... (truncated)")

Directories: 6
  241 files  imagesTr
  240 files  imagesTs
  120 files  imagesVa
  240 files  labelsTr
    0 files  labelsTs
  120 files  labelsVa


In [5]:
# Cell 5 — Spot-check: list examples from key folders (if present)
# Adjust these folder names if your dataset uses different ones.
candidates = [
    "imagesTr", "labelsTr",
    "imagesVa", "labelsVa",
    "imagesTs", "labelsTs",
    "raw", "Processed", "processed", "images", "labels",
]

for name in candidates:
    d = DATA_ROOT / name
    if d.exists() and d.is_dir():
        files = sorted([p.name for p in d.iterdir() if p.is_file()])[:20]
        print(f"\n{name}/  exists ✅  (showing up to 20 files)")
        for fn in files:
            print("  ", fn)


imagesTr/  exists ✅  (showing up to 20 files)
   .DS_Store
   amos_0001.nii.gz
   amos_0004.nii.gz
   amos_0005.nii.gz
   amos_0006.nii.gz
   amos_0007.nii.gz
   amos_0009.nii.gz
   amos_0010.nii.gz
   amos_0011.nii.gz
   amos_0014.nii.gz
   amos_0015.nii.gz
   amos_0016.nii.gz
   amos_0017.nii.gz
   amos_0019.nii.gz
   amos_0021.nii.gz
   amos_0023.nii.gz
   amos_0024.nii.gz
   amos_0025.nii.gz
   amos_0027.nii.gz
   amos_0030.nii.gz

labelsTr/  exists ✅  (showing up to 20 files)
   amos_0001.nii.gz
   amos_0004.nii.gz
   amos_0005.nii.gz
   amos_0006.nii.gz
   amos_0007.nii.gz
   amos_0009.nii.gz
   amos_0010.nii.gz
   amos_0011.nii.gz
   amos_0014.nii.gz
   amos_0015.nii.gz
   amos_0016.nii.gz
   amos_0017.nii.gz
   amos_0019.nii.gz
   amos_0021.nii.gz
   amos_0023.nii.gz
   amos_0024.nii.gz
   amos_0025.nii.gz
   amos_0027.nii.gz
   amos_0030.nii.gz
   amos_0033.nii.gz

imagesVa/  exists ✅  (showing up to 20 files)
   amos_0008.nii.gz
   amos_0013.nii.gz
   amos_0018.nii.gz
   amo

In [6]:
# Cell 6 — Optional: Parse dataset.json (if present)
ds_json = None
for cand in [DATA_ROOT / "dataset.json", DATA_ROOT / "Dataset" / "dataset.json"]:
    if cand.exists():
        ds_json = cand
        break

if ds_json is None:
    print("dataset.json not found under DATA_ROOT (searched common locations).")
else:
    print("Found:", ds_json)
    with open(ds_json, "r", encoding="utf-8") as f:
        ds = json.load(f)
    # Print selected keys if present
    for k in ["name", "description", "tensorImageSize", "modality", "labels", "numTraining", "numTest", "license"]:
        if k in ds:
            v = ds[k]
            if isinstance(v, dict) and len(v) > 30:
                print(f"{k}: dict with {len(v)} keys (printing first 10):")
                for i, (kk, vv) in enumerate(v.items()):
                    print(f"  {kk}: {vv}")
                    if i >= 9:
                        break
            else:
                print(f"{k}: {v}")

Found: C:\Users\hyeon\Documents\miniconda_medimg_env\data\amos22\dataset.json
name: AMOS
description: Amos: A large-scale abdominal multi-organ benchmark for versatile medical image segmentation
tensorImageSize: 3D
modality: {'0': 'CT'}
labels: {'0': 'background', '1': 'spleen', '2': 'right kidney', '3': 'left kidney', '4': 'gall bladder', '5': 'esophagus', '6': 'liver', '7': 'stomach', '8': 'arota', '9': 'postcava', '10': 'pancreas', '11': 'right adrenal gland', '12': 'left adrenal gland', '13': 'duodenum', '14': 'bladder', '15': 'prostate/uterus'}
numTraining: 240
numTest: 240


In [7]:
# Cell 7 — Export scan summaries to CSV/JSON (so you can paste/attach results)
import pandas as pd

out_dir = Path("outputs_dataset_scan")
out_dir.mkdir(parents=True, exist_ok=True)

# 1) extension summary
ext_df = pd.DataFrame(
    [{"ext": ext, "count": int(ext_counter[ext]), "total_bytes": int(size_counter[ext])} for ext in ext_counter]
).sort_values(["count", "total_bytes"], ascending=[False, False])
ext_csv = out_dir / "filetypes_by_extension.csv"
ext_df.to_csv(ext_csv, index=False)

# 2) top-level folder summary
top_df = pd.DataFrame(
    [{"top_folder": k, "file_count": int(v)} for k, v in folder_file_counter.items()]
).sort_values("file_count", ascending=False)
top_csv = out_dir / "filecounts_by_topfolder.csv"
top_df.to_csv(top_csv, index=False)

# 3) directory direct-file counts
dir_df = pd.DataFrame(dir_counts, columns=["directory", "direct_file_count"]).sort_values("direct_file_count", ascending=False)
dir_csv = out_dir / "directories_direct_file_counts.csv"
dir_df.to_csv(dir_csv, index=False)

# 4) a lightweight JSON manifest (paths only; no PHI; no file content)
manifest = {
    "data_root": str(DATA_ROOT),
    "total_files": len(all_files),
    "extensions": {k: int(v) for k, v in ext_counter.items()},
    "top_level_folders": {k: int(v) for k, v in folder_file_counter.items()},
    "max_depth_preview": MAX_DEPTH,
    "tree_preview_first_200": [{"kind": kind, "path": rel} for rel, kind in tree[:200]],
}
manifest_json = out_dir / "scan_manifest.json"
manifest_json.write_text(json.dumps(manifest, indent=2), encoding="utf-8")

print("Wrote:")
print(" -", ext_csv.resolve())
print(" -", top_csv.resolve())
print(" -", dir_csv.resolve())
print(" -", manifest_json.resolve())
print("\nYou can paste the printed summaries here, or attach these CSV/JSON outputs.")

Wrote:
 - C:\Users\hyeon\Documents\miniconda_medimg_env\abdomen-multiorgan-segmentation\outputs_dataset_scan\filetypes_by_extension.csv
 - C:\Users\hyeon\Documents\miniconda_medimg_env\abdomen-multiorgan-segmentation\outputs_dataset_scan\filecounts_by_topfolder.csv
 - C:\Users\hyeon\Documents\miniconda_medimg_env\abdomen-multiorgan-segmentation\outputs_dataset_scan\directories_direct_file_counts.csv
 - C:\Users\hyeon\Documents\miniconda_medimg_env\abdomen-multiorgan-segmentation\outputs_dataset_scan\scan_manifest.json

You can paste the printed summaries here, or attach these CSV/JSON outputs.


In [None]:
# Cell 8 — If you need more dataset info (fill in after you inspect results)
# Examples:
# - verify that each image has a matching label (by stem)
# - check label max/min values across labelsTr
# - confirm spacing/orientation for a few cases
#
# Paste requests/questions below, run, and then share outputs.

# YOUR NOTES / QUESTIONS: