In [8]:
from deblur3d.data import MultiPageTiffDataset

# Dataset configuration. The manifest is an .xlsx file here; per the original
# note, .parquet/.csv manifests are accepted as well.
dataset_kwargs = dict(
    manifest_path=r"T:\users\taki\Dataset_L\index_single_8bit.xlsx",
    patch_size=(96, 128, 128),
    filter_query=None,   # optional
    random_subset=0.8,   # optional
)
ds = MultiPageTiffDataset(**dataset_kwargs)

# Fetch the first (sharp, blurred) pair and report the sharp patch's shape
# together with its minimum and maximum values.
first_sample = ds[0]
sharp, blurred = first_sample
sharp_min = sharp.min().item()
sharp_max = sharp.max().item()
print(sharp.shape, sharp_min, sharp_max)


torch.Size([96, 128, 128]) 0.0 1.0


In [6]:
import os
import numpy as np
import pandas as pd

def _bpp_from_dtype(dtype_str: str) -> int:
    s = str(dtype_str).lower()
    if   "uint8" in s or "int8" in s:   return 1
    elif "uint16" in s or "int16" in s: return 2
    elif "float32" in s or "single" in s or "uint32" in s or "int32" in s: return 4
    elif "float64" in s or "double" in s or "uint64" in s or "int64" in s: return 8
    # default fallback
    return 1

def inspect_mp_tiff_dataset(ds, check_disk=True):
    """Summarise the volumes of a dataset and print size statistics.

    Args:
        ds: object exposing ``vols``, an iterable of volume records. Each record
            must provide ``n_slices``, ``H``, ``W`` and ``tif_path``; an optional
            ``dtype_str`` attribute selects the bytes-per-voxel (``"uint8"`` is
            assumed when it is absent).
        check_disk: when true, also ``os.stat`` every ``tif_path`` and report
            the actual file sizes. Files that cannot be stat'ed are recorded
            as NaN and excluded from the on-disk statistics.

    Returns:
        ``(df, result)`` where ``df`` has one row per volume with the columns
        ``path, D, H, W, dtype, bpp, est_size_gb`` (plus ``ondisk_gb`` when
        ``check_disk`` is true), and ``result`` is a dict with
        ``volumes_count``, ``estimated`` and, when ``check_disk`` is true,
        ``ondisk`` (``None`` if no file could be stat'ed).

    Sizes are reported in decimal gigabytes (bytes / 1e9). An empty dataset
    yields an empty frame and zero/None statistics instead of raising.
    """
    columns = ["path", "D", "H", "W", "dtype", "bpp", "est_size_gb"]

    # Build one record per volume from ds.vols.
    rows = []
    for v in ds.vols:
        dtype_name = getattr(v, "dtype_str", "uint8")
        bpp = _bpp_from_dtype(dtype_name)
        depth, height, width = int(v.n_slices), int(v.H), int(v.W)
        rows.append({
            "path": str(v.tif_path),
            "D": depth, "H": height, "W": width,
            "dtype": dtype_name,
            "bpp": bpp,
            "est_size_gb": (depth * height * width * bpp) / 1e9,
        })
    # Explicit columns keep the frame well-formed even when there are no rows;
    # without them the column lookups below raise KeyError on an empty dataset.
    df = pd.DataFrame(rows, columns=columns)

    # Overall estimated stats (the cast keeps an empty column numeric).
    est = df["est_size_gb"].astype(float)
    n = len(df)
    total_est = float(est.sum())
    mean_est = float(est.mean()) if n else np.nan
    std_est = float(est.std(ddof=1)) if n >= 2 else np.nan  # sample std needs >= 2 rows

    result = {
        "volumes_count": n,
        "estimated": {
            "total_gb": round(total_est, 4),
            "mean_gb":  (None if np.isnan(mean_est) else round(mean_est, 4)),
            "std_gb":   (None if np.isnan(std_est)  else round(std_est, 4)),
        }
    }

    # Optional: on-disk stats from actual files
    if check_disk:
        def _file_gb(p):
            # Missing or unreadable files are reported as NaN rather than aborting the scan.
            try:
                return os.stat(p).st_size / 1e9
            except (OSError, ValueError):
                return np.nan
        df["ondisk_gb"] = df["path"].map(_file_gb).astype(float)

        valid = df["ondisk_gb"].notna()
        n_disk = int(valid.sum())
        if n_disk:
            on_disk = df.loc[valid, "ondisk_gb"]
            total_disk = float(on_disk.sum())
            mean_disk = float(on_disk.mean())
            std_disk = float(on_disk.std(ddof=1)) if n_disk >= 2 else np.nan
            result["ondisk"] = {
                "volumes_with_files": n_disk,
                "total_gb": round(total_disk, 4),
                "mean_gb":  round(mean_disk, 4),
                "std_gb":   (None if np.isnan(std_disk) else round(std_disk, 4)),
            }
        else:
            result["ondisk"] = None

    # Pretty print
    print(f"Volumes: {result['volumes_count']}")
    print(f"Estimated (from voxels): total={result['estimated']['total_gb']} GB, "
          f"mean={result['estimated']['mean_gb']} GB, std={result['estimated']['std_gb']}")
    if check_disk and result.get("ondisk"):
        od = result["ondisk"]
        print(f"On-disk: total={od['total_gb']} GB, mean={od['mean_gb']} GB, std={od['std_gb']} "
              f"(from {od['volumes_with_files']} files)")
    return df, result


In [9]:
# Prerequisite: `ds` must already exist, e.g.
# ds = MultiPageTiffDataset(manifest_path=..., ...)

df_summary, stats = inspect_mp_tiff_dataset(ds, check_disk=True)

# Preview the leading volumes, limited to their geometry, dtype and estimated size.
preview_columns = ["path", "D", "H", "W", "dtype", "est_size_gb"]
df_summary[preview_columns].head()


Volumes: 374
Estimated (from voxels): total=54.3543 GB, mean=0.1453 GB, std=0.1217
On-disk: total=54.378 GB, mean=0.1454 GB, std=0.1217 (from 374 files)


Unnamed: 0,path,D,H,W,dtype,est_size_gb
0,T:\users\taki\Dataset_L\S0437.tif,138,1453,1320,uint8,0.264678
1,T:\users\taki\Dataset_L\S0341.tif,302,345,282,uint8,0.029382
2,T:\users\taki\Dataset_L\S0135.tif,310,311,323,uint8,0.03114
3,T:\users\taki\Dataset_L\S0091.tif,548,619,733,uint8,0.248642
4,T:\users\taki\Dataset_L\S0379.tif,169,1023,1568,uint8,0.271087


In [10]:
import hashlib
import pandas as pd
from pathlib import Path

def stable_hash(x: str) -> int:
    """Deterministically map a string to a non-negative integer.

    The text is UTF-8 encoded, hashed with MD5, and the 16-byte digest is read
    as a single big-endian integer (the same value as parsing the hex digest
    in base 16).
    """
    digest = hashlib.md5(x.encode("utf-8")).digest()
    return int.from_bytes(digest, "big")

def make_train_val_hash(manifest_path, out_path, group_col="group_id", val_frac=0.2):
    """Assign a deterministic, group-wise train/val split and save the manifest.

    Every distinct value of ``group_col`` (compared by its string form) is
    hashed with ``stable_hash`` to a number in [0, 1); groups whose number is
    below ``val_frac`` become ``"val"``, the rest ``"train"``. All rows of a
    group therefore share a split, and the assignment does not change between
    runs. Because the threshold applies to groups, the fraction of *rows* in
    ``"val"`` can differ from ``val_frac``.

    Args:
        manifest_path: input manifest; ``.parquet`` and ``.csv`` are read with
            the matching pandas reader, any other suffix with ``pd.read_excel``.
        out_path: output file (``str`` or ``Path``). A ``.parquet`` suffix
            (case-insensitive) writes Parquet, anything else writes CSV
            without the index.
        group_col: column holding the grouping key. If it is not present, a
            ``group_id`` column is derived from the parent directory of
            ``output_tif`` and used instead.
        val_frac: threshold on the per-group hash value.

    Returns:
        The manifest DataFrame with an added ``split`` column (and
        ``group_id`` when it had to be derived).

    Raises:
        KeyError: if neither ``group_col`` nor ``output_tif`` is a column.
    """
    # load
    src = Path(manifest_path)
    suffix = src.suffix.lower()
    if suffix == ".parquet":
        df = pd.read_parquet(src)
    elif suffix == ".csv":
        df = pd.read_csv(src)
    else:
        df = pd.read_excel(src)

    # derive a grouping key if the manifest does not have one
    if group_col not in df.columns:
        if "output_tif" not in df.columns:
            raise KeyError(
                f"group column {group_col!r} not found and there is no "
                "'output_tif' column to derive one from"
            )
        # group by parent directory of output_tif
        df["group_id"] = df["output_tif"].astype(str).apply(lambda s: str(Path(s).parent))
        group_col = "group_id"

    # Stringify once and use the same keys for building AND looking up the
    # table; mapping the raw column would raise KeyError for non-string
    # group values (ints, NaN, ...).
    group_keys = df[group_col].astype(str)

    # deterministic hash in [0,1)
    g2r = {g: (stable_hash(g) % 10_000_000) / 10_000_000.0 for g in group_keys.unique()}

    # assign split by group
    df["split"] = group_keys.map(lambda g: "val" if g2r[g] < val_frac else "train")

    # optional: sanity check balance on ds_factor (if present)
    if "ds_factor" in df.columns:
        print(df.groupby(["split", "ds_factor"]).size())

    # Path() accepts both str and Path, unlike str.endswith.
    dst = Path(out_path)
    if dst.suffix.lower() == ".parquet":
        df.to_parquet(dst)
    else:
        df.to_csv(dst, index=False)
    return df


In [18]:
# A) deterministic hash split
# Rows are grouped by `project_path`; if that column is missing from the
# manifest, make_train_val_hash falls back to the parent directory of
# `output_tif`.
# NOTE: val_frac is a threshold on per-group hash values, so the share of rows
# in "val" can differ from it (the recorded run placed 49 of 467 rows in val).
split_kwargs = dict(
    manifest_path=r"T:\users\taki\Dataset_L\index_single_8bit.xlsx",
    out_path=r"T:\users\taki\Dataset_L\index_with_split.parquet",
    group_col="project_path",
    val_frac=0.30,
)
df_split = make_train_val_hash(**split_kwargs)

split  ds_factor
train  1            188
       2             65
       4            165
val    1              4
       2              3
       4             42
dtype: int64
