In [None]:
import os, os.path as osp, glob
import numpy as np
# import nibabel as nib
import medpy.io as medio
# import SimpleITK as sitk
# import itk
# import matplotlib.pyplot as plt
# %matplotlib inline

In [None]:
# Root folder of the CTPelvic1K data ("~" is expanded to the user's home directory).
# The cells below join their dataset-specific image and mask sub-folders onto P.
P = osp.expanduser("~/data/ctpelvic1k")

# Annotations
Download the mask archives from [CTPelvic1K Dataset](https://zenodo.org/records/4588403#.YEyLq_0zaCo), then unzip each archive into the folder shown:
- CTPelvic1K_dataset1_mask_mappingback.tar.gz -> *dataset1_mask_mappingback/*
- CTPelvic1K_dataset2_mask_mappingback.tar.gz -> *dataset2_mask_mappingback/*
- CTPelvic1K_dataset3_mask_mappingback.tar.gz -> *dataset3_mask_mappingback/*
- CTPelvic1K_dataset4_mask_mappingback.tar.gz -> *dataset4_mask_mappingback/*
- CTPelvic1K_dataset5_mask_mappingback.tar.gz -> *dataset5_mask_mappingback/*
- CTPelvic1K_dataset6_Anonymized_mask.tar.gz  -> *dataset6_mask_mappingback/*
- CTPelvic1K_dataset7_mask_mappingback.tar.gz -> *dataset7_mask_mappingback/*

# 1 [Abdomen](https://www.synapse.org/#!Synapse:syn3376386)

Four zip files:
- Abdomen.zip
- RawData.zip
- Reg-Training-Testing.zip
- Reg-Training-Training.zip

Only use *RawData.zip*. Unzip to *dataset1_abdomen/RawData/*.

In [None]:
# Dataset 1 (Abdomen): for every image that has a matching mask file, load both
# volumes and verify that their array shapes agree.
data_p = osp.join(P, "dataset1_abdomen/RawData")
label_p = osp.join(P, "dataset1_mask_mappingback")
cnt = 0  # number of image/mask pairs whose shapes were verified
for subset in os.listdir(data_p):
    subset_p = osp.join(data_p, subset, "img")
    if not osp.isdir(subset_p):
        # Stray entries in RawData/ (hidden files, leftover archives, folders
        # without an img/ sub-folder) would make os.listdir() raise; skip them.
        continue
    print('\t', subset)
    for f in os.listdir(subset_p):
        # img0001.nii.gz, dataset1_img0001_mask_4label.nii.gz
        fid = f[:-7]  # strip the 7-character ".nii.gz" suffix
        lab_f = osp.join(label_p, f"dataset1_{fid}_mask_4label.nii.gz")
        if osp.isfile(lab_f):
            print(f, end='\r')  # single-line progress indicator
            img, _ = medio.load(osp.join(subset_p, f))
            lab, _ = medio.load(lab_f)
            assert img.shape == lab.shape, f"* diff shape: {img.shape} v.s. {lab.shape}"
            cnt += 1
# Compare the number of verified pairs with the number of entries in the mask folder.
print("\ntotal:", cnt, len(os.listdir(label_p)))

# 2 [ACRIN 6664 (CT COLONOGRAPHY)](https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=3539213)
- (2023.12.4) data not downloaded yet

# 3 [Task10_Colon.tar](https://drive.google.com/file/d/1m7tMpE9qEcQGQjL_BdMD-Mvgmc44hG1Y/view)
Unzip to *dataset3_msd-t10/*.

In [None]:
# Dataset 3 (MSD Task10_Colon): walk both image folders, and for each image with
# a corresponding mask file confirm that image and mask have identical shapes.
data_p = osp.join(P, "dataset3_msd-t10")
label_p = osp.join(P, "dataset3_mask_mappingback")
cnt = 0  # verified image/mask pairs
for subset in ("imagesTr", "imagesTs"):
    print('\t', subset)
    subset_p = osp.join(data_p, subset)
    for f in os.listdir(subset_p):
        # image: colon_001.nii.gz  <->  mask: dataset3_colon_001_mask_4label.nii.gz
        fid = f[:-len(".nii.gz")]
        lab_f = osp.join(label_p, "dataset3_" + fid + "_mask_4label.nii.gz")
        if not osp.isfile(lab_f):
            continue  # no mask file for this entry
        print(f, end='\r')  # overwrite the same console line as progress
        img, _ = medio.load(osp.join(subset_p, f))
        lab, _ = medio.load(lab_f)
        assert img.shape == lab.shape, f"* diff shape: {img.shape} v.s. {lab.shape}"
        cnt += 1
# Verified pairs versus number of entries in the mask folder.
print("\ntotal:", cnt, len(os.listdir(label_p)))

# 4 [neheller/kits19](https://github.com/neheller/kits19)
Follow the [Usage](https://github.com/neheller/kits19#usage) instructions to download the data (using *kits19/starter_code/get_imaging_v2.py*) into *dataset4_kits19/*.

In [None]:
# Dataset 4 (kits19): each case folder must contain imaging.nii.gz; when a mask
# file exists for the case, load both volumes and verify that their shapes agree.
data_p = osp.join(P, "dataset4_kits19")
label_p = osp.join(P, "dataset4_mask_mappingback")
cnt = 0  # number of image/mask pairs whose shapes were verified
for fid in os.listdir(data_p):
    case_p = osp.join(data_p, fid)
    if not osp.isdir(case_p):
        # Plain files next to the case folders (metadata, hidden files) are not
        # cases; without this guard they would trip the assertion below.
        continue
    # case_00014/imaging.nii.gz, dataset4_case_00014_mask_4label.nii.gz
    img_f = osp.join(case_p, "imaging.nii.gz")
    assert osp.isfile(img_f), img_f  # a case folder without its image is an error
    lab_f = osp.join(label_p, f"dataset4_{fid}_mask_4label.nii.gz")
    if osp.isfile(lab_f):
        print(fid, end='\r')  # single-line progress indicator
        img, _ = medio.load(img_f)
        lab, _ = medio.load(lab_f)
        assert img.shape == lab.shape, f"* diff shape: {img.shape} v.s. {lab.shape}"
        cnt += 1
# Compare the number of verified pairs with the number of entries in the mask folder.
print("\ntotal:", cnt, len(os.listdir(label_p)))

# 5 [Cervix](https://www.synapse.org/#!Synapse:syn3378972)

Three zip files:
- Cervix.zip
- CervixRawData.zip
- CervixRegData.zip

Only use *CervixRawData.zip*. Unzip to *dataset5_cervix/RawData/*.

In [None]:
# Dataset 5 (Cervix): for every image that has a matching mask file, load both
# volumes and verify that their array shapes agree.
data_p = osp.join(P, "dataset5_cervix/RawData")
label_p = osp.join(P, "dataset5_mask_mappingback")
cnt = 0  # number of image/mask pairs whose shapes were verified
for subset in os.listdir(data_p):
    subset_p = osp.join(data_p, subset, "img")
    if not osp.isdir(subset_p):
        # Stray entries in RawData/ (hidden files, leftover archives, folders
        # without an img/ sub-folder) would make os.listdir() raise; skip them.
        continue
    print('\t', subset)
    for f in os.listdir(subset_p):
        # 0507688-Image.nii.gz, dataset5_0507688_Image_mask_4label.nii.gz
        # Strip ".nii.gz" and switch '-' to '_' to match the mask naming.
        fid = f[:-7].replace('-', '_')
        lab_f = osp.join(label_p, f"dataset5_{fid}_mask_4label.nii.gz")
        if osp.isfile(lab_f):
            print(f, end='\r')  # single-line progress indicator
            img, _ = medio.load(osp.join(subset_p, f))
            lab, _ = medio.load(lab_f)
            assert img.shape == lab.shape, f"* diff shape: {img.shape} v.s. {lab.shape}"
            cnt += 1
# Compare the number of verified pairs with the number of entries in the mask folder.
print("\ntotal:", cnt, len(os.listdir(label_p)))

# 6 [CTPelvic1K_dataset6_data.tar.gz](https://zenodo.org/records/4588403#.YEyLq_0zaCo)
Download link from [MIRACLE-Center/CTPelvic1K](https://github.com/MIRACLE-Center/CTPelvic1K).
Unzip to *dataset6_clinic/*.

In [None]:
# Dataset 6 (CLINIC): pair each image file with its mask by swapping the
# "_data.nii.gz" suffix for "_mask_4label.nii.gz", then compare array shapes.
data_p = osp.join(P, "dataset6_clinic")
label_p = osp.join(P, "dataset6_mask_mappingback")
cnt = 0  # verified image/mask pairs
for f in os.listdir(data_p):
    # image: dataset6_CLINIC_0001_data.nii.gz  <->  mask: dataset6_CLINIC_0001_mask_4label.nii.gz
    fid = f[:-len("_data.nii.gz")]
    lab_f = osp.join(label_p, fid + "_mask_4label.nii.gz")
    if not osp.isfile(lab_f):
        continue  # no mask file for this entry
    print(f, end='\r')  # overwrite the same console line as progress
    img, _ = medio.load(osp.join(data_p, f))
    lab, _ = medio.load(lab_f)
    assert img.shape == lab.shape, f"* diff shape: {img.shape} v.s. {lab.shape}"
    cnt += 1
# Verified pairs versus number of entries in the mask folder.
print("\ntotal:", cnt, len(os.listdir(label_p)))

# 7 [CTPelvic1K_dataset7_data.tar.gz](https://zenodo.org/records/4588403#.YEyLq_0zaCo)
Download link from [MIRACLE-Center/CTPelvic1K](https://github.com/MIRACLE-Center/CTPelvic1K).
Unzip to *dataset7_clinic_metal/*.

In [None]:
# Dataset 7 (CLINIC metal): the mask name is the image name without its
# "dataset7_" prefix and with "_data.nii.gz" replaced by "_mask_4label.nii.gz".
# Every image that has such a mask is loaded and its shape compared to the mask's.
data_p = osp.join(P, "dataset7_clinic_metal")
label_p = osp.join(P, "dataset7_mask_mappingback")
cnt = 0  # verified image/mask pairs
for f in os.listdir(data_p):
    # image: dataset7_CLINIC_metal_0000_data.nii.gz  <->  mask: CLINIC_metal_0000_mask_4label.nii.gz
    fid = f[len("dataset7_"):-len("_data.nii.gz")]
    lab_f = osp.join(label_p, fid + "_mask_4label.nii.gz")
    if not osp.isfile(lab_f):
        continue  # no mask file for this entry
    print(f, end='\r')  # overwrite the same console line as progress
    img, _ = medio.load(osp.join(data_p, f))
    lab, _ = medio.load(lab_f)
    assert img.shape == lab.shape, f"* diff shape: {img.shape} v.s. {lab.shape}"
    cnt += 1
# Verified pairs versus number of entries in the mask folder.
print("\ntotal:", cnt, len(os.listdir(label_p)))