# Collect Information on Current Datasets

In [1]:
import os
import pydicom

In [2]:
def get_dicom_metadata(dicom_path):
    """Read one DICOM header and return a flat record of selected attributes.

    The file is read with ``stop_before_pixels=True``, so pixel data is not
    loaded.

    Args:
        dicom_path: Path passed to ``pydicom.dcmread``; echoed back under the
            ``"File"`` key of the result.

    Returns:
        dict: On success, ``"File"`` followed by one entry per name in
        ``attribute_names`` (``None`` where the dataset lacks that attribute).
        If anything raises while reading, ``{"File": dicom_path, "Error":
        str(exc)}`` is returned instead, so callers never see the exception.
    """
    # Order matters: it becomes the key order of the returned record.
    attribute_names = (
        "PatientID",
        "StudyInstanceUID",
        "SeriesInstanceUID",
        "SOPInstanceUID",
        "Modality",
        "ImagePositionPatient",
        "ImageOrientationPatient",
        "PixelSpacing",
        "SliceThickness",
        "RescaleSlope",
        "RescaleIntercept",
        "Rows",
        "Columns",
        "SeriesDescription",
        "Manufacturer",
        "KVP",
        "ConvolutionKernel",
    )
    try:
        header = pydicom.dcmread(dicom_path, stop_before_pixels=True)
        record = {"File": dicom_path}
        for attribute in attribute_names:
            record[attribute] = getattr(header, attribute, None)
        return record
    except Exception as exc:
        # Best-effort: report the failure as data rather than aborting a scan.
        return {"File": dicom_path, "Error": str(exc)}

In [3]:
def collect_all_dicom_files(base_dirs):
    """Walk every directory in ``base_dirs`` and list candidate DICOM paths.

    A file counts as a candidate when its name ends in ``.dcm``
    (case-insensitive) or consists solely of digits.

    Args:
        base_dirs: Iterable of directory paths to walk recursively.

    Returns:
        list[str]: ``os.path.join(root, name)`` for each matching file, in
        the order the walk visits them.
    """

    def looks_like_dicom(name):
        # DICOMs often have no extension
        return name.isdigit() or name.lower().endswith(".dcm")

    return [
        os.path.join(folder, name)
        for top in base_dirs
        for folder, _, names in os.walk(top)
        for name in names
        if looks_like_dicom(name)
    ]

In [4]:
# Root folder that holds the organized DICOM datasets. Set the
# DICOM_DATA_ROOT environment variable to run this notebook on another
# machine without editing it; the fallback is the original local path.
DICOM_DATA_ROOT = (
    os.environ.get("DICOM_DATA_ROOT") or "/Users/joshuabunnell/Projects/data/dicom"
)

# Dataset folders expected directly under DICOM_DATA_ROOT.
DATASET_DIR_NAMES = [
    "ct-colonography_organized",
    "pediatric-ct-seg_organized",
]

base_dirs = [os.path.join(DICOM_DATA_ROOT, name) for name in DATASET_DIR_NAMES]
dicom_files = collect_all_dicom_files(base_dirs)

In [5]:
import random
import pandas as pd

print(f"Found {len(dicom_files)} DICOM files.")

# How many files to inspect per dataset, and a fixed seed so that re-running
# the notebook against the same files on disk draws the same sample.
SAMPLES_PER_DATASET = 5
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

# Take up to SAMPLES_PER_DATASET random files from each base directory and
# collect their metadata into one DataFrame per dataset.
dfs = {}
for base in base_dirs:
    # Match on "<base><sep>" rather than the bare string, so a directory whose
    # name merely begins with another base's name is not counted under it.
    base_prefix = os.path.join(base, "")
    # Sorted so the seeded sample does not depend on directory-walk order.
    files_in_base = sorted(f for f in dicom_files if f.startswith(base_prefix))
    if not files_in_base:
        continue
    sample_files = sample_rng.sample(
        files_in_base, min(SAMPLES_PER_DATASET, len(files_in_base))
    )
    sampled_metadata = []
    for meta in map(get_dicom_metadata, sample_files):
        # Truncate file path to last 3 components for clarity.
        # get_dicom_metadata sets "File" on both its success and error records.
        meta["File"] = os.path.join(*meta["File"].split(os.sep)[-3:])
        sampled_metadata.append(meta)
    df = pd.DataFrame(sampled_metadata)
    # normpath drops a trailing separator, which would otherwise make
    # basename return an empty dataset name.
    dfs[os.path.basename(os.path.normpath(base))] = df

for name, df in dfs.items():
    print(f"\nSampled metadata for dataset: {name}")
    display(df)

Found 13242 DICOM files.

Sampled metadata for dataset: ct-colonography_organized


Unnamed: 0,File,PatientID,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,Modality,ImagePositionPatient,ImageOrientationPatient,PixelSpacing,SliceThickness,RescaleSlope,RescaleIntercept,Rows,Columns,SeriesDescription,Manufacturer,KVP,ConvolutionKernel
0,1.3.6.1.4.1.9328.50.4.341506/1.3.6.1.4.1.9328....,1.3.6.1.4.1.9328.50.4.0305,1.3.6.1.4.1.9328.50.4.341506,1.3.6.1.4.1.9328.50.4.341515,1.3.6.1.4.1.9328.50.4.341644,CT,"[-248.53125, -414.53125, -228.9]","[1, 0, 0, 0, 1, 0]","[0.9375, 0.9375]",1.0,1.0,-1024.0,512,512,Colo_supine 1.0 B30f,SIEMENS,120.0,B30f
1,1.3.6.1.4.1.9328.50.4.1/1.3.6.1.4.1.9328.50.4....,1.3.6.1.4.1.9328.50.4.0001,1.3.6.1.4.1.9328.50.4.1,1.3.6.1.4.1.9328.50.4.2,1.3.6.1.4.1.9328.50.4.157,CT,"[-208.609375, -18.609375, -238.3]","[1, 0, 0, 0, 1, 0]","[0.78125, 0.78125]",1.0,1.0,-1024.0,512,512,Colo_prone 1.0 B30f,SIEMENS,120.0,B30f
2,1.3.6.1.4.1.9328.50.4.1240/1.3.6.1.4.1.9328.50...,1.3.6.1.4.1.9328.50.4.0002,1.3.6.1.4.1.9328.50.4.1240,1.3.6.1.4.1.9328.50.4.1840,1.3.6.1.4.1.9328.50.4.2517,CT,"[-211.609375, -389.609375, -508.9]","[1, 0, 0, 0, 1, 0]","[0.78125, 0.78125]",1.0,1.0,-1024.0,512,512,Colo_supine 1.0 B30f,SIEMENS,120.0,B30f
3,1.3.6.1.4.1.9328.50.81.87667144593503751221669...,CTC-1968343337,1.3.6.1.4.1.9328.50.81.87667144593503751221669...,1.3.6.1.4.1.9328.50.81.87253217214149181807842...,1.3.6.1.4.1.9328.50.81.10064940078828576701829...,CT,"[383.666015625, -179.666015625, -145.8]","[0, 1, 0, -1, 0, 0]","[0.66796875, 0.66796875]",1.0,1.0,-1024.0,512,512,,,120.0,B30f
4,1.3.6.1.4.1.9328.50.4.339223/1.3.6.1.4.1.9328....,1.3.6.1.4.1.9328.50.4.0303,1.3.6.1.4.1.9328.50.4.339223,1.3.6.1.4.1.9328.50.4.339748,1.3.6.1.4.1.9328.50.4.340202,CT,"[-129.73633, 278.73633, -438.4]","[1, 0, 0, 0, -1, 0]","[0.52734375, 0.52734375]",1.0,1.0,-1024.0,512,512,Prone colon 1.0 B30f,SIEMENS,120.0,B30f



Sampled metadata for dataset: pediatric-ct-seg_organized


Unnamed: 0,File,PatientID,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,Modality,ImagePositionPatient,ImageOrientationPatient,PixelSpacing,SliceThickness,RescaleSlope,RescaleIntercept,Rows,Columns,SeriesDescription,Manufacturer,KVP,ConvolutionKernel
0,1.3.6.1.4.1.14519.5.2.1.2583031495368705808827...,Pediatric-CT-SEG-394,1.3.6.1.4.1.14519.5.2.1.2583031495368705808827...,1.3.6.1.4.1.14519.5.2.1.3319039318585324098081...,1.3.6.1.4.1.14519.5.2.1.1746760277896235684094...,CT,"[-135.70703125, -271.70703125, -363.2]","[1, 0, 0, 0, 1, 0]","[0.5859375, 0.5859375]",2.0,1.0,-1000.0,512,512,CT,SIEMENS,100.0,I30f
1,1.3.6.1.4.1.14519.5.2.1.3377250684499427698990...,Pediatric-CT-SEG-397,1.3.6.1.4.1.14519.5.2.1.3377250684499427698990...,1.3.6.1.4.1.14519.5.2.1.3143136243027949157795...,1.3.6.1.4.1.14519.5.2.1.5653229184313099829479...,CT,"[-171.6875, -302.6875, -78.7]","[1, 0, 0, 0, 1, 0]","[0.625, 0.625]",2.0,1.0,-1000.0,512,512,CT,SIEMENS,100.0,I30f
2,1.3.6.1.4.1.14519.5.2.1.2583031495368705808827...,Pediatric-CT-SEG-394,1.3.6.1.4.1.14519.5.2.1.2583031495368705808827...,1.3.6.1.4.1.14519.5.2.1.3319039318585324098081...,1.3.6.1.4.1.14519.5.2.1.2703469485014379532668...,CT,"[-135.70703125, -271.70703125, -195.2]","[1, 0, 0, 0, 1, 0]","[0.5859375, 0.5859375]",2.0,1.0,-1000.0,512,512,CT,SIEMENS,100.0,I30f
3,1.3.6.1.4.1.14519.5.2.1.5919157327421464817677...,Pediatric-CT-SEG-FD1B2AC2,1.3.6.1.4.1.14519.5.2.1.5919157327421464817677...,1.3.6.1.4.1.14519.5.2.1.1521731863282576644426...,1.3.6.1.4.1.14519.5.2.1.7505369135900539670221...,CT,"[-202.9, -200, -159.4375]","[1, 0, 0, 0, 1, 0]","[0.78125, 0.78125]",2.0,1.0,-1000.0,512,512,CT,GE MEDICAL SYSTEMS,120.0,SOFT
4,1.3.6.1.4.1.14519.5.2.1.8859670961354389003042...,Pediatric-CT-SEG-00DCF4D6,1.3.6.1.4.1.14519.5.2.1.8859670961354389003042...,1.3.6.1.4.1.14519.5.2.1.6673411903393211051343...,1.3.6.1.4.1.14519.5.2.1.5183483660741018730723...,CT,"[-139.39, -150, -237.4495]","[1, 0, 0, 0, 1, 0]","[0.585938, 0.585938]",2.0,1.0,-1000.0,512,512,CT,GE MEDICAL SYSTEMS,100.0,STANDARD
