# Collect Information on Current Datasets

In [1]:
import os
import pydicom

In [2]:
def get_dicom_metadata(dicom_path):
    """Read one DICOM file and return a flat dict of selected header tags.

    The file is opened with ``pydicom.dcmread(..., stop_before_pixels=True)``.

    Args:
        dicom_path: Path of the file to read; echoed back under the "File" key.

    Returns:
        dict: On success, "File" plus one entry per tag listed below, where a
        tag missing from the dataset maps to ``None``. If anything raises
        while reading, ``{"File": dicom_path, "Error": <message>}`` instead.
    """
    # Attribute names looked up on the dataset, in output column order.
    tag_names = (
        "PatientID",
        "StudyInstanceUID",
        "SeriesInstanceUID",
        "SOPInstanceUID",
        "Modality",
        "ImagePositionPatient",
        "ImageOrientationPatient",
        "PixelSpacing",
        "SliceThickness",
        "RescaleSlope",
        "RescaleIntercept",
        "Rows",
        "Columns",
        "SeriesDescription",
        "Manufacturer",
        "KVP",
        "ConvolutionKernel",
    )
    try:
        header = pydicom.dcmread(dicom_path, stop_before_pixels=True)
        record = {"File": dicom_path}
        for tag in tag_names:
            record[tag] = getattr(header, tag, None)
        return record
    except Exception as err:
        # Best-effort: one unreadable file is reported in its row, not raised.
        return {"File": dicom_path, "Error": str(err)}

In [3]:
def collect_all_dicom_files(base_dirs):
    """Recursively gather candidate DICOM file paths under the given directories.

    A file is kept when its name ends with ".dcm" (case-insensitive) or
    consists solely of digits, since DICOM files often have no extension.

    Args:
        base_dirs: An iterable of directory paths, or a single path given as
            ``str`` / ``os.PathLike``.

    Returns:
        list[str]: Matching paths (walk root joined with the file name), grouped
        by base directory in the order given. Within each base, directories
        are visited and files listed in sorted name order, so repeated runs
        over the same tree return the same list. A base directory that cannot
        be listed contributes nothing (``os.walk`` ignores listing errors by
        default).
    """
    # A bare string would otherwise be iterated character by character, which
    # would walk "/" and friends instead of the intended directory.
    if isinstance(base_dirs, (str, os.PathLike)):
        base_dirs = [base_dirs]

    dicom_files = []
    for base in base_dirs:
        for root, dirs, files in os.walk(base):
            # Sorting in place controls the order in which os.walk descends.
            dirs.sort()
            for name in sorted(files):
                if name.lower().endswith(".dcm") or name.isdigit():
                    dicom_files.append(os.path.join(root, name))
    return dicom_files

In [4]:
# Root folder that holds the organized datasets. Set the DICOM_DATA_ROOT
# environment variable to run against another location; the default is the
# path this notebook was originally written for.
DATA_ROOT = os.environ.get(
    "DICOM_DATA_ROOT", "/Users/joshuabunnell/Projects/data/dicom"
)

# One directory per dataset to scan.
base_dirs = [
    os.path.join(DATA_ROOT, "ct-colonography_organized"),
    os.path.join(DATA_ROOT, "pediatric-ct-seg_organized"),
]
dicom_files = collect_all_dicom_files(base_dirs)

In [5]:
import random
import pandas as pd

print(f"Found {len(dicom_files)} DICOM files.")

# Take up to SAMPLES_PER_DATASET random files from each base directory and
# collect their metadata into one DataFrame per dataset.
SAMPLES_PER_DATASET = 5
# Fixed seed: re-running draws the same files as long as dicom_files has the
# same contents in the same order.
SAMPLE_SEED = 42
sampler = random.Random(SAMPLE_SEED)

dfs = {}
for base in base_dirs:
    # Match on "<base><separator>" so a sibling directory whose name merely
    # starts with the same characters is not counted as part of this dataset.
    base_prefix = os.path.join(base, "")
    files_in_base = [f for f in dicom_files if f.startswith(base_prefix)]
    if not files_in_base:
        continue
    sample_files = sampler.sample(
        files_in_base, min(SAMPLES_PER_DATASET, len(files_in_base))
    )
    sampled_metadata = []
    for path in sample_files:
        meta = get_dicom_metadata(path)
        # Truncate file path to last 3 components for clarity
        meta["File"] = os.path.join(*path.split(os.sep)[-3:])
        sampled_metadata.append(meta)
    df = pd.DataFrame(sampled_metadata)
    # normpath drops a trailing separator, which would otherwise make
    # basename return an empty dataset name.
    dfs[os.path.basename(os.path.normpath(base))] = df

for name, df in dfs.items():
    print(f"\nSampled metadata for dataset: {name}")
    display(df)

Found 13242 DICOM files.

Sampled metadata for dataset: ct-colonography_organized


Unnamed: 0,File,PatientID,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,Modality,ImagePositionPatient,ImageOrientationPatient,PixelSpacing,SliceThickness,RescaleSlope,RescaleIntercept,Rows,Columns,SeriesDescription,Manufacturer,KVP,ConvolutionKernel
0,1.3.6.1.4.1.9328.50.4.1/1.3.6.1.4.1.9328.50.4....,1.3.6.1.4.1.9328.50.4.0001,1.3.6.1.4.1.9328.50.4.1,1.3.6.1.4.1.9328.50.4.2,1.3.6.1.4.1.9328.50.4.514,CT,"[-208.609375, -18.609375, -354.3]","[1, 0, 0, 0, 1, 0]","[0.78125, 0.78125]",1.0,1.0,-1024.0,512,512,Colo_prone 1.0 B30f,SIEMENS,120.0,B30f
1,1.3.6.1.4.1.9328.50.4.1240/1.3.6.1.4.1.9328.50...,1.3.6.1.4.1.9328.50.4.0002,1.3.6.1.4.1.9328.50.4.1240,1.3.6.1.4.1.9328.50.4.1241,1.3.6.1.4.1.9328.50.4.1388,CT,"[-200.609375, -34.609375, -157.5]","[1, 0, 0, 0, 1, 0]","[0.78125, 0.78125]",1.0,1.0,-1024.0,512,512,Colo_prone 1.0 B30f,SIEMENS,120.0,B30f
2,1.3.6.1.4.1.9328.50.4.341506/1.3.6.1.4.1.9328....,1.3.6.1.4.1.9328.50.4.0305,1.3.6.1.4.1.9328.50.4.341506,1.3.6.1.4.1.9328.50.4.341515,1.3.6.1.4.1.9328.50.4.341872,CT,"[-248.53125, -414.53125, -411.3]","[1, 0, 0, 0, 1, 0]","[0.9375, 0.9375]",1.0,1.0,-1024.0,512,512,Colo_supine 1.0 B30f,SIEMENS,120.0,B30f
3,1.3.6.1.4.1.9328.50.4.1240/1.3.6.1.4.1.9328.50...,1.3.6.1.4.1.9328.50.4.0002,1.3.6.1.4.1.9328.50.4.1240,1.3.6.1.4.1.9328.50.4.1241,1.3.6.1.4.1.9328.50.4.1550,CT,"[-200.609375, -34.609375, -287.1]","[1, 0, 0, 0, 1, 0]","[0.78125, 0.78125]",1.0,1.0,-1024.0,512,512,Colo_prone 1.0 B30f,SIEMENS,120.0,B30f
4,1.3.6.1.4.1.9328.50.4.339223/1.3.6.1.4.1.9328....,1.3.6.1.4.1.9328.50.4.0303,1.3.6.1.4.1.9328.50.4.339223,1.3.6.1.4.1.9328.50.4.339227,1.3.6.1.4.1.9328.50.4.339405,CT,"[-123.73633, -278.73633, -220.6]","[1, 0, 0, 0, 1, 0]","[0.52734375, 0.52734375]",1.0,1.0,-1024.0,512,512,Supine colon 1.0 B30f,SIEMENS,120.0,B30f



Sampled metadata for dataset: pediatric-ct-seg_organized


Unnamed: 0,File,PatientID,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,Modality,ImagePositionPatient,ImageOrientationPatient,PixelSpacing,SliceThickness,RescaleSlope,RescaleIntercept,Rows,Columns,SeriesDescription,Manufacturer,KVP,ConvolutionKernel
0,1.3.6.1.4.1.14519.5.2.1.2583031495368705808827...,Pediatric-CT-SEG-394,1.3.6.1.4.1.14519.5.2.1.2583031495368705808827...,1.3.6.1.4.1.14519.5.2.1.3319039318585324098081...,1.3.6.1.4.1.14519.5.2.1.1216882943656312266810...,CT,"[-135.70703125, -271.70703125, -429.2]","[1, 0, 0, 0, 1, 0]","[0.5859375, 0.5859375]",2.0,1.0,-1000.0,512,512,CT,SIEMENS,100.0,I30f
1,1.3.6.1.4.1.14519.5.2.1.5287192022683355187019...,Pediatric-CT-SEG-BC4D2ECE,1.3.6.1.4.1.14519.5.2.1.5287192022683355187019...,1.3.6.1.4.1.14519.5.2.1.1524623105381356432963...,1.3.6.1.4.1.14519.5.2.1.2639074103164458347481...,CT,"[-144.38, -130, -182.865]","[1, 0, 0, 0, 1, 0]","[0.507813, 0.507813]",2.0,1.0,-1000.0,512,512,CT,GE MEDICAL SYSTEMS,100.0,STANDARD
2,1.3.6.1.4.1.14519.5.2.1.5287192022683355187019...,Pediatric-CT-SEG-BC4D2ECE,1.3.6.1.4.1.14519.5.2.1.5287192022683355187019...,1.3.6.1.4.1.14519.5.2.1.1524623105381356432963...,1.3.6.1.4.1.14519.5.2.1.1314511263585050998612...,CT,"[-144.38, -130, -172.865]","[1, 0, 0, 0, 1, 0]","[0.507813, 0.507813]",2.0,1.0,-1000.0,512,512,CT,GE MEDICAL SYSTEMS,100.0,STANDARD
3,1.3.6.1.4.1.14519.5.2.1.7712982083719829568726...,Pediatric-CT-SEG-018B687C,1.3.6.1.4.1.14519.5.2.1.7712982083719829568726...,1.3.6.1.4.1.14519.5.2.1.1575047608341984692948...,1.3.6.1.4.1.14519.5.2.1.6435231897729957590120...,CT,"[-100.78515625, -209.78515625, -333.2]","[1, 0, 0, 0, 1, 0]","[0.4296875, 0.4296875]",2.0,1.0,-1000.0,512,512,CT,SIEMENS,100.0,I30f
4,1.3.6.1.4.1.14519.5.2.1.5919157327421464817677...,Pediatric-CT-SEG-FD1B2AC2,1.3.6.1.4.1.14519.5.2.1.5919157327421464817677...,1.3.6.1.4.1.14519.5.2.1.1521731863282576644426...,1.3.6.1.4.1.14519.5.2.1.2304779719404174710914...,CT,"[-202.9, -200, -129.4375]","[1, 0, 0, 0, 1, 0]","[0.78125, 0.78125]",2.0,1.0,-1000.0,512,512,CT,GE MEDICAL SYSTEMS,120.0,SOFT


In [6]:
import pydicom

# Spot-check a single file from the pediatric-ct-seg dataset; replace the path
# below to inspect a different file.
single_file_path = "/Users/joshuabunnell/Projects/data/dicom/pediatric-ct-seg_organized/1.3.6.1.4.1.14519.5.2.1.52871920226833551870199279497603326842/1.3.6.1.4.1.14519.5.2.1.85241121033233055628533208493701153817/1.3.6.1.4.1.14519.5.2.1.156225914409054890865164949571079791260.dcm"
ds = pydicom.dcmread(single_file_path)
# getattr with a default mirrors get_dicom_metadata: a file lacking one of
# these tags prints None instead of raising AttributeError.
print(f"Modality: {getattr(ds, 'Modality', None)}")
print(f"Description: {getattr(ds, 'SeriesDescription', None)}")

Modality: RTSTRUCT
Description: RTSTRUCT
