# Collect Information on Current Datasets

In [7]:
import os
import pydicom

In [8]:
def get_dicom_metadata(dicom_path):
    """Read one DICOM file's header and return a flat record of selected attributes.

    The file is read with ``stop_before_pixels=True``, so pixel data is not
    loaded.

    Args:
        dicom_path: Path of the file to read.

    Returns:
        dict: ``"File"`` (the path as given) followed by one entry per
        attribute name listed in ``attribute_names``; an attribute the
        dataset does not have is reported as ``None``. If anything raises
        while reading or extracting, the dict instead contains only
        ``"File"`` and ``"Error"`` (the exception's message).
    """
    attribute_names = (
        "PatientID",
        "StudyInstanceUID",
        "SeriesInstanceUID",
        "SOPInstanceUID",
        "Modality",
        "ImagePositionPatient",
        "ImageOrientationPatient",
        "PixelSpacing",
        "SliceThickness",
        "RescaleSlope",
        "RescaleIntercept",
        "Rows",
        "Columns",
        "SeriesDescription",
        "Manufacturer",
        "KVP",
        "ConvolutionKernel",
    )
    try:
        header = pydicom.dcmread(dicom_path, stop_before_pixels=True)
        record = {"File": dicom_path}
        for attribute in attribute_names:
            record[attribute] = getattr(header, attribute, None)
    except Exception as exc:
        # Best-effort: one unreadable file must not abort a bulk scan, so the
        # failure is reported as data rather than raised.
        return {"File": dicom_path, "Error": str(exc)}
    return record

In [9]:
def collect_all_dicom_files(base_dirs):
    """Recursively collect paths of likely DICOM files under the given directories.

    A file is kept when its name ends with ``.dcm`` (case-insensitive) or
    consists solely of digits (DICOMs often have no extension). File contents
    are not inspected.

    Args:
        base_dirs: Iterable of directory paths to walk. A single ``str`` or
            ``os.PathLike`` is also accepted and treated as one directory
            instead of being iterated character by character.

    Returns:
        list[str]: Matching paths, grouped by base directory in the order
        given. Within each base, sub-directories and file names are visited
        in sorted order, so the result does not depend on the filesystem's
        listing order. Directories that do not exist contribute nothing.
    """
    if isinstance(base_dirs, (str, os.PathLike)):
        base_dirs = [base_dirs]

    dicom_files = []
    for base in base_dirs:
        for root, dirs, files in os.walk(base):
            # Sorting in place steers os.walk's top-down traversal order.
            dirs.sort()
            dicom_files.extend(
                os.path.join(root, name)
                for name in sorted(files)
                if name.lower().endswith(".dcm") or name.isdigit()
            )
    return dicom_files

In [10]:
# Root directory that holds the organized DICOM datasets. Set the
# DICOM_DATA_DIR environment variable to point somewhere else; the default is
# the location this notebook was originally run against.
DICOM_DATA_DIR = os.environ.get(
    "DICOM_DATA_DIR", "/Users/joshuabunnell/Projects/data/dicom"
)

# One sub-directory per dataset.
base_dirs = [
    os.path.join(DICOM_DATA_DIR, "ct-colonography_organized"),
    os.path.join(DICOM_DATA_DIR, "pediatric-ct-seg_organized"),
]
dicom_files = collect_all_dicom_files(base_dirs)

In [11]:
import random
import pandas as pd

print(f"Found {len(dicom_files)} DICOM files.")

SAMPLES_PER_DATASET = 5
SAMPLE_SEED = 0  # fixed seed so re-running the notebook draws the same files

# Take a few random samples from each base directory and collect their
# metadata into separate DataFrames, keyed by the dataset directory name.
rng = random.Random(SAMPLE_SEED)
dfs = {}
for base in base_dirs:
    # Match on the directory path *with* a trailing separator so that a base
    # such as ".../ct" does not also claim files under ".../ct-2".
    base_prefix = os.path.join(base, "")
    # Sorted so the seeded draw does not depend on the order of dicom_files.
    files_in_base = sorted(f for f in dicom_files if f.startswith(base_prefix))
    if not files_in_base:
        continue
    sample_files = rng.sample(
        files_in_base, min(SAMPLES_PER_DATASET, len(files_in_base))
    )
    sampled_metadata = []
    for meta in map(get_dicom_metadata, sample_files):
        # Truncate file path to last 3 components for clarity.
        # get_dicom_metadata sets "File" on both its success and error records.
        meta["File"] = os.path.join(*meta["File"].split(os.sep)[-3:])
        sampled_metadata.append(meta)
    df = pd.DataFrame(sampled_metadata)
    # normpath drops a trailing separator, which would otherwise make
    # basename return an empty dataset name.
    dfs[os.path.basename(os.path.normpath(base))] = df

for name, df in dfs.items():
    print(f"\nSampled metadata for dataset: {name}")
    display(df)

Found 13242 DICOM files.

Sampled metadata for dataset: ct-colonography_organized


Unnamed: 0,File,PatientID,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,Modality,ImagePositionPatient,ImageOrientationPatient,PixelSpacing,SliceThickness,RescaleSlope,RescaleIntercept,Rows,Columns,SeriesDescription,Manufacturer,KVP,ConvolutionKernel
0,1.3.6.1.4.1.9328.50.4.773786/1.3.6.1.4.1.9328....,1.3.6.1.4.1.9328.50.4.0679,1.3.6.1.4.1.9328.50.4.773786,1.3.6.1.4.1.9328.50.4.773791,1.3.6.1.4.1.9328.50.4.774315,CT,"[207.100006, 185.000000, -60.540001]","[-1.000000, 0.000000, 0.000000, 0.000000, -1.0...","[0.722656, 0.722656]",1.25,1.0,-1024.0,512,512,Recon 2: ABD PEL WITHOUT - PRON,GE MEDICAL SYSTEMS,120.0,STANDARD
1,1.3.6.1.4.1.9328.50.4.773786/1.3.6.1.4.1.9328....,1.3.6.1.4.1.9328.50.4.0679,1.3.6.1.4.1.9328.50.4.773786,1.3.6.1.4.1.9328.50.4.774331,1.3.6.1.4.1.9328.50.4.774451,CT,"[-185.000000, -185.000000, -160.889999]","[1.000000, 0.000000, 0.000000, 0.000000, 1.000...","[0.722656, 0.722656]",1.25,1.0,-1024.0,512,512,Recon 2: ABD PEL WITHOUT - SUPI,GE MEDICAL SYSTEMS,120.0,STANDARD
2,1.3.6.1.4.1.9328.50.4.773786/1.3.6.1.4.1.9328....,1.3.6.1.4.1.9328.50.4.0679,1.3.6.1.4.1.9328.50.4.773786,1.3.6.1.4.1.9328.50.4.774331,1.3.6.1.4.1.9328.50.4.774726,CT,"[-185.000000, -185.000000, -359.290009]","[1.000000, 0.000000, 0.000000, 0.000000, 1.000...","[0.722656, 0.722656]",1.25,1.0,-1024.0,512,512,Recon 2: ABD PEL WITHOUT - SUPI,GE MEDICAL SYSTEMS,120.0,STANDARD
3,1.3.6.1.4.1.9328.50.4.339223/1.3.6.1.4.1.9328....,1.3.6.1.4.1.9328.50.4.0303,1.3.6.1.4.1.9328.50.4.339223,1.3.6.1.4.1.9328.50.4.339227,1.3.6.1.4.1.9328.50.4.339268,CT,"[-123.73633, -278.73633, -111]","[1, 0, 0, 0, 1, 0]","[0.52734375, 0.52734375]",1.0,1.0,-1024.0,512,512,Supine colon 1.0 B30f,SIEMENS,120.0,B30f
4,1.3.6.1.4.1.9328.50.81.87667144593503751221669...,CTC-1968343337,1.3.6.1.4.1.9328.50.81.87667144593503751221669...,1.3.6.1.4.1.9328.50.81.31641643207997362716333...,1.3.6.1.4.1.9328.50.81.17673619708685686812973...,CT,"[-180.1298828125, 402.1298828125, -407.9]","[1, 0, 0, 0, -1, 0]","[0.740234375, 0.740234375]",1.0,1.0,-1024.0,512,512,,,120.0,B30f



Sampled metadata for dataset: pediatric-ct-seg_organized


Unnamed: 0,File,PatientID,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,Modality,ImagePositionPatient,ImageOrientationPatient,PixelSpacing,SliceThickness,RescaleSlope,RescaleIntercept,Rows,Columns,SeriesDescription,Manufacturer,KVP,ConvolutionKernel
0,1.3.6.1.4.1.14519.5.2.1.3377250684499427698990...,Pediatric-CT-SEG-397,1.3.6.1.4.1.14519.5.2.1.3377250684499427698990...,1.3.6.1.4.1.14519.5.2.1.3143136243027949157795...,1.3.6.1.4.1.14519.5.2.1.1535428234208205111742...,CT,"[-171.6875, -302.6875, -268.7]","[1, 0, 0, 0, 1, 0]","[0.625, 0.625]",2.0,1.0,-1000.0,512,512,CT,SIEMENS,100.0,I30f
1,1.3.6.1.4.1.14519.5.2.1.5287192022683355187019...,Pediatric-CT-SEG-BC4D2ECE,1.3.6.1.4.1.14519.5.2.1.5287192022683355187019...,1.3.6.1.4.1.14519.5.2.1.1524623105381356432963...,1.3.6.1.4.1.14519.5.2.1.2489573939171919271422...,CT,"[-144.38, -130, -312.865]","[1, 0, 0, 0, 1, 0]","[0.507813, 0.507813]",2.0,1.0,-1000.0,512,512,CT,GE MEDICAL SYSTEMS,100.0,STANDARD
2,1.3.6.1.4.1.14519.5.2.1.8859670961354389003042...,Pediatric-CT-SEG-00DCF4D6,1.3.6.1.4.1.14519.5.2.1.8859670961354389003042...,1.3.6.1.4.1.14519.5.2.1.6673411903393211051343...,1.3.6.1.4.1.14519.5.2.1.3677209599855741853039...,CT,"[-139.39, -150, -35.4495]","[1, 0, 0, 0, 1, 0]","[0.585938, 0.585938]",2.0,1.0,-1000.0,512,512,CT,GE MEDICAL SYSTEMS,100.0,STANDARD
3,1.3.6.1.4.1.14519.5.2.1.8859670961354389003042...,Pediatric-CT-SEG-00DCF4D6,1.3.6.1.4.1.14519.5.2.1.8859670961354389003042...,1.3.6.1.4.1.14519.5.2.1.6673411903393211051343...,1.3.6.1.4.1.14519.5.2.1.5720088764502167539321...,CT,"[-139.39, -150, -207.4495]","[1, 0, 0, 0, 1, 0]","[0.585938, 0.585938]",2.0,1.0,-1000.0,512,512,CT,GE MEDICAL SYSTEMS,100.0,STANDARD
4,1.3.6.1.4.1.14519.5.2.1.7712982083719829568726...,Pediatric-CT-SEG-018B687C,1.3.6.1.4.1.14519.5.2.1.7712982083719829568726...,1.3.6.1.4.1.14519.5.2.1.1575047608341984692948...,1.3.6.1.4.1.14519.5.2.1.3171760166993391597338...,CT,"[-100.78515625, -209.78515625, -273.2]","[1, 0, 0, 0, 1, 0]","[0.4296875, 0.4296875]",2.0,1.0,-1000.0,512,512,CT,SIEMENS,100.0,I30f


In [15]:
import pydicom

# Inspect one specific file directly.
# Replace with the path to that single file
single_file_path = "/Users/joshuabunnell/Projects/data/dicom/pediatric-ct-seg_organized/1.3.6.1.4.1.14519.5.2.1.52871920226833551870199279497603326842/1.3.6.1.4.1.14519.5.2.1.85241121033233055628533208493701153817/1.3.6.1.4.1.14519.5.2.1.156225914409054890865164949571079791260.dcm"
# Only header attributes are printed, so skip the pixel data (same read mode
# that get_dicom_metadata uses).
ds = pydicom.dcmread(single_file_path, stop_before_pixels=True)
# Either attribute can be absent from a file (one sampled row above has no
# SeriesDescription), so fall back to None instead of raising AttributeError.
print(f"Modality: {getattr(ds, 'Modality', None)}")
print(f"Description: {getattr(ds, 'SeriesDescription', None)}")

Modality: RTSTRUCT
Description: RTSTRUCT
