REMBRANDT Preprocessing

In [None]:
import os
import random
import torch
import pydicom
from PIL import Image
from torchvision import transforms

#Image preprocessing pipeline: resize to 224x224, convert to a tensor, then normalize each of the three channels.
#NOTE(review): the mean/std values match the ImageNet statistics commonly used with torchvision models - confirm this is the intended normalization.
TransformImagesREM = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)
#Finds all DICOM files under the REMBRANDT directory, optionally shuffling them and limiting the file count.
def Collect_Dicoms(path, max_files=None, random_shuffle=True):
    """Return the paths of all ``.dcm`` files found recursively under ``path``.

    Args:
        path: Root directory to walk.
        max_files: Maximum number of paths to return. ``None`` means no limit;
            ``0`` returns an empty list.
        random_shuffle: When true, shuffle the paths (using the global
            ``random`` state) before the limit is applied.

    Returns:
        A list of file paths. Without shuffling the list is sorted.
    """
    #Match the extension case-insensitively so files named "*.DCM" are not skipped.
    #Sorting first makes the result independent of the filesystem's walk order,
    #so a seeded shuffle (or no shuffle) is reproducible.
    DCMFiles = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(path)
        for file in files
        if file.lower().endswith('.dcm')
    )
    if random_shuffle:
        random.shuffle(DCMFiles)
    #Compare against None explicitly: a limit of 0 must not mean "no limit".
    if max_files is None:
        return DCMFiles
    return DCMFiles[:max_files]

#Converts one DICOM file to an image tensor and returns it with the file's patient ID.
def Convert_Dicoms(filepath):
    """Read a DICOM file and return ``(TensorImage, PathID)``.

    Args:
        filepath: Path to the DICOM file.

    Returns:
        A tuple of the image after ``TransformImagesREM`` has been applied and
        the dataset's ``PatientID`` as a whitespace-stripped string.

    Raises:
        Whatever pydicom, PIL, or the transform raise for unreadable or
        unsupported files (for example, a dataset without ``PatientID``).
    """
    DCM = pydicom.dcmread(filepath)
    Pixels = DCM.pixel_array
    #Pixel data that is not already 8-bit (16-bit is common for MR) must be
    #rescaled to 0-255 first: PIL's conversion to "RGB" does not rescale wider
    #integer data, so large intensities would saturate or the conversion fail.
    if Pixels.dtype != "uint8":
        Pixels = Pixels.astype("float32")
        Low, High = float(Pixels.min()), float(Pixels.max())
        if High > Low:
            Pixels = (Pixels - Low) * (255.0 / (High - Low))
        else:
            #Constant image: avoid dividing by zero and map everything to 0.
            Pixels = Pixels * 0.0
        Pixels = Pixels.astype("uint8")
    ConvertDCM = Image.fromarray(Pixels).convert("RGB")
    TensorImage = TransformImagesREM(ConvertDCM)
    PathID = str(DCM.PatientID).strip()
    return TensorImage, PathID

#Processes the DICOM files in batches, saving each batch's tensors and patient IDs and logging files that fail to convert.
def Process_Batches(path, output_path, batch_size=1000, max_batches=10, random_shuffle=True):
    """Convert the DICOM files under ``path`` and save them batch by batch.

    For every batch, the successfully converted tensors are stacked and saved
    to ``batch_<n>.pt`` and the matching patient IDs to ``batch_<n>_ids.pt``
    inside ``output_path``. Files that raise during conversion are counted and
    appended to ``error_log.txt`` in the current working directory.

    Args:
        path: Root directory passed to ``Collect_Dicoms``.
        output_path: Directory for the saved batches; created if missing.
        batch_size: Number of files per batch; must be at least 1.
        max_batches: Maximum number of batches to process. A falsy value
            (``None`` or ``0``) removes the limit.
        random_shuffle: Forwarded to ``Collect_Dicoms``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    os.makedirs(output_path, exist_ok=True)
    BatchLimit = batch_size * max_batches if max_batches else None
    DicomPaths = Collect_Dicoms(path, max_files=BatchLimit, random_shuffle=random_shuffle)
    #SavedBatches is tracked separately from BatchIndex so the summary is
    #correct when no files are found or when every file in a batch fails.
    Total, TotalErrors, SavedBatches = 0, 0, 0
    for BatchIndex, i in enumerate(range(0, len(DicomPaths), batch_size), start=1):
        BatchPaths = DicomPaths[i:i+batch_size]
        FinalTensors = []
        FinalIDs = []
        print(f"Processing batch {BatchIndex} ({len(BatchPaths)} files)")
        #Convert each file in the batch; a failing file is logged and skipped
        #so one unreadable DICOM does not abort the whole run.
        for DicomPath in BatchPaths:
            try:
                TensorImage, PathID = Convert_Dicoms(DicomPath)
                FinalTensors.append(TensorImage)
                FinalIDs.append(PathID)
            except Exception as e:
                TotalErrors += 1
                with open("error_log.txt", "a", encoding="utf-8") as w:
                    w.write(f"{DicomPath} - {str(e)}\n")
        #Save the batch tensors alongside the matching patient IDs (same order).
        if FinalTensors:
            torch.save(torch.stack(FinalTensors), os.path.join(output_path, f"batch_{BatchIndex}.pt"))
            torch.save(FinalIDs, os.path.join(output_path, f"batch_{BatchIndex}_ids.pt"))
            Total += len(FinalTensors)
            SavedBatches += 1
    print(f"Done. Saved {Total} tensors in {SavedBatches} batches. Total Errors: {TotalErrors}")
    
#Dataset input/output locations, followed by the preprocessing run itself.
REMDirectory = r"C:\Users\hgood\OneDrive\REMBRANDT"
ProcessedTensorDir = r"C:\Users\hgood\OneDrive\Processed_REMBRANDT"
Process_Batches(
    path=REMDirectory,
    output_path=ProcessedTensorDir,
    batch_size=1000,
)