In [None]:
###Author: Saeed Arbabi 12/1/2019
### this code aims to
# DONE 1- give me an overview of all the DICOM files I have in a folder and its subfolders, organized in a CSV file
# DONE 2- relocate files and folders in the hierarchy like: project_name/modality/patient_num/acquisition_date/[numslices]_time_seq_origfoldernum/
# TODO 3- do some pre-processing making them ready for the next steps like:
    # 3.1- inhomogeneity correction
    # 3.2- normalization
    # 3.3- bias field correction
#set DATA_DIR below to your main DICOM data folder (it is hard-coded here rather than read from a command line argument)

import csv
import glob
import os
import shutil
import sys

import numpy as np
import SimpleITK as sitk

# Name of the output project folder; the relocation step creates it inside DATA_DIR.
PRJ_NAME = "TOFAPredict-20210319"
# Root folder that is scanned recursively for DICOM files. The relocated copy
# (DATA_DIR/PRJ_NAME/...) and the CSV inventory are written below it as well.
DATA_DIR = "D:/TOFA/data_base/sarbabi-20210319_105957"

In [None]:
###step1: find all the dicom series in the data directory and subdirectories
def find_dicom_folders(data_dir, skip_dir=None):
    """Count the ``.dcm`` files in every folder below ``data_dir``.

    Args:
        data_dir: root folder that is walked recursively.
        skip_dir: optional folder whose whole subtree is ignored.

    Returns:
        dict mapping every folder that directly contains at least one file
        whose name ends in ``.dcm`` to the number of such files. Each folder
        is printed once, in walk order.
    """
    skip_key = os.path.normcase(os.path.abspath(skip_dir)) if skip_dir else None
    found = {}
    for root, dirs, files in os.walk(data_dir):
        if skip_key is not None:
            # Prune in place so os.walk never descends into the skipped tree.
            dirs[:] = [
                d for d in dirs
                if os.path.normcase(os.path.abspath(os.path.join(root, d))) != skip_key
            ]
        dcm_count = sum(1 for name in files if name.endswith(".dcm"))
        if dcm_count:
            print(root)
            found[root] = dcm_count
    return found


print(f"folders containing DICOM images in folder: {DATA_DIR}")
# The relocation step copies every series into DATA_DIR/PRJ_NAME. Skipping that
# tree keeps a re-run from inventorying (and copying again) its own output.
dicom_folders = find_dicom_folders(DATA_DIR, skip_dir=os.path.join(DATA_DIR, PRJ_NAME))

In [None]:
###step2: read each dicom serie and extract info about it from dicom tags and save in CSV file
# DICOM tags read from one file of every series, in CSV column order.
tags_to_copy = ["0010|0010",  # Patient Name
                "0008|0020",  # Study Date
                "0008|0030",  # Study Time
                "0008|0060",  # Modality
                "0018|0015",  # Body Part Examined
                "0008|103e",  # Series Description
                "0008|0008",  # Image Type
                "0018|0086",  # Echo Number
                "0020|000e",  # Series Instance UID
                "0028|0030",  # Pixel Spacing
                "0018|0088",  # Spacing Between Slices
                "0018|0050",  # Slice Thickness
                "0028|0010",  # Rows
                "0028|0011",  # Columns
                "0018|1314",  # Flip Angle
                "0018|0081",  # Echo Time
                "0018|0080"   # Repetition Time
                ]
# CSV header row. Pixel Spacing is split into X and Y, so the 17 tags fill 18
# columns; numSlices and dicomFolder are appended for every series.
column_names = ["patientId", "studyDate", "studyTime", "modality", "bodyPart", "seriesDescription", "imageType", "echoNumber",
                "seriesInstanceUID", "pixelSpacingX", "pixelSpacingY", "spacingBetweenSlices", "sliceThickness", "rows", "columns",
                "flipAngle", "echoTime", "repetitionTime", "numSlices", "dicomFolder"]

MODALITY_TAG = "0008|0060"
PIXEL_SPACING_TAG = "0028|0030"
SLICE_SPACING_TAG = "0018|0088"
IMAGE_POSITION_TAG = "0020|0032"   # Image Position (Patient): x\y\z
SLICE_LOCATION_TAG = "0020|1041"   # Slice Location: a single number
MISSING = "-"                      # placeholder written for absent values


def _read_dicom_header(path):
    """Return an ImageFileReader holding the header of ``path``, or None if it cannot be read."""
    header = sitk.ImageFileReader()
    header.SetFileName(path)
    header.LoadPrivateTagsOn()
    try:
        header.ReadImageInformation()
    except Exception as err:  # best effort: report and skip, but let Ctrl-C through
        print(f"could not read DICOM header of {path}: {err}")
        return None
    return header


def _slice_gap(first_header, second_header):
    """Return the absolute position difference of two slices as a string.

    Uses the z component of Image Position (Patient) when both headers have
    it, otherwise Slice Location. Returns None when neither tag is usable.
    """
    first_keys = first_header.GetMetaDataKeys()
    second_keys = second_header.GetMetaDataKeys()
    try:
        if IMAGE_POSITION_TAG in first_keys and IMAGE_POSITION_TAG in second_keys:
            pos0 = float(first_header.GetMetaData(IMAGE_POSITION_TAG).split("\\")[2])
            pos1 = float(second_header.GetMetaData(IMAGE_POSITION_TAG).split("\\")[2])
        elif SLICE_LOCATION_TAG in first_keys and SLICE_LOCATION_TAG in second_keys:
            # Slice Location holds one value, so there is nothing to split.
            pos0 = float(first_header.GetMetaData(SLICE_LOCATION_TAG))
            pos1 = float(second_header.GetMetaData(SLICE_LOCATION_TAG))
        else:
            return None
    except (ValueError, IndexError):
        return None
    # The two files are not guaranteed to be in acquisition order, so only the
    # magnitude of the difference is meaningful.
    return str(abs(pos1 - pos0))


def _path_part(value):
    """Turn ``value`` into a string that is safe as one folder-name component."""
    text = str(value).strip()
    cleaned = "".join("_" if ch in '<>:"/\\|?*' else ch for ch in text)
    return cleaned or MISSING


def _collect_tag_values(header, slice_files, num_slices):
    """Return the 18 tag columns (see ``column_names``) for one series.

    Args:
        header: ImageFileReader whose image information was already read.
        slice_files: sorted list of the ``.dcm`` files of the series.
        num_slices: number of ``.dcm`` files counted for the series.
    """
    present = header.GetMetaDataKeys()
    values = []
    modality = None
    #####TODO: in the process, generate spacing between slices according to orientation and position
    for tag in tags_to_copy:
        raw = header.GetMetaData(tag) if tag in present else None
        if tag == MODALITY_TAG:
            modality = raw
        if tag == PIXEL_SPACING_TAG:
            # pixel spacing is stored as "x\y" and goes into two columns
            parts = raw.split("\\") if raw is not None else []
            values.append(parts[0] if len(parts) > 0 else MISSING)
            values.append(parts[1] if len(parts) > 1 else MISSING)
        elif tag == SLICE_SPACING_TAG and modality == "CT" and num_slices > 1:
            # For CT the spacing is derived from the positions of two slices;
            # the stored tag (if any) is only the fallback.
            gap = None
            if len(slice_files) > 1:
                second_header = _read_dicom_header(slice_files[1])
                if second_header is not None:
                    gap = _slice_gap(header, second_header)
            if gap is None:
                gap = raw if raw is not None else MISSING
            values.append(gap)
        else:
            values.append(raw if raw is not None else MISSING)
    return values


inventory_rows = [column_names]
dfnum = 0
for dicom_folder, num_slices in dicom_folders.items():
    # glob.escape: folder names may contain glob metacharacters such as "[".
    # sorted: makes the choice of the inspected files deterministic.
    slice_files = sorted(glob.glob(os.path.join(glob.escape(dicom_folder), "*.dcm")))
    if not slice_files:
        print(f"no .dcm file matched in {dicom_folder}, skipping")
        continue

    reader = _read_dicom_header(slice_files[0])
    if reader is None:
        continue

    row = _collect_tag_values(reader, slice_files, num_slices) + [str(num_slices), dicom_folder]
    inventory_rows.append(row)
    series = dict(zip(column_names, row))

    ###relocates data to new folders accordingly
    original_location = "_".join(dicom_folder.split(os.path.sep)[-2:])
    leaf_name = (
        f"[{num_slices}]_{series['studyTime']}_{series['seriesDescription'].replace('/', '')}"
        f"_{series['bodyPart']}_{original_location}_{dfnum}"
    )
    destination_folder = os.path.join(
        DATA_DIR,
        PRJ_NAME,
        _path_part(series["modality"]),
        _path_part(series["patientId"]),
        _path_part(series["studyDate"]),
        _path_part(leaf_name),
    )
    if os.path.exists(destination_folder):
        # copytree refuses to write into an existing folder; keep what is there.
        print(f"destination {destination_folder} already exists, not copying {dicom_folder} again")
    else:
        shutil.copytree(dicom_folder, destination_folder)
        print(f"copied folder {dicom_folder} to folder: {destination_folder}")
    dfnum += 1
    print(f"{dfnum*100/len(dicom_folders)} percent Done!")

# Same layout as before: a 2-D string array whose first row is the header.
data_array = np.array(inventory_rows, dtype=str)
    

In [None]:
# Write the inventory with the csv module instead of np.savetxt: savetxt does
# no quoting, so a value containing a comma (a series description, a folder
# path) would shift every following column of its row. UTF-8 is used so that
# characters outside latin1 can be written as well.
inventory_path = os.path.join(DATA_DIR, "TOFAPredict-data_inventory.csv")
with open(inventory_path, "w", newline="", encoding="utf-8") as inventory_file:
    # lineterminator=os.linesep keeps the platform line endings the text-mode
    # writer produced before.
    inventory_writer = csv.writer(inventory_file, lineterminator=os.linesep)
    inventory_writer.writerows([str(cell) for cell in row] for row in data_array)