# Anonymize all DICOM files using dicognito

In [51]:
from pathlib import Path
from datetime import date
import pydicom
import dicognito.anonymizer
import os
from fnmatch import fnmatch
import time
from IPython.display import clear_output
import re

# Single dicognito Anonymizer instance, created once here and reused by the
# anonymization cells further down (they call `anonymizer.anonymize(...)`).
anonymizer = dicognito.anonymizer.Anonymizer()

Let's start by getting a master list of all files in the AHS network research folder.

In [31]:
def getListOfFiles(dirName):
    """Recursively collect the paths of every file below ``dirName``.

    Directories are descended into depth-first; every non-directory entry is
    returned as ``os.path.join(dirName, <entry name>)``, so the separator style
    of ``dirName`` is preserved in the result.

    Side effects:
        * Increments the module-level ``glob_counter`` once per file found
          (the counter is created at 0 if it does not exist yet).
        * Prints a one-line progress message per file; once more than 100
          files have been seen the cell output is cleared before each print
          so the notebook is not flooded.

    Args:
        dirName: Path of the directory to scan.

    Returns:
        list[str]: Paths of all files in the directory tree (empty list for an
        empty directory).

    Raises:
        OSError: If ``dirName`` cannot be listed (missing, not a directory,
            permission denied).
    """
    global glob_counter
    # Make the function usable even when the caller forgot to initialise the
    # progress counter (previously this raised NameError on the first file).
    globals().setdefault('glob_counter', 0)

    # os.scandir returns the file-type information together with the name, which
    # avoids one extra stat() round trip per entry -- significant on a network
    # share. Materialise the listing so the directory handle is closed before
    # recursing.
    with os.scandir(dirName) as iterator:
        entries = list(iterator)

    allFiles = []
    for entry in entries:
        # Create full path
        fullPath = os.path.join(dirName, entry.name)
        try:
            isDirectory = entry.is_dir()
        except OSError:
            # Mirror os.path.isdir(), which reports False instead of raising.
            isDirectory = False

        if isDirectory:
            # extend() appends in place; the former `allFiles = allFiles + ...`
            # copied the whole list at every level of the recursion.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)
            glob_counter = glob_counter + 1
            if glob_counter > 100:
                clear_output(wait=True)
            print('File:', fullPath, 'Total: ', glob_counter, end='\r')

    return allFiles

#Data Folder
# NOTE(review): both locations are machine-specific absolute paths (a mapped
# network drive and a user profile directory); consider moving them to a single
# configuration cell / environment variable so the notebook runs elsewhere.
rootFolder = 'L:/SRS MRI Study/RAW_DCM/'
outputFolder = 'C:/Users/josephmadamesila/Documents/Projects/mlmet/0_data/ANONYMIZED/'

#Get a list of all patient folders (one sub-directory of rootFolder per patient).
# Only directories are kept: a stray file in the root (e.g. Thumbs.db) would
# otherwise make the recursive listing fail with NotADirectoryError.
arrPx = [
    rootFolder + px + "/"
    for px in sorted(os.listdir(rootFolder))
    if os.path.isdir(rootFolder + px)
]

#Get a list of all the files: `files` ends up as one list of paths per patient.
files = []
glob_counter = 0  # progress counter incremented inside getListOfFiles
start = time.time()
for px in arrPx:
    files.append(getListOfFiles(px))
end = time.time()  # start/end are reused by the runtime cell below

1744.147943496704Study/RAW_DCM/Patient 16/MRI Images\2020-07-17\MR.1.2.840.113619.2.374.10504220.9724250.23421.1595008270.285.dcm Total:  44588


In [46]:
# Persist the master list of DICOM paths (one path per line) so the slow
# network scan does not have to be repeated.
output_file = '../2_pipeline/00_anonymizer/out/TO_ANONYMIZE.txt'

# Create the target directory on first use; open(..., 'w') does not do this.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, 'w') as filehandle:
    for px in files:
        for listitem in px:
            # Case-insensitive extension check so that "*.DCM" files are not
            # silently left out of the anonymization list.
            if listitem.lower().endswith(".dcm"):
                filehandle.write('%s\n' % listitem)
            else:
                # Show every skipped (non-DICOM) file for manual review.
                print(listitem)

L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\LCer2Met1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\LCer2Met1903.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\LCerMet1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\LOccMet1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\MBrsMet1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\RCer2Met1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\RCerMet1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\RFrt2Met1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\RFrtMet1807.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\RFrtMet1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 02/Met Screenshots\RTmpMet1811.PNG
L:/SRS MRI Study/RAW_DCM/Patient 05/Met Screenshots\RFrt1Met1801.PNG
L:/SRS MRI Study/RAW_DCM/Patient 05/Met Screenshots\RFrt2Met1801.PNG
L:/SRS MRI Study/RAW_DCM/Patient 05/Thumbs.db


In [42]:
#Runtime
# `start` and `end` are the time.time() stamps taken around the file search;
# their difference is in seconds and is reported here in minutes.
print("Searching took {} minutes!".format(round((end - start) / 60, 2)))

Searching took 29.07 minutes!


In [47]:
#Read the anonymization list (one DICOM path per line) into a new list
anonymizer_file = '../2_pipeline/00_anonymizer/out/TO_ANONYMIZE.txt'

#define an empty list
dcms = []

#open file and read the content in a list
with open(anonymizer_file, 'r') as filehandle:
    for line in filehandle:
        # Strip only the line terminator. Slicing with line[:-1] would cut off
        # a real character if the last line has no trailing newline.
        currentPlace = line.rstrip('\n')

        # Ignore blank lines so no empty path is queued for anonymization.
        if currentPlace:
            dcms.append(currentPlace)

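Now that we have a master list of the files that need to be anonymized, let's begin bulk anonymization. I'm using dicognito to remove all patient identifiers. Notes:
- The acquisition date remains.
- The patient's birth date is also written back after anonymization, so it remains as well.
- Patient names are removed and replaced with the anonymized patient label taken from the folder name (e.g. "Patient 16"); the patient ID is set to the patient number.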

In [265]:
#Data Folder
outputFolder = '../0_data/ANONYMIZED/'

#Read a dcm file path, extract patient number, anonymize, and save in the new ANONYMIZED folder.
i = 0             # number of files written so far (also the fallback slice number)
bad_series = []   # "<path>\t<label>" for MR files whose sequence could not be classified
mr_series = []    # "<path>\t<SeriesDescription>" for every MR file

for dirty in dcms:
    #Save some attributes
    # Paths mix "/" and "\\"; normalise before splitting. The patient folder is
    # the component right after "RAW_DCM" and is named "Patient <num>".
    path = dirty.replace('\\', '/').split("/")
    patient_name = path[path.index("RAW_DCM")+1]
    patient_num = patient_name.split(" ")[1]
    #print(dirty, end='\r')
    with pydicom.dcmread(dirty) as ds:
        #save tags
        # Fall back to StudyDate when AcquisitionDate is missing or empty.
        # (Named acq_date rather than `date` so the `datetime.date` import is
        # not shadowed.)
        acq_date = getattr(ds, "AcquisitionDate", "") or ds.StudyDate
        # NOTE(review): the original birth date is restored after anonymization
        # below -- confirm that keeping it is acceptable for this data set.
        birthday = getattr(ds, "PatientBirthDate", None)
        modality = ds.Modality

        #If MR, be more specific about which sequence
        if modality == "MR":
            series_desc = str(getattr(ds, "SeriesDescription", ""))
            mr_series.append(f"{dirty}\t{series_desc}")
            #if "ADC" in list(ds.ImageType) or "DIFFUSION" in list(ds.ImageType):
            # Order matters: the first matching keyword wins.
            if "ADC" in series_desc or "Apparent" in series_desc:
                modality = "ADC"
            elif "DWI" in series_desc:
                modality = "DWI"
            elif "T1" in series_desc:
                modality = "T1"
            elif "FSPGR" in series_desc:
                modality = "T1"
            elif "FLAIR" in series_desc:
                # Was misspelled `modatality`, which left FLAIR series labelled "MR".
                modality = "FLAIR"
            elif "POST" in series_desc:
                modality = "T1" #Sometimes named Ax FSPGR 3D POST
            elif "RAGE" in series_desc:
                modality = "T1" #AX MPRAGE GAD
            else:
                # Unknown sequence: use the description itself (spaces removed)
                # and record it for later review.
                modality = series_desc.replace(" ","") or "MR"
                bad_series.append(f"{dirty}\t{modality}")

        # Prefer the instance number stored in the file; only fall back to the
        # running counter when the tag is absent or empty.
        instance_number = getattr(ds, "InstanceNumber", None)
        if instance_number is None:
            slice_num = str(i)
        else:
            slice_num = str(getattr(instance_number, "original_string", instance_number)).strip()

        #Anonymize
        anonymizer.anonymize(ds)

        #Reimport date and save anon. px name
        ds.PatientName = patient_name
        ds.PatientID = patient_num
        ds.AcquisitionDate = acq_date
        if birthday is not None:
            ds.PatientBirthDate = birthday

        #Output Location
        # The fallback label comes from free text, so replace characters that
        # are not valid in file names.
        safe_modality = re.sub(r'[\\/:*?"<>|]', '-', modality)
        save_name = f"Anon_{patient_num}_{safe_modality}_{acq_date}_Slice_{slice_num.zfill(3)}.dcm"
        # Mirror the source sub-folder structure below the patient folder and
        # make sure it exists before writing.
        save_dir = outputFolder + "/".join(path[path.index("RAW_DCM")+1:-1])
        os.makedirs(save_dir, exist_ok=True)
        save_path = save_dir + "/" + save_name
        # NOTE(review): two series in the same source folder that map to the
        # same label, date and slice number produce the same save_path and
        # would overwrite each other -- verify against the data.
        ds.save_as(save_path)

        #Print some helpful info
        i = i + 1
        if (i % 100) >= 99:
            clear_output(wait=True)
        print('File:', save_name, '\tTotal: ', i, end='\r')
    
    

File: Anon_16_T1_20200601_Slice_151.dcm 	Total:  1392

In [236]:
# Scratch cell: load a few hand-picked files from `dcms` to inspect which tags
# can be used to tell MR sequences apart. `ds` is rebound on every read, so
# only the last file (index 1754) is examined by the checks below.
ds = pydicom.dcmread(dcms[1528]) #T1
#print(ds.ImageType)
ds = pydicom.dcmread(dcms[1480]) #ADC
#print(ds.ProtocolName)
ds = pydicom.dcmread(dcms[1754]) #?
#print(ds.ProtocolName)

# Candidate rule 1: detect diffusion/ADC images via the ImageType values.
if "ADC" in list(ds.ImageType) or "DIFFUSION" in list(ds.ImageType):
    print("ADC")
    
# Candidate rule 2: detect T1 via the ProtocolName text.
if "T1" in ds.ProtocolName:
    print("T1")
# Bare last expression: the notebook displays the series description.
ds.SeriesDescription

'Ax FSPGR 3D POST'

In [266]:
# Display how many DICOM paths are queued in `dcms`. The two commented-out
# expressions below are earlier inspections of `bad_series`.
#set([x.split("\t")[0].split("/")[-1].split("\\")[1] for x in bad_series])
#[x.split("\t")[0] for x in bad_series]
len(dcms)

44574

In [253]:
# Distinct values of the second tab-separated field of every `mr_series` entry.
{entry.split("\t")[1] for entry in mr_series}

{'3D T1 TRA 1MM C+',
 'AX DWI NEW 2019_ADC',
 'AX DWI NEW_ADC',
 'AX DWI RESOLVE_ADC',
 'AX DWI_ADC',
 'AX DWI_TRACEW',
 'AX FLAIR POST',
 'AX FSPGR 3D POST',
 'AX FSPGR BRAVO',
 'AX MPRAGE',
 'AX MPRAGE GAD',
 'AX RAGE POST',
 'AX T1 FS VIBE POST',
 'AX T1 MPRAGE POST',
 'AX T1 RAGE POST',
 'AXIAL DWI RESOLVE_ADC',
 'Apparent Diffusion Coefficient (mm2/s)',
 'Ax DWI TETRA 500/1000',
 'Ax DWI: MULTI-BVALUE',
 'Ax FSPGR 3D POST',
 'Ax T2 FLAIR 3MM POST',
 'Axial DWI_ADC',
 'Axial T1-RAGE',
 'DIFFUSION_ADC',
 'DWI TRA_ADC',
 'DWI_ADC',
 'FSPGR 3D POST',
 'PG AX FSPGR 3D',
 'PG Axial T1-RAGE',
 'POST GAD AX FLAIR',
 'POST GAD AX FSPGR',
 'POST GAD AX FSPGR 3D',
 'POST SAG T1 FS SPACE_MPR_Axial MPR',
 'POST SAG T1 VIBE FS CAIPI'}

In [23]:
#Data Folder
# `input_folder` / `output_folder` were used below without ever being defined in
# this notebook (NameError on a fresh kernel). They are defined here explicitly;
# the input root is the same raw-data folder used by the other cells.
input_folder = 'L:/SRS MRI Study/RAW_DCM/'
outputFolder = '../0_data/ANONYMIZED/'
output_folder = outputFolder

# Single hand-picked file, relative to the raw-data root, used to try out the
# anonymization round trip.
dirty = "Patient 01/CT Images/BreastApr2017/CT.C259470.Image 0.dcm"

with pydicom.dcmread(input_folder + dirty) as dataset:
    #save date
    # (named acquisition_date so the `datetime.date` import is not shadowed)
    acquisition_date = dataset.AcquisitionDate

    #Anonymize
    anonymizer.anonymize(dataset)

    #Reimport date and save anon. px name
    dataset.PatientName = "Patient 01"
    dataset.PatientID = "01"
    dataset.AcquisitionDate = acquisition_date

    # The output mirrors the input sub-folders; create them before saving.
    os.makedirs(os.path.dirname(output_folder + dirty), exist_ok=True)
    dataset.save_as(output_folder + dirty)

In [26]:
# Alternative listing of all "*.dcm" files below the raw-data root via os.walk.
# NOTE: this prints one line per matching file, which is a very large output on
# the full data set.
root = 'L:/SRS MRI Study/RAW_DCM/'
pattern = "*.dcm"

# The loop variables are deliberately NOT called `files` / `path`: those names
# are used elsewhere in this notebook (master file list, split path components)
# and were silently overwritten by this cell.
for dirpath, _subdirs, filenames in os.walk(root):
    for name in filenames:
        if fnmatch(name, pattern):
            print(os.path.join(dirpath, name))

L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 0.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 1.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 10.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 100.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 101.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 102.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 103.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 104.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 105.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 106.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470.Image 107.dcm
L:/SRS MRI Study/RAW_DCM/Patient 01\CT Images\BreastApr2017\CT.C259470

KeyboardInterrupt: 