In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydicom
import os
import hashlib
import uuid
import shutil
from tqdm import tqdm

In [110]:
def _value_or_none(getter):
    """Call ``getter()`` and return its result, or ``None`` if the lookup fails.

    The DICOM attributes read below are optional, so a failed lookup is mapped
    to ``None`` instead of aborting the extraction. ``Exception`` is caught
    (rather than using a bare ``except``) so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate.
    """
    try:
        return getter()
    except Exception:
        return None


def _md5_or_none(text):
    """Return the hex MD5 digest of the string ``text``; ``None`` stays ``None``."""
    if text is None:
        return None
    return hashlib.md5(text.encode()).hexdigest()


def extract_dicom(dcm):
    """Extract the pixel data and a flat, de-identified metadata record.

    Parameters
    ----------
    dcm : object
        A loaded DICOM dataset (the notebook passes the result of
        ``pydicom.dcmread``). ``pixel_array``, ``Rows`` and ``Columns`` are
        required; every other attribute is optional.

    Returns
    -------
    img
        ``dcm.pixel_array`` unchanged.
    data_dict : dict
        One record with the keys ``patient_hash``, ``laterality``,
        ``patient_age``, ``frame_of_reference_UID``, ``image_orientation``,
        ``image_type``, ``rows``, ``columns``, ``frames``, ``equipment``,
        ``image_hash``, ``location``, ``description``, ``acquisition_date``
        and ``image_uuid``. Optional values that cannot be read are ``None``;
        identifiers and the acquisition date are stored as MD5 hex digests.

    Raises
    ------
    Whatever accessing ``pixel_array``, ``Rows`` or ``Columns`` raises; those
    lookups are deliberately not guarded.
    """
    img = dcm.pixel_array

    # Identifiers are hashed; a missing identifier yields None instead of
    # crashing on ``None.encode()``.
    patient_id = _value_or_none(lambda: dcm.PatientID)
    frame_of_reference_uid = _value_or_none(lambda: dcm.FrameOfReferenceUID)
    # Only the first 8 characters of the acquisition timestamp are hashed.
    acquisition_date = _value_or_none(lambda: str(dcm.AcquisitionDateTime)[:8])

    # Key order is kept stable because it becomes the CSV column order.
    data_dict = {
        'patient_hash': _md5_or_none(patient_id),
        'laterality': _value_or_none(lambda: dcm.ImageLaterality),
        # Whole years between birth and acquisition, using 365.25-day years.
        'patient_age': _value_or_none(
            lambda: int(
                (
                    pydicom.valuerep.DT(dcm.AcquisitionDateTime)
                    - pydicom.valuerep.DT(dcm.PatientBirthDate)
                ).days // 365.25
            )
        ),
        'frame_of_reference_UID': _md5_or_none(frame_of_reference_uid),
        'image_orientation': _value_or_none(
            lambda: dcm.PerFrameFunctionalGroupsSequence[0]
            .PlaneOrientationSequence[0]
            .ImageOrientationPatient
        ),
        'image_type': _value_or_none(
            lambda: dcm.AcquisitionDeviceTypeCodeSequence[0].CodeMeaning
        ),
        'rows': dcm.Rows,
        'columns': dcm.Columns,
        'frames': _value_or_none(lambda: dcm.NumberOfFrames),
        'equipment': _value_or_none(lambda: dcm.ManufacturerModelName),
        # hashlib needs a C-contiguous buffer; this is a no-op for arrays that
        # already are contiguous, so existing digests are unchanged.
        'image_hash': hashlib.md5(np.ascontiguousarray(img)).hexdigest(),
        'location': _value_or_none(lambda: dcm.AnatomicRegionSequence[0].CodeMeaning),
        'description': _value_or_none(lambda: dcm[0x0008, 0x103e].value),
        'acquisition_date': _md5_or_none(acquisition_date),
        # uuid4 is purely random; uuid1 would embed the host's network address
        # and the current time in an otherwise anonymised record.
        'image_uuid': str(uuid.uuid4()),
    }

    return img, data_dict




In [10]:
# Location of the single sample DICOM file inspected in the cells below.
fpath = os.path.join(
    'data', 'AI_IR', 'NPDR_BE', '001',
    'DICOM', 'MC82', '81358', '513871', '8818540',
    '00000002',
)

In [28]:
# Load the sample file and pull out, one by one, the fields of interest.
dcm_buf = pydicom.dcmread(fpath)

# MD5 fingerprints of the pixel buffer and of the patient identifier.
pxl_digest = hashlib.md5(dcm_buf.pixel_array)
pxl_hash = pxl_digest.hexdigest()
patient_digest = hashlib.md5(dcm_buf.PatientID.encode())
patient_hash = patient_digest.hexdigest()

# Elements addressed directly by their (group, element) tag.
exam_description = dcm_buf[0x0008, 0x103e].value
frame_of_reference = dcm_buf[0x0020, 0x0052].value

# Orientation is read from the first per-frame functional group.
first_frame_group = dcm_buf.PerFrameFunctionalGroupsSequence[0]
image_orientation = first_frame_group.PlaneOrientationSequence[0].ImageOrientationPatient

# First item of the (0008,2218) sequence, then its (0008,0104) element.
location_item = dcm_buf[0x0008, 0x2218][0]
exam_location = location_item[0x0008, 0x0104].value

# Age in 365.25-day years; floor division by a float leaves a float result.
acquired_at = pydicom.valuerep.DT(dcm_buf.AcquisitionDateTime)
born_on = pydicom.valuerep.DT(dcm_buf.PatientBirthDate)
patient_age = (acquired_at - born_on).days//365.25


In [26]:
# Show every extracted field on its own line.
print(
    pxl_hash,
    patient_hash,
    exam_description,
    frame_of_reference,
    image_orientation,
    exam_location,
    patient_age,
    sep='\n',
)

adbe14f88fd8d670b410f81296d307c3
9f7f1199d835c574bed73772dcfbc6f9
Section IR 30° ART
1.3.6.1.4.1.33437.11.5.12179244.54026775635138.457.5
[1.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000]
Retina
56.0


In [63]:
# Run the extractor on the sample dataset loaded above: `img` receives the
# pixel array and `data` the flat metadata dict.
img, data = extract_dicom(dcm_buf)

In [64]:
# Render the single metadata record as a one-row table.
sample_row = [data]
display(pd.DataFrame(sample_row))

Unnamed: 0,patient_hash,laterality,patient_age,frame_of_reference_UID,image_orientation,image_type,rows,cols,frames,equipment,image_hash,location,description,acquisition_date,image_uuid
0,37b1aaa93e9afa512d2e9b786db91b10,R,42,bdbc5c3e98b6ff96c4534369324521d6,"[0.000000, 0.000000, 1.000000, 0.000000, 1.000...",Optical Coherence Tomography Scanner,496,768,1,Spectralis,cb124bb77b40988ddd6fbe8c543d6442,Retina,Section IR 30° ART,1236e6581762af07721d670a11235a7f,4147a39b-c060-494a-9c5d-bd25b4b610ed


In [115]:
# Export every DICOM under data_structured/<proliferation>/ as a PNG plus one
# metadata CSV per proliferation class.
dest = os.path.join('original_data', 'images')
# Start from an empty image directory so PNGs from a previous run (whose UUIDs
# no longer appear in the CSVs) do not linger.
if os.path.exists(dest):
    shutil.rmtree(dest)
# makedirs instead of mkdir: also creates 'original_data' when it is missing,
# which os.mkdir would reject with FileNotFoundError.
os.makedirs(dest, exist_ok=True)


proliferation_list = ['NPDR', 'PDR']
for proliferation in proliferation_list:
    df_list = []
    path_to_dcm = os.path.join('data_structured', proliferation)
    for dcm_name in tqdm(os.listdir(path_to_dcm)):
        # Ignore hidden entries such as '.DS_Store'.
        if dcm_name.startswith('.'):
            continue
        dcm_buf = pydicom.dcmread(os.path.join(path_to_dcm, dcm_name))
        img, data = extract_dicom(dcm_buf)
        data['proliferation'] = proliferation
        if data['acquisition_date'] is None: # skip corrupted images
            continue
        df_list.append(data)
        # The PNG is named after the record's UUID so CSV rows map to files.
        plt.imsave(os.path.join(dest, data['image_uuid'] + '.png'), img, cmap='gray')
    df = pd.DataFrame.from_dict(df_list)
    df = df.sort_values(by=['patient_hash', 'frame_of_reference_UID', 'image_type'])
    df.to_csv(os.path.join('original_data', proliferation + '_after2020.csv'), index=False)

100%|██████████| 330/330 [00:29<00:00, 11.09it/s]
100%|██████████| 146/146 [00:13<00:00, 10.93it/s]


In [116]:
def _load_prior_2020(xlsx_path, proliferation):
    """Read one pre-2020 spreadsheet and align it with the post-2020 CSV layout.

    The 'eye' column is renamed to 'laterality', its 'right'/'left' values are
    mapped to 'R'/'L', and gaps in 'patient_hash' and 'laterality' are
    forward-filled (presumably the sheets only fill the first row of each
    group -- verify against the spreadsheets). A constant 'proliferation'
    column is appended.

    Results are assigned back to the frame instead of calling
    ``frame[col].replace(..., inplace=True)``: an in-place call on a column
    selection does not update the frame under pandas Copy-on-Write, and
    ``fillna(method='ffill')`` is deprecated in favour of ``ffill()``.
    """
    frame = pd.read_excel(xlsx_path)
    frame['proliferation'] = proliferation
    frame = frame.rename(columns={'eye': 'laterality'})
    frame['laterality'] = frame['laterality'].replace({'right': 'R', 'left': 'L'})
    frame['patient_hash'] = frame['patient_hash'].ffill()
    frame['laterality'] = frame['laterality'].ffill()
    return frame


p2020_npdr = _load_prior_2020('original_data/prior_2020/ALL_NPDR.xlsx', 'NPDR')
p2020_pdr = _load_prior_2020('original_data/prior_2020/ALL_PDR.xlsx', 'PDR')

a2020_npdr = pd.read_csv('original_data/NPDR_after2020.csv')
a2020_pdr = pd.read_csv('original_data/PDR_after2020.csv')

# Stack the pre-2020 rows on top of the post-2020 rows for each class.
new_npdr = pd.concat([p2020_npdr, a2020_npdr], ignore_index=True)
new_pdr = pd.concat([p2020_pdr, a2020_pdr], ignore_index=True)
display(new_npdr.head())

new_npdr = new_npdr.sort_values(by=['patient_hash', 'frame_of_reference_UID', 'image_type'])
new_pdr = new_pdr.sort_values(by=['patient_hash', 'frame_of_reference_UID', 'image_type'])

new_npdr.to_csv('data/NPDR.csv', index=False)
new_pdr.to_csv('data/PDR.csv', index=False)

Unnamed: 0,patient_hash,laterality,patient_age,frame_of_reference_UID,image_orientation,image_type,rows,columns,frames,equipment,image_hash,location,description,acquisition_date,image_uuid,proliferation
0,1f5ddf4c34ee8b37cc6cb201529b67ab,R,38,8e5460c218a3bcf5b5b98c6c52c7a78a,"[0.049289, 0.000000, 0.998785, 0.000000, 1.000...",Optical Coherence Tomography Scanner,496,768,1,Spectralis,8d987acd897e7f8a3601c751d4856397,Retina,Section IR 30° ART,4f13daeff329c8f0f527a1949ac3b75f,ef894351-709d-4fae-86b4-3a7d368d2f5a,NPDR
1,1f5ddf4c34ee8b37cc6cb201529b67ab,R,38,8e5460c218a3bcf5b5b98c6c52c7a78a,,Scanning Laser Ophthalmoscope,768,768,1,Spectralis,eef484e03167e9f70d38b72c572afcbc,Retina,Section IR 30° ART,4f13daeff329c8f0f527a1949ac3b75f,952babeb-b97e-4c67-ae57-3de20862aa8f,NPDR
2,1f5ddf4c34ee8b37cc6cb201529b67ab,R,38,c434e39b521372d128ba8c310cb8aced,"[0.999005, 0.000000, -0.044588, 0.000000, 1.00...",Optical Coherence Tomography Scanner,496,768,1,Spectralis,79ed18bc2b469bcd62275a43cac877ac,Retina,Section IR 30° ART,4f13daeff329c8f0f527a1949ac3b75f,57c82bc6-9d26-4ece-b9b2-64d7553d8622,NPDR
3,1f5ddf4c34ee8b37cc6cb201529b67ab,R,38,c434e39b521372d128ba8c310cb8aced,,Scanning Laser Ophthalmoscope,768,768,1,Spectralis,1a1faaac24de812be4e0f0ff2de311c8,Retina,Section IR 30° ART,4f13daeff329c8f0f527a1949ac3b75f,80253379-df94-4aae-bbe2-319fd7bda6f3,NPDR
4,e8c85dac5d513b04b9f660e87753a0d5,R,55,fe60b849f7a849fc3906fbd52c8aaabe,"[0.999658, 0.000000, -0.026148, 0.000000, 1.00...",Optical Coherence Tomography Scanner,496,768,1,Spectralis,e9bf262907cc078b9f642aa963073d87,Retina,Section IR 30° ART,c38a65dae3e3a3365908600a320ac5a2,65fa4223-573a-4854-b55a-6d619e46bd28,NPDR


In [117]:
# Merge both proliferation classes into one table, order it by patient,
# frame of reference and image type, then write it out as a single CSV.
combined = pd.concat(
    [new_npdr, new_pdr],
    ignore_index=True,
).sort_values(by=['patient_hash', 'frame_of_reference_UID', 'image_type'])
combined.to_csv('data/ALL.csv', index=False)