In [1]:
import pandas as pd
import numpy as np
import pydicom
import glob

In [2]:
## First, collect the paths of all DICOM files in the working directory into a list.
## glob returns matches in arbitrary (filesystem-dependent) order, so sort them:
## that keeps "the first DICOM" and the row order of the tables below reproducible.
mydicoms = sorted(glob.glob("*.dcm"))

### Let's look at the contents of the first DICOM:

In [3]:
# Parse the first path in the list and display the resulting dataset.
first_dicom_path = mydicoms[0]
dcm1 = pydicom.dcmread(first_dicom_path)
dcm1

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 204
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: Secondary Capture Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.3.6.1.4.1.11129.5.5.166464213123411798454010159008736232327346
(0002, 0010) Transfer Syntax UID                 UI: Implicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.2.826.0.1.3680043.8.498.1
(0002, 0013) Implementation Version Name         SH: 'PYDICOM 1.2.0'
-------------------------------------------------
(0008, 0016) SOP Class UID                       UI: Secondary Capture Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.11129.5.5.166464213123411798454010159008736232327346
(0008, 0060) Modality                            CS: 'DX'
(0008, 1030) Study Description                   LO: 'Consolidation|Mass|Pneumonia|Pn

In [4]:
## Explore a few ways of extracting attributes from a dataset with pydicom.
# Item access using a string key.
dcm1["Columns"]
# Item access using a (group, element) tag pair, then read that element's keyword.
study_description_tag = (0x0008, 0x1030)
dcm1[study_description_tag].keyword

# Attribute-style access; as the last expression, this is the value the cell displays.
dcm1.PatientID

'13659'

## Now, let's create the dataframe that we want, and populate it in a loop with all of our DICOMS:

To complete this exercise, create a single dataframe that has the following columns:
- Patient ID
- Patient Age (as an integer)
- Patient Sex (M/F)
- Imaging Modality
- Type of finding in the image
- Number of rows in the image
- Number of columns in the image

Save this dataframe as a .CSV file.

In [5]:
# Column labels of the per-file summary table; each label matches the name of
# the dataset attribute it will be filled from.
column_name = [
    "PatientID",
    "PatientSex",
    "PatientAge",
    "Modality",
    "StudyDescription",
    "Rows",
    "Columns",
]

# Start from an empty frame that already carries the target column layout.
summary_db = pd.DataFrame(columns=column_name)

In [6]:
# Build the summary table with one row per DICOM file, indexed by file path.
#
# Records are gathered in a plain list and the DataFrame is constructed once at
# the end. DataFrame.append was removed in pandas 2.0, and growing a frame row
# by row copies it on every iteration. Rebuilding summary_db from scratch also
# makes this cell safe to re-run without duplicating rows.
summary_records = []
for adicom in mydicoms:
    dcm = pydicom.dcmread(adicom)
    summary_records.append({
        "PatientID": dcm.PatientID,
        "PatientSex": dcm.PatientSex,
        # The exercise asks for age as an integer; same conversion as the
        # "another method" cell further down.
        # NOTE(review): assumes the age is stored as plain digits (e.g. '62') -- confirm.
        "PatientAge": int(dcm.PatientAge),
        "Modality": dcm.Modality,
        "StudyDescription": dcm.StudyDescription,
        "Rows": dcm.Rows,
        "Columns": dcm.Columns,
    })

# columns= keeps the declared column order (and the columns themselves when no
# files were found); index= labels every row with the file it came from.
summary_db = pd.DataFrame(summary_records, index=list(mydicoms), columns=column_name)
    

In [7]:
# Display the summary table built above.
summary_db

Unnamed: 0,PatientID,PatientSex,PatientAge,Modality,StudyDescription,Rows,Columns
dicom_00013659_019.dcm,13659,F,62,DX,Consolidation|Mass|Pneumonia|Pneumothorax,1024,1024
dicom_00023075_033.dcm,23075,M,31,DX,Mass,1024,1024
dicom_00029579_005.dcm,29579,F,24,DX,Effusion|Nodule|Pleural_Thickening|Mass,1024,1024


# Another method: collect the rows in a list, then build the DataFrame in one call

In [8]:
# Gather one row of header values per DICOM file; the rows are turned into a
# DataFrame in a later cell.
all_data = []

for dicom_path in mydicoms:
    dcm = pydicom.dcmread(dicom_path)
    # Value order must match the column labels passed to pd.DataFrame afterwards.
    all_data.append([
        dcm.PatientID,
        int(dcm.PatientAge),
        dcm.PatientSex,
        dcm.Modality,
        dcm.StudyDescription,
        dcm.Rows,
        dcm.Columns,
    ])

In [9]:
# Column labels, listed in the same order as the values inside each row of all_data.
mydata_columns = [
    'PatientID',
    'PatientAge',
    'PatientSex',
    'Modality',
    'Findings',
    'Rows',
    'Columns',
]

# Build the whole table in a single constructor call.
mydata = pd.DataFrame(data=all_data, columns=mydata_columns)

In [10]:
# Display the table produced by the list-of-rows approach.
mydata

Unnamed: 0,PatientID,PatientAge,PatientSex,Modality,Findings,Rows,Columns
0,13659,62,F,DX,Consolidation|Mass|Pneumonia|Pneumothorax,1024,1024
1,23075,31,M,DX,Mass,1024,1024
2,29579,24,F,DX,Effusion|Nodule|Pleural_Thickening|Mass,1024,1024
