In [1]:
import os
import sys
import glob  
import pathlib

import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
# Notebook-wide pandas display settings.
# Fully-qualified ``display.*`` keys are used on purpose: pandas resolves bare
# names by pattern-matching against every registered option, so short names such
# as 'precision' or 'max_rows' become ambiguous (and raise OptionError) once
# other option groups, e.g. ``styler.*``, define options with the same suffix.
pd.set_option('display.precision', 3)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', None)  # None = never truncate cell text
pd.set_option('display.expand_frame_repr', True)

# MIMIC-CXR-JPG Database v2.0.0 ([link](https://physionet.org/content/mimic-cxr-jpg/2.0.0/))
- JPG version of [MIMIC-CXR](https://physionet.org/content/mimic-cxr/2.0.0/), based on the official [GitHub](https://github.com/MIT-LCP/mimic-cxr) repository.
- A set of 10 folders, each with ~6,500 sub-folders corresponding to all the JPG format images for an individual patient.
- `mimic-cxr-2.0.0-metadata.csv.gz` - a compressed CSV file providing useful metadata for the images including view position, patient orientation, and an anonymized date of image acquisition time allowing chronological ordering of the images.
- `mimic-cxr-2.0.0-split.csv.gz` - a compressed CSV file providing recommended train/validation/test data splits.
- `mimic-cxr-2.0.0-chexpert.csv.gz` - a compressed CSV file listing all studies with labels generated by the CheXpert labeler.
- `mimic-cxr-2.0.0-negbio.csv.gz` - a compressed CSV file listing all studies with labels generated by the NegBio labeler.

In [3]:
# Root of the local MIMIC-CXR-JPG v2.0.0 copy. The relative path is resolved
# against the current working directory, so it depends on where the kernel runs.
DATA_DIRECTORY = os.path.abspath("../../physionet.org/files/mimic-cxr-jpg/2.0.0/")

# Show the resolved root followed by its immediate entries in sorted order.
print(DATA_DIRECTORY)
for f in sorted(os.listdir(DATA_DIRECTORY)):
    print('    +--', f)

/workspace/MimicStorage/physionet.org/files/mimic-cxr-jpg/2.0.0
    +-- LICENSE.txt
    +-- README
    +-- SHA256SUMS.txt
    +-- files
    +-- index.html
    +-- mimic-cxr-2.0.0-chexpert.csv.gz
    +-- mimic-cxr-2.0.0-metadata.csv.gz
    +-- mimic-cxr-2.0.0-negbio.csv.gz
    +-- mimic-cxr-2.0.0-split.csv.gz


## 1. Dataset Description (using `mimic-cxr-2.0.0-metadata.csv.gz`)
Each sample (image) is uniquely identified by the combination of three columns: `subject_id`, `study_id`, and `dicom_id`.
Images are provided in individual folders. An example of the folder structure for a single patient's images is as follows:
```bash
files/
+-- p10/
|   +-- p10000032/
|       +-- s50414267/
|              02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg
|              174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg
|       +-- s53189527/
|              2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.jpg
|              e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c.jpg
|       +-- s53911762/
|              68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714.jpg
|              fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818.jpg
|       +-- s56699142/
|              ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c.jpg
```
The patient folder `p10000032` is located under `p10`, the group folder named after the first three characters of the patient folder name. This patient has four radiographic studies: `s50414267`, `s53189527`, `s53911762`, and `s56699142`. The study identifiers are completely random; they do not indicate any chronological order between the studies. Each study has two chest x-rays associated with it, except for `s56699142`, which has only one image. Only the original [MIMIC-CXR](https://physionet.org/content/mimic-cxr/2.0.0/) database contains the original free-text radiology reports.

In [4]:
%%time
# Read the per-image metadata table and key it by the three identifier columns
# in a single chained expression, then preview the first seven rows.
metadata = (
    pd.read_csv(
        os.path.join(DATA_DIRECTORY, 'mimic-cxr-2.0.0-metadata.csv.gz'),
        compression='gzip',
    )
    .set_index(['subject_id', 'study_id', 'dicom_id'])
)
display(metadata.head(7))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
subject_id,study_id,dicom_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10000032,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
10000032,50414267,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
10000032,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
10000032,53189527,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,
10000032,53911762,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,CHEST (PORTABLE AP),AP,2906,2258,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,Erect
10000032,56699142,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,CHEST (PORTABLE AP),AP,3056,2544,21800805,234424.765,CHEST (PORTABLE AP),antero-posterior,


CPU times: user 3.43 s, sys: 77.6 ms, total: 3.5 s
Wall time: 1.32 s


### 1-1. Tracking input JPG files (Method 1)

In [5]:
%%time

def get_jpg(row: pd.Series, root: str):
    """Build the on-disk JPG path for one metadata row.

    Intended for ``DataFrame.apply(get_jpg, axis=1, root=...)``, which passes
    each row as a ``pd.Series`` whose ``name`` is that row's index label.

    Args:
        row: A row whose ``name`` is the ``(subject_id, study_id, dicom_id)``
            index tuple.
        root: Directory that contains the patient-group folders (``p10``, ...).

    Returns:
        The path ``<root>/pXX/p<subject_id>/s<study_id>/<dicom_id>.jpg`` as a
        string if that file exists on disk, otherwise ``None``.
    """
    subject_id, study_id, dicom_id = row.name
    subject_dir = 'p' + str(subject_id)
    study_dir = 's' + str(study_id)
    # Patient folders are grouped under a folder named after the first three
    # characters of the patient folder name (e.g. p10000032 -> p10).
    relative_path = f"{subject_dir[:3]}/{subject_dir}/{study_dir}/{dicom_id}.jpg"
    jpg_file = os.path.join(root, relative_path)
    return jpg_file if os.path.exists(jpg_file) else None

# File paths can be derived from the metadata index alone: every row is mapped
# to its expected location under `<DATA_DIRECTORY>/files` by `get_jpg`.
metadata['JPG_FILE'] = metadata.apply(
    get_jpg,
    axis='columns',
    args=(os.path.join(DATA_DIRECTORY, 'files'),),
)
display(metadata.head(7))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,JPG_FILE
subject_id,study_id,dicom_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10000032,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect,/workspace/MimicStorage/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg
10000032,50414267,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect,/workspace/MimicStorage/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg
10000032,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect,/workspace/MimicStorage/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p10/p10000032/s53189527/2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.jpg
10000032,53189527,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect,/workspace/MimicStorage/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p10/p10000032/s53189527/e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c.jpg
10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,,/workspace/MimicStorage/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p10/p10000032/s53911762/68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714.jpg
10000032,53911762,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,CHEST (PORTABLE AP),AP,2906,2258,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,Erect,/workspace/MimicStorage/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p10/p10000032/s53911762/fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818.jpg
10000032,56699142,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,CHEST (PORTABLE AP),AP,3056,2544,21800805,234424.765,CHEST (PORTABLE AP),antero-posterior,,/workspace/MimicStorage/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p10/p10000032/s56699142/ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c.jpg


CPU times: user 29.7 s, sys: 1.68 s, total: 31.4 s
Wall time: 44.5 s


### 1-2. Tracking input JPG files (Method 2)

In [6]:
class DataHandler(object):
    """Helpers for finding JPG files and parsing identifiers out of their paths.

    The ID getters expect paths shaped like
    ``.../p<subject_id>/s<study_id>/<dicom_id>.jpg``.
    """

    @classmethod
    def scan_jpg_files(cls, root: str):
        """Recursively yield the path of every ``.jpg`` file under ``root``.

        Built on ``os.scandir``. Yielded paths are prefixed with ``root`` as
        given (they are not made absolute) and follow directory-listing order.
        """
        # The context manager releases the directory handle even when the
        # caller stops consuming the generator early.
        with os.scandir(root) as entries:
            for entry in entries:
                if entry.is_file() and entry.name.endswith('.jpg'):
                    yield entry.path
                elif entry.is_dir():
                    yield from cls.scan_jpg_files(entry.path)

    @classmethod
    def walk_jpg_files(cls, root: str):
        """Recursively yield the absolute path of every ``.jpg`` file under ``root``.

        Built on ``os.walk``.
        """
        # os.walk already descends into sub-directories and `filenames` never
        # contains directories, so no manual recursion is needed here.
        for directory, _, filenames in os.walk(root):
            for filename in filenames:
                if filename.endswith('.jpg'):
                    yield os.path.abspath(os.path.join(directory, filename))

    @staticmethod
    def _require_jpg(jpg_path: str):
        """Return ``jpg_path`` as a ``pathlib.Path``; raise ValueError if it is not a ``.jpg`` path."""
        if not jpg_path.endswith('.jpg'):
            raise ValueError(f"Expected a path ending in '.jpg', got: {jpg_path!r}")
        return pathlib.Path(jpg_path)

    @staticmethod
    def _strip_prefix(name: str, prefix: str):
        """Return ``name`` without a leading ``prefix``; other occurrences are kept."""
        return name[len(prefix):] if name.startswith(prefix) else name

    @staticmethod
    def get_dicom_id(jpg_path: str):
        """Return the file name of ``jpg_path`` without its trailing ``.jpg``.

        Raises:
            ValueError: If ``jpg_path`` does not end with ``.jpg``.
        """
        file_name = DataHandler._require_jpg(jpg_path).name
        # Only the trailing extension is removed, never an inner '.jpg'.
        return file_name[:-len('.jpg')]

    @staticmethod
    def get_study_id(jpg_path: str):
        """Return the parent folder name of ``jpg_path`` without its leading ``s``.

        Raises:
            ValueError: If ``jpg_path`` does not end with ``.jpg``.
        """
        study_dir = DataHandler._require_jpg(jpg_path).parent.name
        return DataHandler._strip_prefix(study_dir, 's')

    @staticmethod
    def get_subject_id(jpg_path: str):
        """Return the grandparent folder name of ``jpg_path`` without its leading ``p``.

        Raises:
            ValueError: If ``jpg_path`` does not end with ``.jpg``.
        """
        subject_dir = DataHandler._require_jpg(jpg_path).parent.parent.name
        return DataHandler._strip_prefix(subject_dir, 'p')


In [7]:
%%time
# Materialise the recursive scan into a list so the images can be counted and indexed.
jpg_files = list(DataHandler.scan_jpg_files(DATA_DIRECTORY))
print(f"Number of JPG images: {len(jpg_files):,}")

Number of JPG images: 377,109
CPU times: user 8.65 s, sys: 9.19 s, total: 17.8 s
Wall time: 6min 14s


In [8]:
# Example: the file name of the first scanned path with its '.jpg' extension removed.
DataHandler.get_dicom_id(jpg_files[0])

'bacd6234-0b2bd919-6e4fbfe4-c4aa4c1d-9c3a805a'

In [9]:
# Example: the parent folder name of the first scanned path with its 's' removed.
DataHandler.get_study_id(jpg_files[0])

'56262531'

In [10]:
# Example: the grandparent folder name of the first scanned path with its 'p' removed.
DataHandler.get_subject_id(jpg_files[0])

'10989188'

### 1-3. Class Labels (using `mimic-cxr-2.0.0-chexpert.csv.gz`)

In [11]:
# Load the gzip-compressed CheXpert label CSV from the dataset root and preview
# its first seven rows.
chexpert = pd.read_csv(os.path.join(DATA_DIRECTORY, 'mimic-cxr-2.0.0-chexpert.csv.gz'), compression='gzip')
display(chexpert.head(7))