# MIDRC Cohort Selection and Image Downloading For MRI Spine Imaging Studies
---
By Heather Whitney, PhD (Department of Radiology at the University of Chicago, hwhitney at uchicago.edu)

Based on code by Chris Meyer, PhD (Center for Translational Data Science at the University of Chicago)


May 2023

---
This Jupyter notebook tutorial demonstrates how to use the MIDRC data commons' APIs to access imaging study data  and how to access those image files. It was developed to facilitate the Data Science MRI bootcamp held at the 2023 Annual Meeting of the American Association of Physicists in Medicine.

# Python packages:

In [1]:
# The packages below may be necessary for users to install according to the imports necessary in the subsequent cells.

!pip install --upgrade pandas
!pip install --upgrade --ignore-installed PyYAML
!pip install --upgrade pip
!pip install gen3
!pip install --upgrade gen3 --user --upgrade
#!pip install cdiserrors
#!pip install --upgrade pydicom
!pip install pydicom
#!pip install pylibjpeg
#!pip install -U python-gdcm


Collecting PyYAML
  Using cached PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl (192 kB)
Installing collected packages: PyYAML
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.41 requires pathlib, which is not installed.
conda-repo-cli 1.0.41 requires requests_mock, which is not installed.
conda-repo-cli 1.0.41 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.41 requires nbformat==5.4.0, but you have nbformat 5.7.0 which is incompatible.
dictionaryutils 3.0.0 requires PyYAML~=5.1, but you have pyyaml 6.0 which is incompatible.
gdcdictionary 1.2.0 requires PyYAML~=5.1, but you have pyyaml 6.0 which is incompatible.
pypfb 0.5.0 requires PyYAML<6.0.0,>=5.3.1, but you have pyyaml 6.0 which is incompatible.[0m[31m
[0mSuccessfully installed PyYAML-5.4.1
Collecting pip
  Downloading pip-23.2-py3-none-an



# Import Python Packages and scripts

In [2]:
# Import Python Packages and scripts
import pandas as pd
import sys, os, webbrowser
import gen3
import pydicom
import matplotlib.pyplot as plt

from gen3.submission import Gen3Submission
from gen3.auth import Gen3Auth
from gen3.index import Gen3Index
from gen3.query import Gen3Query

In [3]:
# Import some custom Python scripts from personal GitHub repo
# change these directory paths to reflect your local working directory

home_dir = "/Users/heatherwhitney" 
demo_dir = "{}/Documents/MRIbootcamp".format(home_dir)

os.chdir(demo_dir)

#os.system("wget https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py -O {}/expansion.py".format(demo_dir))
%run expansion.py


In [4]:
# Initiate instances of the Gen3 SDK Classes using credentials file for authentication
# Set up these credentials using instructions at https://data.midrc.org/dashboard/Public/documentation/Gen3_MIDRC_GetStarted.pdf
# Change the directory path in "cred" to reflect the location where you saved your credentials file.

api = "https://data.midrc.org"
cred = "{}/Documents/MRIbootcamp/credentials.json".format(home_dir)
auth = Gen3Auth(api, refresh_file=cred) # authentication class
sub = Gen3Submission(api, auth) # submission class
query = Gen3Query(auth) # query class
exp = Gen3Expansion(api,auth,sub) # class with some custom scripts
exp.get_project_ids()


Getting all project_ids you have access to in the data commons.
['Open-A1', 'Open-A1_PETAL_REDCORAL', 'Open-R1', 'TCIA-COVID-19-AR', 'TCIA-COVID-19-NY-SBU', 'TCIA-COVID-19_CT_Images', 'TCIA-RICORD']


['Open-A1',
 'Open-A1_PETAL_REDCORAL',
 'Open-R1',
 'TCIA-COVID-19-AR',
 'TCIA-COVID-19-NY-SBU',
 'TCIA-COVID-19_CT_Images',
 'TCIA-RICORD']

In [5]:
# IMAGING STUDY NODE: Export all records in the imaging_study node
st = exp.get_node_tsvs(node = 'imaging_study')
print('\nrows:{}, columns:{}'.format(st.shape[0], st.shape[1]))
t = list(set(st['case_ids']))
display(len(t))
st.head(3)

File previously downloaded.
node_tsvs/imaging_study_tsvs/Open-R1_imaging_study.tsv has 76905 records.
File previously downloaded.
node_tsvs/imaging_study_tsvs/TCIA-COVID-19-NY-SBU_imaging_study.tsv has 7363 records.
File previously downloaded.
node_tsvs/imaging_study_tsvs/TCIA-COVID-19_CT_Images_imaging_study.tsv has 753 records.
File previously downloaded.
node_tsvs/imaging_study_tsvs/TCIA-RICORD_imaging_study.tsv has 1238 records.
File previously downloaded.
node_tsvs/imaging_study_tsvs/TCIA-COVID-19-AR_imaging_study.tsv has 256 records.
File previously downloaded.
node_tsvs/imaging_study_tsvs/Open-A1_imaging_study.tsv has 57211 records.
File previously downloaded.
node_tsvs/imaging_study_tsvs/Open-A1_PETAL_REDCORAL_imaging_study.tsv has 3170 records.
length of all dfs: 146896
Master node TSV with 146896 total records written to master_imaging_study.tsv.

rows:146896, columns:24


58685

Unnamed: 0,type,id,project_id,submitter_id,age_at_imaging,age_at_imaging_gt89,body_part_examined,case_ids,days_from_study_to_neg_covid_test,days_from_study_to_pos_covid_test,...,loinc_method,loinc_system,study_description,study_location,study_modality,study_uid,study_year,study_year_shifted,cases.id,cases.submitter_id
0,imaging_study,0002f8b1-7079-4510-9967-7bdffb3fa155,Open-R1,1.2.826.0.1.3680043.10.474.302028.1411134,2.0,No,CHEST,302028-003859,"484,426,357,331,191,84,0,-108,-187",,...,XR,Chest,XR CHEST 1 VIEW,,CR,1.2.826.0.1.3680043.10.474.302028.1411134,2020.0,False,71a258a3-7566-42b7-a9ce-9b8fac63816f,302028-003859
1,imaging_study,000555ed-a633-4495-a98d-43ada69216a2,Open-R1,1.2.826.0.1.3680043.10.474.419639.104706049142...,61.0,No,CHEST,419639-008711,"-5,-9,-38,-72",,...,XR,Chest,XR CHEST 1 VIEW AP,,CR,1.2.826.0.1.3680043.10.474.419639.104706049142...,,True,3e8d4c3d-887d-4937-95d9-a81a5eae0792,419639-008711
2,imaging_study,0005b825-377f-4170-96e9-8c5d42036bac,Open-R1,1.2.826.0.1.3680043.10.474.302028.1251677,26.0,No,CHEST,302028-002059,"0,-17,-67,-86",,...,XR.portable,Chest,XR CHEST PORTABLE 1 VIEW,,DX,1.2.826.0.1.3680043.10.474.302028.1251677,2020.0,False,c3297960-c430-4b06-a9f5-9d833f4120cf,302028-002059


In [6]:
# IMAGING STUDY NODE: retain those with MR study modality
s = st.loc[(st['study_modality']=='MR') ]
# optional for display
t = list(set(s['case_ids']))
display(len(t))
# print('\nrows:{}, columns:{}'.format(s.shape[0], s.shape[1]))
s.head(5)
display(list(s))
temp = s

133

['type',
 'id',
 'project_id',
 'submitter_id',
 'age_at_imaging',
 'age_at_imaging_gt89',
 'body_part_examined',
 'case_ids',
 'days_from_study_to_neg_covid_test',
 'days_from_study_to_pos_covid_test',
 'days_to_study',
 'loinc_code',
 'loinc_contrast',
 'loinc_long_common_name',
 'loinc_method',
 'loinc_system',
 'study_description',
 'study_location',
 'study_modality',
 'study_uid',
 'study_year',
 'study_year_shifted',
 'cases.id',
 'cases.submitter_id']

## Identify MR imaging studies of the spine

In [7]:
ps = temp.loc[(s['study_description'] == 'MR SPINE LUMBAR WO CONTRAST')]
# option: negative cases only
#ps = temp.loc[ (temp['test_result_text']=='Negative')]
# option: all cases with any COVID test information
#ps = temp;
t = list(set(ps['case_ids']))
display(len(t))
display(ps)

22

Unnamed: 0,type,id,project_id,submitter_id,age_at_imaging,age_at_imaging_gt89,body_part_examined,case_ids,days_from_study_to_neg_covid_test,days_from_study_to_pos_covid_test,...,loinc_method,loinc_system,study_description,study_location,study_modality,study_uid,study_year,study_year_shifted,cases.id,cases.submitter_id
91075,imaging_study,1477a845-04ee-4367-a0a2-258250f7e6b2,Open-A1,2.16.840.1.114274.1818.55812721724815372581189...,41.0,No,,10000364-2397900,178,164147,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.55812721724815372581189...,,True,0401646d-2682-496f-9d57-b859f4f2c8db,10000364-2397900
92304,imaging_study,19f6ee92-ac46-4324-b85a-8d694a1779a3,Open-A1,2.16.840.1.114274.1818.49863489704487727621193...,32.0,No,,10000364-2022035,0,1,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.49863489704487727621193...,,True,4d8b7497-a45f-4383-aeb0-5aee6eae1b65,10000364-2022035
92764,imaging_study,1c0791fa-37de-408d-85e6-1acf127b00ae,Open-A1,2.16.840.1.114274.1818.55816535374091571574831...,50.0,No,,10000364-1058096,,-13,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.55816535374091571574831...,,True,56592e71-b345-4e2c-877c-78213e678297,10000364-1058096
101517,imaging_study,42c456ee-87de-4d4c-9972-f3006d5ced82,Open-A1,2.16.840.1.114274.1818.49363651897124726091109...,70.0,No,,10000364-1437263,,0,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.49363651897124726091109...,,True,36614f91-9336-453c-b2f7-0b1d58465c8e,10000364-1437263
109635,imaging_study,6704d7d5-6f11-4d70-9c7b-e96deadeba66,Open-A1,2.16.840.1.114274.1818.57513909915417576111032...,69.0,No,,10000364-5461045,1851820,12,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.57513909915417576111032...,,True,0cbda608-e393-4826-853c-1a8fa15dca3a,10000364-5461045
111051,imaging_study,6cd59ce8-572e-46f8-9048-88f3c662d80b,Open-A1,2.16.840.1.114274.1818.51383238972914735621593...,45.0,No,,10000364-5314069,-2,108,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.51383238972914735621593...,,True,7f9efb61-c274-4a85-89f6-e6f64d62841f,10000364-5314069
112670,imaging_study,7441f65e-f3ac-413a-b22e-2aed0cfa865a,Open-A1,2.16.840.1.114274.1818.54290417108773167691434...,57.0,No,,10000364-1503940,,-17,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.54290417108773167691434...,,True,1c6bbe26-d778-408e-973f-2476a162421c,10000364-1503940
112888,imaging_study,755113bf-1444-41c8-ab0b-b599959c324b,Open-A1,2.16.840.1.114274.1818.49062652536657148311009...,33.0,No,,10000364-1119749,-2,98,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.49062652536657148311009...,,True,eeb3b5a2-2fb6-44ee-88bc-768957f56fa8,10000364-1119749
113613,imaging_study,7878e164-2c2a-4cb5-b17a-2f2aff039066,Open-A1,2.16.840.1.114274.1818.46697798819023351601590...,78.0,No,,10000364-784361,"-2,-24",20,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.46697798819023351601590...,,True,bc03a400-2bb4-4912-a3f5-055007bae1bc,10000364-784361
114559,imaging_study,7cb2139e-249c-4b14-bb24-bb51583e6807,Open-A1,2.16.840.1.114274.1818.47817901536556695159436...,58.0,No,,10000364-950007,,-120,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.47817901536556695159436...,,True,f8f9d881-f904-44a0-81f5-a65be676d646,10000364-950007


In [8]:
#Saving the data frame to a csv
os.chdir(demo_dir)
filename = 'MR_records.tsv' 
ps.to_csv(filename,sep='\t',index=False)
ps.head(5)

Unnamed: 0,type,id,project_id,submitter_id,age_at_imaging,age_at_imaging_gt89,body_part_examined,case_ids,days_from_study_to_neg_covid_test,days_from_study_to_pos_covid_test,...,loinc_method,loinc_system,study_description,study_location,study_modality,study_uid,study_year,study_year_shifted,cases.id,cases.submitter_id
91075,imaging_study,1477a845-04ee-4367-a0a2-258250f7e6b2,Open-A1,2.16.840.1.114274.1818.55812721724815372581189...,41.0,No,,10000364-2397900,178.0,164147,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.55812721724815372581189...,,True,0401646d-2682-496f-9d57-b859f4f2c8db,10000364-2397900
92304,imaging_study,19f6ee92-ac46-4324-b85a-8d694a1779a3,Open-A1,2.16.840.1.114274.1818.49863489704487727621193...,32.0,No,,10000364-2022035,0.0,1,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.49863489704487727621193...,,True,4d8b7497-a45f-4383-aeb0-5aee6eae1b65,10000364-2022035
92764,imaging_study,1c0791fa-37de-408d-85e6-1acf127b00ae,Open-A1,2.16.840.1.114274.1818.55816535374091571574831...,50.0,No,,10000364-1058096,,-13,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.55816535374091571574831...,,True,56592e71-b345-4e2c-877c-78213e678297,10000364-1058096
101517,imaging_study,42c456ee-87de-4d4c-9972-f3006d5ced82,Open-A1,2.16.840.1.114274.1818.49363651897124726091109...,70.0,No,,10000364-1437263,,0,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.49363651897124726091109...,,True,36614f91-9336-453c-b2f7-0b1d58465c8e,10000364-1437263
109635,imaging_study,6704d7d5-6f11-4d70-9c7b-e96deadeba66,Open-A1,2.16.840.1.114274.1818.57513909915417576111032...,69.0,No,,10000364-5461045,1851820.0,12,...,MR,,MR SPINE LUMBAR WO CONTRAST,,MR,2.16.840.1.114274.1818.57513909915417576111032...,,True,0cbda608-e393-4826-853c-1a8fa15dca3a,10000364-5461045


## Get the imaging files for the identified studies or cases.
---
Now that we have a list of relevant imaging studies t, we can use the study_uid, which is a unique identifier for imaging studies, to collect the associated files. 


In [9]:
## Make a list of study_uids and case_ids

## read in previously saved DataFrame if restarting notebook:
# pd.read_csv(filename, sep='\t', dtype=str)

cids = list(set(ps['case_ids']))
display(len(cids))

sids = list(set(ps['study_uid']))
display(len(sids))
ps.head(5)
list(ps)

22

22

['type',
 'id',
 'project_id',
 'submitter_id',
 'age_at_imaging',
 'age_at_imaging_gt89',
 'body_part_examined',
 'case_ids',
 'days_from_study_to_neg_covid_test',
 'days_from_study_to_pos_covid_test',
 'days_to_study',
 'loinc_code',
 'loinc_contrast',
 'loinc_long_common_name',
 'loinc_method',
 'loinc_system',
 'study_description',
 'study_location',
 'study_modality',
 'study_uid',
 'study_year',
 'study_year_shifted',
 'cases.id',
 'cases.submitter_id']

In [10]:
## This query retrieves ALL imaging_study records, we will next filter these results based on the COVID test data
data = query.raw_data_download(
    data_type="imaging_study",
    fields=[
        "study_uid",
        "case_ids",
        "object_id",
        "project_id",
        "days_to_study",
        "study_modality"
    ],
    sort_fields=[{"case_ids": "asc"}],
    accessibility="accessible"
)

In [11]:
# convert the query data to a DataFrame and remove any records that lack a study_uid or object_id
studies = pd.DataFrame(data)
studies = studies.loc[(~studies['object_id'].isna())&(~studies['study_uid'].isna())]
display(len(studies))
studies.head(5)



137707

Unnamed: 0,days_to_study,study_modality,case_ids,study_uid,project_id,object_id
0,3.0,[CR],[419639-000340],1.2.826.0.1.3680043.10.474.419639.796349784997...,Open-R1,[dg.MD1R/7e6dfa45-dd3a-4275-9522-016a671f3152]
1,-7.0,[CR],[10000364-5525032],2.16.840.1.114274.1818.51687401151122173581269...,Open-A1,"[dg.MD1R/bc1a4a11-1d97-4cf3-bf9c-4a9c964d588a,..."
2,50.0,[CR],[10003752-BeDxoSuS6EGO2d3VaDdxvA],2.16.840.1.114274.1818.56868609356553584055729...,Open-A1,[dg.MD1R/fe5df236-4567-492f-a9b8-fdf110494b6e]
3,5.0,[DX],[302028-006645],1.2.826.0.1.3680043.10.474.302028.2753542,Open-R1,[dg.MD1R/5e408de5-17b1-4a02-9ec3-109a827f1160]
4,-190.0,[CT],[10000364-796304],2.16.840.1.114274.1818.48559242508824544311053...,Open-A1,"[dg.MD1R/24504ade-a4a2-44ca-87e6-77c4ba115452,..."


In [12]:
# Now filter the imaging studies 
covid_studies = studies.loc[(studies['study_uid'].isin(sids))]
print('\n unique imaging studies:{}'.format(len(covid_studies)))
covid_studies.head(5)
list(covid_studies)


 unique imaging studies:22


['days_to_study',
 'study_modality',
 'case_ids',
 'study_uid',
 'project_id',
 'object_id']

In [13]:
# save our result to a csv
filename = "MRI.tsv"
covid_studies.to_csv(filename, sep='\t', index=False)
covid_studies.head(22)
object_ids = list(set([a for b in covid_studies.object_id.tolist() for a in b]))
#object_ids = list(set([a for b in covid_studies.object_id.tolist() for a in b]))
len(object_ids)

134

## Now that we have a list of file object_ids for the desired imaging studies, we can use the Gen3 SDK "drs-pull" commands to access the files themselves.
---
First, we'll create a manifest.json file using a [simple script](https://github.com/cgmeyer/gen3sdk-python/blob/389e3945482439ace6e4536e6d0e35c6e48de9c9/expansion/expansion.py#L2575). Then we'll use the `gen3 drs-pull manifest` command to download the files.

See the detailed documentation to learn more about the Gen3 SDK drs-pull command: https://github.com/uc-cdis/gen3sdk-python/blob/master/docs/howto/drsDownloading.md


In [14]:
# Save the manifest of file object_ids to a JSON file
mani_name = 'MRImanifest20230704.json'
exp.write_manifest(guids=object_ids, filename=mani_name)


	Done (134/134).
	Manifest written to file: MRImanifest20230704.json


'MRImanifest20230704.json'

In [15]:
# To download all files in the manifest, use the "gen3 drs-pull manifest" command
download_dir = "{}/images".format(demo_dir)

if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    
cmd = "gen3 --auth {} --endpoint data.midrc.org drs-pull manifest {} {}".format(cred, mani_name, download_dir)
print(cmd)


## Option 1: To monitor the progress in real-time, you can copy the command generated below
## and run in your terminal instead of from this Jupyter Notebook.

gen3 --auth /Users/heatherwhitney/Documents/MRIbootcamp/credentials.json --endpoint data.midrc.org drs-pull manifest MRImanifest20230704.json /Users/heatherwhitney/Documents/MRIbootcamp/images


In [16]:
## Option 2: Run the manifest download command. 
## Note that this will take some time if the manifest is very large. 
subprocess.run(cmd, shell=True, capture_output=True)

CompletedProcess(args='gen3 --auth /Users/heatherwhitney/Documents/MRIbootcamp/credentials.json --endpoint data.midrc.org drs-pull manifest MRImanifest20230704.json /Users/heatherwhitney/Documents/MRIbootcamp/images', returncode=0, stdout=b'', stderr=b'\rResolving objects:   0%|          | 0/134 [00:00<?, ?it/s]\rResolving objects:   1%|          | 1/134 [00:00<00:39,  3.34it/s]\rResolving objects:   1%|\xe2\x96\x8f         | 2/134 [00:00<00:36,  3.58it/s]\rResolving objects:   2%|\xe2\x96\x8f         | 3/134 [00:00<00:34,  3.79it/s]\rResolving objects:   3%|\xe2\x96\x8e         | 4/134 [00:01<00:35,  3.63it/s]\rResolving objects:   4%|\xe2\x96\x8e         | 5/134 [00:01<00:34,  3.69it/s]\rResolving objects:   4%|\xe2\x96\x8d         | 6/134 [00:01<00:33,  3.83it/s]\rResolving objects:   5%|\xe2\x96\x8c         | 7/134 [00:01<00:32,  3.87it/s]\rResolving objects:   6%|\xe2\x96\x8c         | 8/134 [00:02<00:36,  3.46it/s]\rResolving objects:   7%|\xe2\x96\x8b         | 9/134 [00:02<00:3