# Create JSON data list 

In this notebook, I want to create a JSON datalist that follows the structure of the reference file `example_config_datalist.json`: 

In [102]:
import os
import glob
import json
from pathlib import Path
import pandas as pd
import subprocess as sub 
from subprocess import Popen, PIPE, STDOUT

# Example file: 

In [2]:
# Read the reference datalist; its layout is the template for the datalist
# this notebook builds.
with open('example_config_datalist.json') as json_file:
    raw_text = json_file.read()
    example_list = json.loads(raw_text)

In [3]:
## The datalist built in this notebook needs to mimic the structure of this 'validation' list 
example_list['validation']

[{'label': 'training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_seg.nii.gz',
  'image': ['training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_t1ce.nii.gz',
   'training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_t1.nii.gz',
   'training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_t2.nii.gz',
   'training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_flair.nii.gz']},
 {'label': 'training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_seg.nii.gz',
  'image': ['training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_t1ce.nii.gz',
   'training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_t1.nii.gz',
   'training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_t2.nii.gz',
   'training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_flair.nii.gz']},
 {'label': 'training/HGG/Brats18_2013_10_1/Brats18_2013_10_1_seg.nii.gz',
  'image': ['training/HGG/Brats18_2013_10_1/Brats18_2013_10_1_t1ce.nii.gz',
   'training/HGG/Brats18_2013_10_1/Brats18_2013_10_1_t1.nii.gz',
   'training/HGG/Brats18_2013_10_1/Brats18_2013_10_1_t2.nii.gz',
   'training/HGG/Brats18_2013_10_1/Brats

In [4]:
## It is a list of dictionaries with 'label' and 'image' keys; the 'label' key will be left blank 
## The 'image' key will be a list of strings describing the NIfTI inputs 
## They must be in the following order: 1) t1c 2) t1 3) t2 4) t2flair 

# Look at the niftis that we have: 

In [5]:
# Project root; the dataset glob patterns in this notebook are built on top of it.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
root_dir = Path('/working/lupolab/julia/tcia_analysis/')

In [31]:
# Names of everything directly under the TCGA-LGG-nifti folder: the last
# '/'-separated component of each globbed path.
nifti_entries = glob.glob(str(root_dir)+'/datasets/TCGA-LGG-nifti/*')
lgg_patients = list(map(lambda entry: entry.rsplit('/', 1)[-1], nifti_entries))

In [32]:
# Drop the stray notebook that sits next to the patient folders.
# Filtering instead of list.remove() keeps this cell safe to re-run:
# remove() raises ValueError once the entry is already gone.
lgg_patients = [name for name in lgg_patients if name != 'Register_Niftis.ipynb']

In [33]:
# Number of entries left once the stray notebook has been dropped.
len(lgg_patients)

167

Remove patients with fewer than 4 nifti files from list: 

In [38]:
# Keep only patients that have at least 4 files two levels below their folder
# in the TCGA-LGG-nifti tree.
#
# A new list is built instead of calling lgg_patients.remove() inside a loop
# over lgg_patients: removing from a list while iterating over it makes the
# iterator skip the element that follows each removal, so some patients with
# fewer than 4 files were never checked and stayed in the list.
# The count shown by the next cell may therefore be lower than it was before.
nifti_dir = str(root_dir) + '/datasets/TCGA-LGG-nifti/'
lgg_patients = [
    patient
    for patient in lgg_patients
    if len(glob.glob(nifti_dir + patient + "/*/*")) >= 4
]

In [39]:
# Patient count after the fewer-than-4-files filter.
len(lgg_patients)

161

Check to see if the patients have the images we're looking for: 

In [46]:
## First add them to a dictionary: 

## We need a decent amount of logic to figure out if we have the right files 

In [93]:
def _sort_by_contrast(nifti_paths):
    """Group one patient's NIfTI paths by the contrast their path text suggests.

    Every rule is a case-insensitive substring test on the whole path:
      * 't1c'      - contains 't1c', or contains both 't1' and 'post'
      * 't1'       - contains 'mp', or contains 't1' and is not in the 't1c' group
      * 't2_flair' - contains 'flair' and does not contain 't1'
      * 't2_fse'   - contains 't2' and is not in the 't2_flair' group

    Returns a dict with exactly those four keys, each holding a (possibly
    empty) list of paths. A path can land in more than one group.
    """
    post = [p for p in nifti_paths
            if 't1c' in p.lower() or ('t1' in p.lower() and 'post' in p.lower())]
    pre = [p for p in nifti_paths
           if 'mp' in p.lower() or ('t1' in p.lower() and p not in post)]
    flair = [p for p in nifti_paths
             if 'flair' in p.lower() and 't1' not in p.lower()]
    fse = [p for p in nifti_paths
           if 't2' in p.lower() and p not in flair]
    return {'t1c': post, 't1': pre, 't2_flair': flair, 't2_fse': fse}


# Map each patient to its candidate images per contrast.
# An earlier draft additionally preferred series whose path contains 'ax'
# (axial) whenever one existed; that selection step is currently disabled.
potential_patient_images = {}
for patient in lgg_patients:
    series_pattern = str(root_dir) + '/datasets/TCGA-LGG-nifti/' + patient + "/*/*"
    potential_patient_images[patient] = _sort_by_contrast(glob.glob(series_pattern))
        

## First we actually need to build something that can use dcmdump to figure out whether there has been contrast administered: 

In [136]:
# Probe every T1 series of every patient with dcmdump, looking for DICOM tag
# (0018,0010) in the file 000000.dcm of each series directory.
for i in lgg_patients: 
    # Series directories come from the TCGA-LGG tree here, not from the
    # TCGA-LGG-nifti tree used by the other cells.
    patient_images = glob.glob(str(root_dir)+ '/datasets/TCGA-LGG/'+i+"/*/*")
    # Keep only series whose path mentions 't1' (case-insensitive).
    t1_images = list(filter(lambda x: 't1' in x.lower(), patient_images))
    for j in t1_images: 
        # NOTE(review): os.chdir changes the working directory of the whole
        # kernel and is never restored, so later relative paths resolve
        # against the last series visited. Passing cwd=j to Popen would
        # avoid the side effect.
        os.chdir(j)
        print(j)
        # grep filters dcmdump's listing down to the (0018,0010) line; the
        # pipe is the reason the command is run with shell = True.
        command = "dcmdump 000000.dcm | grep -i '(0018,0010)' "
        result = sub.Popen(command, stdout=sub.PIPE, stderr = sub.PIPE, shell = True)
        out, err = result.communicate()
        out = out.decode('utf-8')
        # Non-empty output means grep matched the tag for this series.
        # NOTE(review): the body of this branch is cut off in this view of
        # the notebook.
        if out: 
            

/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/1201-T1_COR_SE-91770
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/501-T1W__SE-58849
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/1301-T1_SAG_SE-45175
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/1101-T1_AX_SE-54608
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/1001-T1_AX__SE-75237
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-CS-6669/01-02-2002-MRI_BRAIN_COMBO-45633/1101-T1_SAG_SE-52125
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-CS-6669/01-02-2002-MRI_BRAIN_COMBO-45633/901-T1_AX_SE-98053
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-CS-6669/01-02-2002-MRI_BRAI

/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-FG-7637/09-22-2000-NR_MRI_BRAIN_WWO-24828/9-t1tirSAG-00821
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-FG-7637/09-22-2000-NR_MRI_BRAIN_WWO-24828/6-t1tircor-58644
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-7604/10-28-1995-MRI_BRAIN_FOR_STEREOTACTIC_WWO_CONTR-85806/3-SAG_T1-29597
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-7604/10-28-1995-MRI_BRAIN_FOR_STEREOTACTIC_WWO_CONTR-85806/11-COR_T1C-37414
yay!
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-7604/10-28-1995-MRI_BRAIN_FOR_STEREOTACTIC_WWO_CONTR-85806/5-AX_T1-70448
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-7604/10-28-1995-MRI_BRAIN_FOR_STEREOTACTIC_WWO_CONTR-85806/10-SAG_T1C-39537
yay!
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-8104/09-06-1997-MRI_BRAIN_WCONTRAST-02815/4-COR_T1_SE-40006
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-8104/09-06-1997

yay!
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-EZ-7265A/02-16-2002-MRI_BRAIN__W_WO_CONT-21931/2-T1SAG-84487
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-EZ-7265A/02-16-2002-MRI_BRAIN__W_WO_CONT-21931/6-T1SAG-98747
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-EZ-7265A/02-16-2002-MRI_BRAIN__W_WO_CONT-21931/5-T1AXMPRSEL3D-11837
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-EZ-7265A/02-16-2002-MRI_BRAIN__W_WO_CONT-21931/17-T1SAGTSEFCREAD-69280
yay!
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-EZ-7265A/02-16-2002-MRI_BRAIN__W_WO_CONT-21931/19-T1AXMPRSEL3D-76305
yay!
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-EZ-7265A/09-17-2001-MRI_BRAIN__W_WO_CONT-92660/7-T1AXMPRSEL3D-32845
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-EZ-7265A/09-17-2001-MRI_BRAIN__W_WO_CONT-92660/10-T1AXMPRSEL3D-83348
yay!
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-EZ-7265A/09-17-2001-MRI_BRAIN__W_

/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-DU-A6S3/07-11-1998-MRI_BRAIN_WWO_CONTRAST-51715/801-AX_T1WIRTSE-45170
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-DU-A6S3/07-11-1998-MRI_BRAIN_WWO_CONTRAST-51715/1302-POST_AX_T1_BRAIN_LAB_1MM-00396
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-A5R7/08-27-1999-MRI_BRAIN_FOR_STEREOTACTIC_WWO_CONTR-85543/7-Ax_T1_MP_SPGR-34231
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-A5R7/08-27-1999-MRI_BRAIN_FOR_STEREOTACTIC_WWO_CONTR-85543/11-C_Ax_T1_MP_SPGR-45507
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-A5R7/08-27-1999-MRI_BRAIN_FOR_STEREOTACTIC_WWO_CONTR-85543/9-C_SAG_T1_SE-69743
yay!
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-A5R7/08-27-1999-MRI_BRAIN_FOR_STEREOTACTIC_WWO_CONTR-85543/10-C_COR_T1_SE-51918
yay!
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-HT-8013/09-01-1992-MRI_BRAIN_FOR_STEREOTACTI-42573/5-COR_T1_SE_F__C-13129
yay!
/

KeyboardInterrupt: 

In [127]:
# One hand-picked series directory, used to try the dcmdump command on a single case.
j = "/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/12-COR__T1_POST_GD_FLAIR-32689"
# Print DICOM tag (0018,0010) of 000000.dcm: grep filters dcmdump's listing down to that tag.
command = "dcmdump 000000.dcm | grep -i '(0018,0010)' "
# Kept as one string rather than split into an argument list: the pipe needs a
# shell, and the Popen call that runs this command uses shell = True.
command

"dcmdump 000000.dcm | grep -i '(0018,0010)' "

In [128]:
# Run the pipeline inside the series directory held in `j`: the command names
# 000000.dcm relatively, so it must execute in that directory. Passing cwd
# explicitly removes the dependence on whatever directory an earlier
# os.chdir() call happened to leave the kernel in.
result = sub.Popen(command, stdout=sub.PIPE, stderr = sub.PIPE, shell = True, cwd = j)

In [129]:
# Show the Popen handle; its output is read separately with communicate().
result

<subprocess.Popen at 0x7fe19c5613c8>

In [130]:
# Wait for the command to finish and collect its stdout and stderr as bytes.
out, err = result.communicate()

In [131]:
# Raw (undecoded) stdout of the dcmdump | grep pipeline.
out

b'(0018,0010) LO [20ML  MULTIHANCE]                       #  16, 1 ContrastBolusAgent\n'

In [94]:
# Inspect the per-patient grouping (keys: 't1c', 't1', 't2_flair', 't2_fse').
potential_patient_images

{'TCGA-CS-5396': {'t1c': [],
  't1': ['/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/1101-T1_AX_SE-54608.nii.gz',
   '/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/501-T1W__SE-58849.nii.gz',
   '/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/1201-T1_COR_SE-91770.nii.gz',
   '/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/1301-T1_SAG_SE-45175.nii.gz',
   '/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/1001-T1_AX__SE-75237.nii.gz'],
  't2_flair': ['/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/601-T2_AX_FLAIR-98890.nii.gz'],
  't2_fse': ['/working/lupolab

In [95]:
## Iterate over patients in the dictionary and check if they have empty entries 

In [87]:
# Will hold (patient, image key) pairs whose image list is empty.
keys_with_zero_entries = []

In [88]:
# Record every (patient, image key) pair whose list of candidate images is
# empty. Nothing is removed from potential_patient_images here; the pairs are
# only collected so the affected patients can be identified afterwards.
#
# The list is rebuilt from scratch on every run. Appending to a list created
# in a different cell made this cell non-idempotent: running it twice
# recorded every pair twice.
keys_with_zero_entries = [
    (patient, image_key)
    for patient, images_by_key in potential_patient_images.items()
    for image_key, image_paths in images_by_key.items()
    if len(image_paths) == 0
]

In [89]:
# (patient, image key) pairs that have no candidate image.
keys_with_zero_entries

[('TCGA-CS-5396', 't1c'),
 ('TCGA-CS-6669', 't1c'),
 ('TCGA-HT-7480', 't1c'),
 ('TCGA-HT-7480', 't2_flair'),
 ('TCGA-FG-A6J1', 't1c'),
 ('TCGA-HT-8010', 't1c'),
 ('TCGA-HT-8010', 't2_flair'),
 ('TCGA-DU-6404', 't2_fse'),
 ('TCGA-HT-A4DV', 't1c'),
 ('TCGA-HT-A4DV', 't2_flair'),
 ('TCGA-HT-A4DV', 't2_fse'),
 ('TCGA-CS-6670', 't1c'),
 ('TCGA-HT-7477', 't1c'),
 ('TCGA-FG-A4MT', 't1c'),
 ('TCGA-FG-7637', 't1c'),
 ('TCGA-HT-8104', 't1c'),
 ('TCGA-HT-8104', 't2_flair'),
 ('TCGA-HT-8104', 't2_fse'),
 ('TCGA-HT-8109', 't1c'),
 ('TCGA-HT-7603', 't1c'),
 ('TCGA-HT-7603', 't2_flair'),
 ('TCGA-HT-7609', 't1c'),
 ('TCGA-HT-7609', 't2_flair'),
 ('TCGA-HT-7609', 't2_fse'),
 ('TCGA-FG-5964', 't1c'),
 ('TCGA-HT-A5RA', 't1c'),
 ('TCGA-HT-A5RA', 't2_flair'),
 ('TCGA-HT-A5RA', 't2_fse'),
 ('TCGA-HT-7470', 't1c'),
 ('TCGA-HT-7470', 't2_flair'),
 ('TCGA-FG-6688', 't1c'),
 ('TCGA-HT-7857', 't1c'),
 ('TCGA-HT-7857', 't2_flair'),
 ('TCGA-HT-7857', 't2_fse'),
 ('TCGA-HT-7881', 't1c'),
 ('TCGA-HT-8110', 't1c'),
 

In [90]:
# Distinct patient IDs that miss at least one image type, in order of first
# appearance in keys_with_zero_entries.
flagged_ids = [patient_id for patient_id, _image_key in keys_with_zero_entries]
patients_to_eliminate = pd.Series(flagged_ids).unique()

In [91]:
# How many patients lack at least one of the four image types.
len(patients_to_eliminate)

95

In [96]:
# IDs of the patients flagged for removal.
patients_to_eliminate

array(['TCGA-CS-5396', 'TCGA-CS-6669', 'TCGA-HT-7480', 'TCGA-FG-A6J1',
       'TCGA-HT-8010', 'TCGA-DU-6404', 'TCGA-HT-A4DV', 'TCGA-CS-6670',
       'TCGA-HT-7477', 'TCGA-FG-A4MT', 'TCGA-FG-7637', 'TCGA-HT-8104',
       'TCGA-HT-8109', 'TCGA-HT-7603', 'TCGA-HT-7609', 'TCGA-FG-5964',
       'TCGA-HT-A5RA', 'TCGA-HT-7470', 'TCGA-FG-6688', 'TCGA-HT-7857',
       'TCGA-HT-7881', 'TCGA-HT-8110', 'TCGA-HT-7689', 'TCGA-HT-7610',
       'TCGA-HT-A617', 'TCGA-FG-6691', 'TCGA-FG-8186', 'TCGA-HT-7902',
       'TCGA-DU-7298', 'TCGA-HT-A5R7', 'TCGA-HT-8013', 'TCGA-HT-7687',
       'TCGA-HT-A619', 'TCGA-HT-7467', 'TCGA-HT-7483', 'TCGA-HT-8107',
       'TCGA-HT-A61A', 'TCGA-HT-7607', 'TCGA-FG-7643', 'TCGA-FG-7634',
       'TCGA-DU-6395', 'TCGA-HT-7474', 'TCGA-HT-7854', 'TCGA-HT-7479',
       'TCGA-HT-A5RB', 'TCGA-HT-7693', 'TCGA-HT-7677', 'TCGA-DU-8158',
       'TCGA-DU-6400', 'TCGA-HT-A614', 'TCGA-CS-6188', 'TCGA-HT-7875',
       'TCGA-DU-7008', 'TCGA-DU-6405', 'TCGA-DU-7014', 'TCGA-FG-A87N',
      