# Create JSON data list 

In this notebook, I create a JSON datalist with the same structure as the example file `example_config_datalist.json`:

In [1]:
import os
import glob
import json
from pathlib import Path
import pandas as pd
import subprocess as sub 
from subprocess import Popen, PIPE, STDOUT

# Example file: 

In [2]:
# Read the reference datalist whose layout this notebook reproduces.
with open('example_config_datalist.json') as example_fh:
    example_list = json.loads(example_fh.read())

In [3]:
example_list

{'training': [{'label': 'training/HGG/Brats18_2013_2_1/Brats18_2013_2_1_seg.nii.gz',
   'image': ['training/HGG/Brats18_2013_2_1/Brats18_2013_2_1_t1ce.nii.gz',
    'training/HGG/Brats18_2013_2_1/Brats18_2013_2_1_t1.nii.gz',
    'training/HGG/Brats18_2013_2_1/Brats18_2013_2_1_t2.nii.gz',
    'training/HGG/Brats18_2013_2_1/Brats18_2013_2_1_flair.nii.gz']},
  {'label': 'training/HGG/Brats18_2013_4_1/Brats18_2013_4_1_seg.nii.gz',
   'image': ['training/HGG/Brats18_2013_4_1/Brats18_2013_4_1_t1ce.nii.gz',
    'training/HGG/Brats18_2013_4_1/Brats18_2013_4_1_t1.nii.gz',
    'training/HGG/Brats18_2013_4_1/Brats18_2013_4_1_t2.nii.gz',
    'training/HGG/Brats18_2013_4_1/Brats18_2013_4_1_flair.nii.gz']},
  {'label': 'training/HGG/Brats18_2013_7_1/Brats18_2013_7_1_seg.nii.gz',
   'image': ['training/HGG/Brats18_2013_7_1/Brats18_2013_7_1_t1ce.nii.gz',
    'training/HGG/Brats18_2013_7_1/Brats18_2013_7_1_t1.nii.gz',
    'training/HGG/Brats18_2013_7_1/Brats18_2013_7_1_t2.nii.gz',
    'training/HGG/Brat

In [5]:
## Need to mimic the structure of this 'validation' list 
example_list['validation']

[{'label': 'training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_seg.nii.gz',
  'image': ['training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_t1ce.nii.gz',
   'training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_t1.nii.gz',
   'training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_t2.nii.gz',
   'training/HGG/Brats18_2013_3_1/Brats18_2013_3_1_flair.nii.gz']},
 {'label': 'training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_seg.nii.gz',
  'image': ['training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_t1ce.nii.gz',
   'training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_t1.nii.gz',
   'training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_t2.nii.gz',
   'training/HGG/Brats18_2013_5_1/Brats18_2013_5_1_flair.nii.gz']},
 {'label': 'training/HGG/Brats18_2013_10_1/Brats18_2013_10_1_seg.nii.gz',
  'image': ['training/HGG/Brats18_2013_10_1/Brats18_2013_10_1_t1ce.nii.gz',
   'training/HGG/Brats18_2013_10_1/Brats18_2013_10_1_t1.nii.gz',
   'training/HGG/Brats18_2013_10_1/Brats18_2013_10_1_t2.nii.gz',
   'training/HGG/Brats18_2013_10_1/Brats

In [6]:
## It is a list of dictionaries that have 'label' keys and 'image' keys; label key will be left blank 
## Image key will be a list of strings giving the NIfTI input paths
## Must be in the following order: 1) t1c 2) t1 3) t2 4) t2flair 

## Use results from labeling (that we did during registration)

In [7]:
# Load the label table stored in labels_tcia_lgg.csv.
# NOTE(review): later cells read `image`, `ground_truth` and `sd` columns from the
# combined frame, so this file presumably provides them — confirm.
label_df_lgg = pd.read_csv('/working/lupolab/julia/tcia_analysis/labels_tcia_lgg.csv')

In [8]:
# Load the label table stored in labels_tcia_GBM.csv.
# NOTE(review): presumably has the same columns as the LGG table, since the two
# frames are stacked in the next cell — confirm.
label_df_gbm = pd.read_csv('/working/lupolab/julia/tcia_analysis/labels_tcia_GBM.csv')

In [9]:
# Stack the LGG and GBM label tables into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same on older versions.
full_label_df = pd.concat([label_df_lgg, label_df_gbm], ignore_index=True)

In [10]:
full_label_df.image[0]

'/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-CS-5396/03-02-2001-MRI_BRAIN_CONTRAST_MRA_BRAIN-47605/305-RT__COW-39894.nii.gz'

In [11]:
# The patient id is the 8th '/'-separated component of the image path
# (e.g. 'TCGA-CS-5396' in the path displayed above).
full_label_df['patient_id'] = full_label_df['image'].map(lambda image_path: image_path.split('/')[7])

In [12]:
# Distinct patient ids, in order of first appearance.
patient_list = full_label_df['patient_id'].unique()

In [13]:
# Keep the patients that, once series labelled "OTHER" are ignored, still have at
# least four distinct labels.
# NOTE(review): this counts distinct labels; it does not check that they are exactly
# T1C, T1, T2 and T2_FLAIR — confirm the label set cannot contain a fifth value.
is_labelled = full_label_df['ground_truth'] != "OTHER"
usable_patients = [
    patient
    for patient in patient_list
    if len(
        full_label_df.loc[(full_label_df['patient_id'] == patient) & is_labelled]
        .ground_truth.unique()
    ) >= 4
]

In [14]:
len(usable_patients)

214

In [15]:
# Rows of usable patients whose label is not "OTHER"; copied so that columns can be
# added later without touching full_label_df.
belongs_to_usable_patient = full_label_df['patient_id'].isin(usable_patients)
has_known_label = full_label_df['ground_truth'] != "OTHER"
usable_patient_df = full_label_df.loc[belongs_to_usable_patient & has_known_label].copy()

In [26]:
usable_patient_df.index

Int64Index([  39,   41,   43,   44,   46,   47,   48,   52,   53,   54,
            ...
            6204, 6205, 6207, 6208, 6209, 6210, 6212, 6213, 6214, 6215],
           dtype='int64', length=2607)

In [28]:
from collections import Counter

In [33]:
len_counts = Counter()

In [36]:
# Start every row with an empty aligned-image path; the next cell fills in the ones found on disk.
usable_patient_df['aligned_image'] = ''

In [40]:
# Fill `aligned_image` with the path of each series' aligned counterpart when that
# file exists. The counterpart is looked for in the same directory, under the same
# file name with an 'a' appended to the part that precedes the last two
# dot-separated extensions (e.g. 'x.nii.gz' -> 'xa.nii.gz'). Rows without such a
# file keep their current value.
for idx, image_path in usable_patient_df['image'].items():
    dir_name, image_name = image_path.rsplit('/', 1)
    image_name_split = image_name.split('.')
    image_name_split[-3] = image_name_split[-3] + 'a'
    image_name_aligned = '.'.join(image_name_split)
    aligned_path = dir_name + '/' + image_name_aligned
    # A single existence check replaces listing the whole directory for every row.
    if os.path.isfile(aligned_path):
        # Write through the frame itself: the rows yielded by iterrows() may be
        # copies, so assigning to them is not guaranteed to reach the DataFrame.
        usable_patient_df.at[idx, 'aligned_image'] = aligned_path
    
    

In [44]:
usable_patient_df.aligned_image.value_counts().head()

                                                                                                                                                                              43
/working/lupolab/julia/tcia_analysis/datasets/TCGA-GBM-nifti/TCGA-14-1459/03-09-2000-HeadRoutine-71375/7-t1tra_pre-87768a.nii.gz                                               1
/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-HT-8113/08-09-1993-MRI_BRAIN_FOR_STEREOTACTI-59884/3-AX_T1-07039a.nii.gz                                     1
/working/lupolab/julia/tcia_analysis/datasets/TCGA-GBM-nifti/TCGA-14-0783/08-17-1992-BRAIN_W-90840/501-Brain______T2_AX_GRASE__TRA____GraSE_5000______100___-67222a.nii.gz     1
/working/lupolab/julia/tcia_analysis/datasets/TCGA-GBM-nifti/TCGA-06-0133/07-28-2005-41762/6-AXIAL_FLAIR-36485a.nii.gz                                                         1
Name: aligned_image, dtype: int64

In [45]:
len_counts

Counter({3: 2573, 4: 34})

In [46]:
usable_patient_df.head()

Unnamed: 0,ground_truth,image,sd,patient_id,aligned_image
39,T1C,/working/lupolab/julia/tcia_analysis/datasets/...,12-COR__T1_POST_GD_FLAIR-32689,TCGA-DU-5853,/working/lupolab/julia/tcia_analysis/datasets/...
41,T1C,/working/lupolab/julia/tcia_analysis/datasets/...,11-AX_T1_POST_GD_FLAIR-81300,TCGA-DU-5853,/working/lupolab/julia/tcia_analysis/datasets/...
43,T1,/working/lupolab/julia/tcia_analysis/datasets/...,8-AX_T1_pre_gd-14019,TCGA-DU-5853,/working/lupolab/julia/tcia_analysis/datasets/...
44,T1,/working/lupolab/julia/tcia_analysis/datasets/...,9-3D_DCE_T1_MAP-73753,TCGA-DU-5853,/working/lupolab/julia/tcia_analysis/datasets/...
46,T2,/working/lupolab/julia/tcia_analysis/datasets/...,7-AX_T2_FR-FSE_RF2_150-55295,TCGA-DU-5853,/working/lupolab/julia/tcia_analysis/datasets/...


In [17]:
usable_patient_df.image[39]

'/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/12-COR__T1_POST_GD_FLAIR-32689.nii.gz'

In [18]:
'/'+'/'.join(usable_patient_df.image[39].split('/')[6:])

'/TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/12-COR__T1_POST_GD_FLAIR-32689.nii.gz'

In [19]:
root_dir = '/working/lupolab/julia/tcia_analysis'


In [52]:
def _relative_image_path(row):
    """Return the row's aligned path if it has one, else its original path, with the first six '/'-separated components removed."""
    aligned = row['aligned_image']
    # A missing aligned path may be '' or NaN (e.g. after a CSV round trip);
    # both fall back to the original image.
    chosen_path = aligned if isinstance(aligned, str) and aligned else row['image']
    # e.g. '/working/lupolab/julia/tcia_analysis/datasets/TCGA-LGG-nifti/...'
    # becomes 'TCGA-LGG-nifti/...'.
    return '/'.join(chosen_path.split('/')[6:])


def find_patient_images(patient_id, usable_patient_df):
    """Pick one image path per contrast for a single patient.

    Parameters
    ----------
    patient_id : str
        Value matched against the ``patient_id`` column.
    usable_patient_df : pandas.DataFrame
        Frame with ``patient_id``, ``ground_truth``, ``sd``, ``image`` and
        ``aligned_image`` columns.

    Returns
    -------
    dict
        Maps each of 'T1C', 'T1', 'T2', 'T2_FLAIR' that the patient has to a
        relative image path. When several series share a contrast, the last one
        whose ``sd`` contains 'ax' (case-insensitive) is used; if none does, the
        first series is used. The aligned image is preferred over the original
        whenever one is recorded. A contrast with no series is left out of the
        dict, so callers that index the result get a KeyError naming the
        missing contrast.
    """
    images_dict = {}
    patient_images_df = usable_patient_df.loc[usable_patient_df.patient_id == patient_id]
    for x in ['T1C', 'T1', "T2", 'T2_FLAIR']:
        contrast_images_df = patient_images_df.loc[patient_images_df.ground_truth == x]
        if contrast_images_df.empty:
            # Nothing to choose from for this contrast.
            continue
        rows = [row for _, row in contrast_images_df.iterrows()]
        # str() keeps a missing (NaN) description from raising; it simply never matches.
        axial_rows = [row for row in rows if 'ax' in str(row['sd']).lower()]
        chosen_row = axial_rows[-1] if axial_rows else rows[0]
        images_dict[x] = _relative_image_path(chosen_row)

    return images_dict

In [53]:
find_patient_images('TCGA-DU-5853', usable_patient_df)

{'T1C': 'TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/11-AX_T1_POST_GD_FLAIR-81300a.nii.gz',
 'T1': 'TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/8-AX_T1_pre_gd-14019a.nii.gz',
 'T2': 'TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/7-AX_T2_FR-FSE_RF2_150-55295a.nii.gz',
 'T2_FLAIR': 'TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/6-AXIAL_FLAIR-84159a.nii.gz'}

In [54]:
# Build one datalist entry per usable patient: an empty label plus the four image
# paths in the order T1C, T1, T2, T2_FLAIR.
validation_dict_list = []
for patient_id in usable_patient_df.patient_id.unique():
    images_dict = find_patient_images(patient_id, usable_patient_df)
    validation_dict = {
        'label': '',
        'image': [images_dict[contrast] for contrast in ('T1C', 'T1', 'T2', "T2_FLAIR")],
    }
    validation_dict_list.append(validation_dict)

                        

In [55]:
# Wrap the entries under a 'validation' key, mirroring the example datalist.
validation_dict_list_dict = dict(validation=validation_dict_list)

In [56]:
# Serialise the datalist to config_datalist_julia.json in the working directory.
with open('config_datalist_julia.json', 'w') as datalist_file:
    datalist_file.write(json.dumps(validation_dict_list_dict))

In [57]:
validation_dict_list_dict

{'validation': [{'label': '',
   'image': ['TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/11-AX_T1_POST_GD_FLAIR-81300a.nii.gz',
    'TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/8-AX_T1_pre_gd-14019a.nii.gz',
    'TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/7-AX_T2_FR-FSE_RF2_150-55295a.nii.gz',
    'TCGA-LGG-nifti/TCGA-DU-5853/08-23-1995-MRI_BRAIN_WWO_CONTRAST-78436/6-AXIAL_FLAIR-84159a.nii.gz']},
  {'label': '',
   'image': ['TCGA-LGG-nifti/TCGA-CS-6186/06-01-2000-MRI_BRAIN_per_R-97608/1001-T1_SE_POST-94212a.nii.gz',
    'TCGA-LGG-nifti/TCGA-CS-6186/06-01-2000-MRI_BRAIN_per_R-97608/301-T1_SAG_SE-41348a.nii.gz',
    'TCGA-LGG-nifti/TCGA-CS-6186/06-01-2000-MRI_BRAIN_per_R-97608/501-T2-WHOLE_BRAIN-32557a.nii.gz',
    'TCGA-LGG-nifti/TCGA-CS-6186/06-01-2000-MRI_BRAIN_per_R-97608/401-T2_AX_FLAIR-72445a.nii.gz']},
  {'label': '',
   'image': ['TCGA-LGG-nifti/TCGA-FG-A6J1/04-23-2004-NR_MRI_BRAIN_WWO-08141/20-T1fl2dAx__Gd-