Code to generate training and validation data for Mask R-CNN from the BraTS 2018 dataset

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import h5py
from sklearn.utils import shuffle
from tqdm import tqdm_notebook as tqdm
import nibabel as nib
import pandas as pd

  from ._conv import register_converters as _register_converters


Set PATHS

In [2]:
# Directories holding the raw volumes, one sub-directory per patient.
HGG_RAW_PATH      = '/media/brats/MyPassport/Avinash/Kaminstas_2018/MICCAI_BraTS_2018_Data_Training/HGG'
LGG_RAW_PATH      = '/media/brats/MyPassport/Avinash/Kaminstas_2018/MICCAI_BraTS_2018_Data_Training/LGG'

# Output directories for the generated per-slice HDF5 files.
H5_PATH_TRAIN     = '/media/brats/MyPassport/Avinash/Kaminstas_2018/MICCAI_BraTS_2018_Data_Training/v_hdf5_data/train_set'
H5_PATH_VALID     = '/media/brats/MyPassport/Avinash/Kaminstas_2018/MICCAI_BraTS_2018_Data_Training/v_hdf5_data/valid_set'

# CSV describing the split; the boolean columns 'Training' and 'Validation'
# select the rows that belong to each split.
csv_path = '/media/brats/MyPassport/Avinash/Kaminstas_2018/Modified_Kamnistas_Model_2018/DataGenerator/train_valid_test_split.csv'
info     = pd.read_csv(csv_path)

# The column at position 1 holds a '/'-separated path per patient.
# `.iloc[:, 1].values` replaces `DataFrame.as_matrix()[:, 1]`, which was removed
# in pandas 1.0.
train_info = info[info['Training']].iloc[:, 1].values
valid_info = info[info['Validation']].iloc[:, 1].values

# Patient ID = last path component; the grade is recognised by the 'HGG' / 'LGG'
# substring anywhere in the path.
train_HGG  = np.array([path.split('/')[-1] for path in train_info if 'HGG' in path])
valid_HGG  = np.array([path.split('/')[-1] for path in valid_info if 'HGG' in path])

train_LGG  = np.array([path.split('/')[-1] for path in train_info if 'LGG' in path])
valid_LGG  = np.array([path.split('/')[-1] for path in valid_info if 'LGG' in path])

# Training IDs first, validation IDs after them, so that position ranges
# [0, n_train) and [n_train, n_train + n_valid) address the two splits.
# NOTE(review): these arrays contain only training and validation patients;
# nothing is stored past position n_train + n_valid.
HGG_IDS    = np.concatenate([train_HGG, valid_HGG])
LGG_IDS    = np.concatenate([train_LGG, valid_LGG])

In [3]:
# Number of patients per tumour grade and split.
no_of_HGG_patients_to_train, no_of_HGG_patients_to_valid = len(train_HGG), len(valid_HGG)
no_of_LGG_patients_to_train, no_of_LGG_patients_to_valid = len(train_LGG), len(valid_LGG)

# Aliases under which the ID arrays are indexed further down.
hgg_id_list, lgg_id_list = HGG_IDS, LGG_IDS

# Total number of IDs held per grade.
hgg_total_number = len(hgg_id_list)
lgg_total_number = len(lgg_id_list)

# Bare trailing expression so the notebook displays the LGG total.
lgg_total_number

63

In [4]:
def scale_every_slice_between_0_to_255(a):
    """Min/max-scale an array to the range [0, 255].

    Parameters
    ----------
    a : numpy.ndarray
        Array to rescale (used here on single 2-D slices). Must be non-empty.

    Returns
    -------
    numpy.ndarray
        Floating-point array of the same shape as ``a`` in which the minimum
        of ``a`` maps to 0 and the maximum maps to 255. A constant array
        (maximum == minimum, e.g. a blank slice) maps to all zeros.
    """
    lowest = np.min(a)
    value_range = np.max(a) - lowest
    # A constant slice has a zero range; dividing by it would fill the result
    # with NaN. Dividing by 1 instead leaves the (all-zero) shifted values as
    # they are and keeps the same floating-point result dtype.
    divisor = value_range if value_range != 0 else 1
    normalized_a = 255 * ((a - lowest) / divisor)
    return normalized_a
    

In [5]:
# Export the HGG training patients to H5_PATH_TRAIN: one HDF5 file per selected
# 2-D slice (taken along the third volume axis), containing
#   'Sequence' - min/max-scaled FLAIR, T2 and T1ce slices stacked on the last axis
#   'label'    - the matching slice of the segmentation volume
#   'mask'     - the matching slice of the mask volume
# Selected slices are every slice with more than 50 labelled voxels, followed by
# a random third of the slices that contain no labelled voxel at all.
# NOTE(review): only three channels are written here and T1 is left out -
# confirm that this is what the consumer of H5_PATH_TRAIN expects for every
# file in that directory.
# NOTE(review): the content written for constant (e.g. blank) slices depends on
# how scale_every_slice_between_0_to_255 handles a zero intensity range.
for i in tqdm(range(no_of_HGG_patients_to_train)):
    patient_id  = hgg_id_list[i]
    patient_dir = HGG_RAW_PATH + "/" + patient_id

    # Rebuilt for every patient so that a missing file raises below instead of
    # silently reusing the path found for the previous patient.
    volume_paths = {}
    for s in os.listdir(patient_dir):
        if "flair" in s:
            volume_paths['flair'] = patient_dir + "/" + s
        if "t2" in s:
            volume_paths['t2'] = patient_dir + "/" + s
        if "t1ce" in s:
            volume_paths['t1ce'] = patient_dir + "/" + s
        if "seg" in s:
            volume_paths['seg'] = patient_dir + "/" + s
        if "mask" in s:
            volume_paths['mask'] = patient_dir + "/" + s

    missing = [key for key in ('flair', 't2', 't1ce', 'seg', 'mask') if key not in volume_paths]
    if missing:
        raise FileNotFoundError('{}: no file found for {}'.format(patient_dir, missing))

    # np.asanyarray(img.dataobj) is nibabel's documented replacement for the
    # deprecated get_data() and returns the same array.
    flair_v = np.asanyarray(nib.load(volume_paths['flair']).dataobj)
    t2_v    = np.asanyarray(nib.load(volume_paths['t2']).dataobj)
    t1c_v   = np.asanyarray(nib.load(volume_paths['t1ce']).dataobj)
    seg_v   = np.asanyarray(nib.load(volume_paths['seg']).dataobj)
    mask_v  = np.asanyarray(nib.load(volume_paths['mask']).dataobj)

    # Number of labelled (non-zero) voxels in each slice along the third axis.
    lesion_voxels = (seg_v != 0).sum(axis=(0, 1))

    # Slices with enough lesion to be useful, in ascending order.
    lesion_slices = np.where(lesion_voxels > 50)[0]

    # Slices without any lesion: shuffle reproducibly and keep a third of them.
    # (Selecting on `seg_v == 0` would match every slice that merely contains
    # some background, i.e. lesion slices as well.)
    slices_without_lesion = np.where(lesion_voxels == 0)[0]
    if len(slices_without_lesion) > 0:
        slices_without_lesion = shuffle(slices_without_lesion, random_state=0)
    slices_without_lesion = slices_without_lesion[:len(slices_without_lesion) // 3]

    # Running file index for this patient; lesion slices are numbered first.
    counter = 0
    for slices in np.concatenate([lesion_slices, slices_without_lesion]):
        fl     = scale_every_slice_between_0_to_255(np.transpose(flair_v[:, :, slices]))
        t2_s   = scale_every_slice_between_0_to_255(np.transpose(t2_v[:, :, slices]))
        t1ce   = scale_every_slice_between_0_to_255(np.transpose(t1c_v[:, :, slices]))

        sege   = np.transpose(seg_v[:, :, slices])
        maske  = np.transpose(mask_v[:, :, slices])

        # Channels on the last axis, stored with the dtype of the FLAIR slice.
        array = np.stack([fl, t2_s, t1ce], axis=-1).astype(fl.dtype, copy=False)

        counter = counter + 1
        name_scheme = patient_id + '_' + str(counter)
        dest_path = os.path.join(H5_PATH_TRAIN, name_scheme + '.hdf5')

        # The context manager closes the file even if a write fails.
        with h5py.File(dest_path, 'w') as hp:
            hp.create_dataset('Sequence', data=array)
            hp.create_dataset('label', data=sege)
            hp.create_dataset('mask', data=maske)

  





In [6]:
# Export the HGG validation patients to H5_PATH_VALID: one HDF5 file per selected
# 2-D slice (taken along the third volume axis), containing
#   'Sequence' - min/max-scaled FLAIR, T2, T1ce and T1 slices stacked on the
#                last axis, stored as float32
#   'label'    - the matching slice of the segmentation volume
#   'mask'     - the matching slice of the mask volume
# Selected slices are every slice with more than 50 labelled voxels, followed by
# a random third of the slices that contain no labelled voxel at all.
# Validation patients sit right after the training patients in hgg_id_list.
# NOTE(review): the content written for constant (e.g. blank) slices depends on
# how scale_every_slice_between_0_to_255 handles a zero intensity range.
for i in tqdm(range(no_of_HGG_patients_to_train, no_of_HGG_patients_to_train + no_of_HGG_patients_to_valid)):
    patient_id  = hgg_id_list[i]
    patient_dir = HGG_RAW_PATH + "/" + patient_id

    # Rebuilt for every patient so that a missing file raises below instead of
    # silently reusing the path found for the previous patient.
    volume_paths = {}
    for s in os.listdir(patient_dir):
        if "flair" in s:
            volume_paths['flair'] = patient_dir + "/" + s
        if "t2" in s:
            volume_paths['t2'] = patient_dir + "/" + s
        if "t1ce" in s:
            volume_paths['t1ce'] = patient_dir + "/" + s
        if "t1" in s and 't1ce' not in s:
            volume_paths['t1'] = patient_dir + "/" + s
        if "seg" in s:
            volume_paths['seg'] = patient_dir + "/" + s
        if "mask" in s:
            volume_paths['mask'] = patient_dir + "/" + s

    missing = [key for key in ('flair', 't2', 't1ce', 't1', 'seg', 'mask') if key not in volume_paths]
    if missing:
        raise FileNotFoundError('{}: no file found for {}'.format(patient_dir, missing))

    # np.asanyarray(img.dataobj) is nibabel's documented replacement for the
    # deprecated get_data() and returns the same array.
    flair_v = np.asanyarray(nib.load(volume_paths['flair']).dataobj)
    t2_v    = np.asanyarray(nib.load(volume_paths['t2']).dataobj)
    t1c_v   = np.asanyarray(nib.load(volume_paths['t1ce']).dataobj)
    t1_v    = np.asanyarray(nib.load(volume_paths['t1']).dataobj)
    seg_v   = np.asanyarray(nib.load(volume_paths['seg']).dataobj)
    mask_v  = np.asanyarray(nib.load(volume_paths['mask']).dataobj)

    # Number of labelled (non-zero) voxels in each slice along the third axis.
    lesion_voxels = (seg_v != 0).sum(axis=(0, 1))

    # Slices with enough lesion to be useful, in ascending order.
    lesion_slices = np.where(lesion_voxels > 50)[0]

    # Slices without any lesion: shuffle reproducibly and keep a third of them.
    # (Selecting on `seg_v == 0` would match every slice that merely contains
    # some background, i.e. lesion slices as well.)
    slices_without_lesion = np.where(lesion_voxels == 0)[0]
    if len(slices_without_lesion) > 0:
        slices_without_lesion = shuffle(slices_without_lesion, random_state=0)
    slices_without_lesion = slices_without_lesion[:len(slices_without_lesion) // 3]

    # Running file index for this patient; lesion slices are numbered first.
    counter = 0
    for slices in np.concatenate([lesion_slices, slices_without_lesion]):
        fl     = scale_every_slice_between_0_to_255(np.transpose(flair_v[:, :, slices]))
        t2_s   = scale_every_slice_between_0_to_255(np.transpose(t2_v[:, :, slices]))
        t1ce   = scale_every_slice_between_0_to_255(np.transpose(t1c_v[:, :, slices]))
        t1_s   = scale_every_slice_between_0_to_255(np.transpose(t1_v[:, :, slices]))

        sege   = np.transpose(seg_v[:, :, slices])
        maske  = np.transpose(mask_v[:, :, slices])

        # Channels on the last axis, stored as float32.
        array = np.stack([fl, t2_s, t1ce, t1_s], axis=-1).astype(np.float32, copy=False)

        counter = counter + 1
        name_scheme = patient_id + '_' + str(counter)
        dest_path = os.path.join(H5_PATH_VALID, name_scheme + '.hdf5')

        # The context manager closes the file even if a write fails.
        with h5py.File(dest_path, 'w') as hp:
            hp.create_dataset('Sequence', data=array)
            hp.create_dataset('label', data=sege)
            hp.create_dataset('mask', data=maske)


  





In [7]:
# Export the LGG training patients to H5_PATH_TRAIN: one HDF5 file per selected
# 2-D slice (taken along the third volume axis), containing
#   'Sequence' - min/max-scaled FLAIR, T2, T1ce and T1 slices stacked on the
#                last axis
#   'label'    - the matching slice of the segmentation volume
#   'mask'     - the matching slice of the mask volume
# Selected slices are every slice with more than 50 labelled voxels, followed by
# a random third of the slices that contain no labelled voxel at all.
# NOTE(review): the content written for constant (e.g. blank) slices depends on
# how scale_every_slice_between_0_to_255 handles a zero intensity range.
for i in tqdm(range(no_of_LGG_patients_to_train)):
    patient_id  = lgg_id_list[i]
    patient_dir = LGG_RAW_PATH + "/" + patient_id

    # Rebuilt for every patient so that a missing file raises below instead of
    # silently reusing the path found for the previous patient.
    volume_paths = {}
    for s in os.listdir(patient_dir):
        if "flair" in s:
            volume_paths['flair'] = patient_dir + "/" + s
        if "t2" in s:
            volume_paths['t2'] = patient_dir + "/" + s
        if "t1ce" in s:
            volume_paths['t1ce'] = patient_dir + "/" + s
        if "t1" in s and "t1ce" not in s:
            volume_paths['t1'] = patient_dir + "/" + s
        if "seg" in s:
            volume_paths['seg'] = patient_dir + "/" + s
        if "mask" in s:
            volume_paths['mask'] = patient_dir + "/" + s

    missing = [key for key in ('flair', 't2', 't1ce', 't1', 'seg', 'mask') if key not in volume_paths]
    if missing:
        raise FileNotFoundError('{}: no file found for {}'.format(patient_dir, missing))

    # np.asanyarray(img.dataobj) is nibabel's documented replacement for the
    # deprecated get_data() and returns the same array.
    flair_v = np.asanyarray(nib.load(volume_paths['flair']).dataobj)
    t2_v    = np.asanyarray(nib.load(volume_paths['t2']).dataobj)
    t1c_v   = np.asanyarray(nib.load(volume_paths['t1ce']).dataobj)
    t1_v    = np.asanyarray(nib.load(volume_paths['t1']).dataobj)
    seg_v   = np.asanyarray(nib.load(volume_paths['seg']).dataobj)
    mask_v  = np.asanyarray(nib.load(volume_paths['mask']).dataobj)

    # Number of labelled (non-zero) voxels in each slice along the third axis.
    lesion_voxels = (seg_v != 0).sum(axis=(0, 1))

    # Slices with enough lesion to be useful, in ascending order.
    lesion_slices = np.where(lesion_voxels > 50)[0]

    # Slices without any lesion: shuffle reproducibly and keep a third of them.
    # (Selecting on `seg_v == 0` would match every slice that merely contains
    # some background, i.e. lesion slices as well.)
    slices_without_lesion = np.where(lesion_voxels == 0)[0]
    if len(slices_without_lesion) > 0:
        slices_without_lesion = shuffle(slices_without_lesion, random_state=0)
    slices_without_lesion = slices_without_lesion[:len(slices_without_lesion) // 3]

    # Running file index for this patient; lesion slices are numbered first.
    counter = 0
    for slices in np.concatenate([lesion_slices, slices_without_lesion]):
        fl     = scale_every_slice_between_0_to_255(np.transpose(flair_v[:, :, slices]))
        t2_s   = scale_every_slice_between_0_to_255(np.transpose(t2_v[:, :, slices]))
        t1ce   = scale_every_slice_between_0_to_255(np.transpose(t1c_v[:, :, slices]))
        t1_s   = scale_every_slice_between_0_to_255(np.transpose(t1_v[:, :, slices]))

        sege   = np.transpose(seg_v[:, :, slices])
        maske  = np.transpose(mask_v[:, :, slices])

        # Channels on the last axis, stored with the dtype of the FLAIR slice.
        array = np.stack([fl, t2_s, t1ce, t1_s], axis=-1).astype(fl.dtype, copy=False)

        counter = counter + 1
        name_scheme = patient_id + '_' + str(counter)
        dest_path = os.path.join(H5_PATH_TRAIN, name_scheme + '.hdf5')

        # The context manager closes the file even if a write fails.
        with h5py.File(dest_path, 'w') as hp:
            hp.create_dataset('Sequence', data=array)
            hp.create_dataset('label', data=sege)
            hp.create_dataset('mask', data=maske)

  





In [None]:
# Export the LGG validation patients to H5_PATH_VALID: one HDF5 file per selected
# 2-D slice (taken along the third volume axis), containing
#   'Sequence' - min/max-scaled FLAIR, T2, T1ce and T1 slices stacked on the
#                last axis
#   'label'    - the matching slice of the segmentation volume
#   'mask'     - the matching slice of the mask volume
# Selected slices are every slice with more than 50 labelled voxels, followed by
# a random third of the slices that contain no labelled voxel at all.
# Validation patients sit right after the training patients in lgg_id_list.
# NOTE(review): the content written for constant (e.g. blank) slices depends on
# how scale_every_slice_between_0_to_255 handles a zero intensity range.
for i in tqdm(range(no_of_LGG_patients_to_train, no_of_LGG_patients_to_train + no_of_LGG_patients_to_valid)):
    patient_id  = lgg_id_list[i]
    patient_dir = LGG_RAW_PATH + "/" + patient_id

    # Rebuilt for every patient so that a missing file raises below instead of
    # silently reusing the path found for the previous patient.
    volume_paths = {}
    for s in os.listdir(patient_dir):
        if "flair" in s:
            volume_paths['flair'] = patient_dir + "/" + s
        if "t2" in s:
            volume_paths['t2'] = patient_dir + "/" + s
        if "t1ce" in s:
            volume_paths['t1ce'] = patient_dir + "/" + s
        if "t1" in s and "t1ce" not in s:
            volume_paths['t1'] = patient_dir + "/" + s
        if "seg" in s:
            volume_paths['seg'] = patient_dir + "/" + s
        if "mask" in s:
            volume_paths['mask'] = patient_dir + "/" + s

    missing = [key for key in ('flair', 't2', 't1ce', 't1', 'seg', 'mask') if key not in volume_paths]
    if missing:
        raise FileNotFoundError('{}: no file found for {}'.format(patient_dir, missing))

    # np.asanyarray(img.dataobj) is nibabel's documented replacement for the
    # deprecated get_data() and returns the same array.
    flair_v = np.asanyarray(nib.load(volume_paths['flair']).dataobj)
    t2_v    = np.asanyarray(nib.load(volume_paths['t2']).dataobj)
    t1c_v   = np.asanyarray(nib.load(volume_paths['t1ce']).dataobj)
    t1_v    = np.asanyarray(nib.load(volume_paths['t1']).dataobj)
    seg_v   = np.asanyarray(nib.load(volume_paths['seg']).dataobj)
    mask_v  = np.asanyarray(nib.load(volume_paths['mask']).dataobj)

    # Number of labelled (non-zero) voxels in each slice along the third axis.
    lesion_voxels = (seg_v != 0).sum(axis=(0, 1))

    # Slices with enough lesion to be useful, in ascending order.
    lesion_slices = np.where(lesion_voxels > 50)[0]

    # Slices without any lesion: shuffle reproducibly and keep a third of them.
    # (Selecting on `seg_v == 0` would match every slice that merely contains
    # some background, i.e. lesion slices as well.)
    slices_without_lesion = np.where(lesion_voxels == 0)[0]
    if len(slices_without_lesion) > 0:
        slices_without_lesion = shuffle(slices_without_lesion, random_state=0)
    slices_without_lesion = slices_without_lesion[:len(slices_without_lesion) // 3]

    # Running file index for this patient; lesion slices are numbered first.
    counter = 0
    for slices in np.concatenate([lesion_slices, slices_without_lesion]):
        fl     = scale_every_slice_between_0_to_255(np.transpose(flair_v[:, :, slices]))
        t2_s   = scale_every_slice_between_0_to_255(np.transpose(t2_v[:, :, slices]))
        t1ce   = scale_every_slice_between_0_to_255(np.transpose(t1c_v[:, :, slices]))
        t1_s   = scale_every_slice_between_0_to_255(np.transpose(t1_v[:, :, slices]))

        sege   = np.transpose(seg_v[:, :, slices])
        maske  = np.transpose(mask_v[:, :, slices])

        # Channels on the last axis, stored with the dtype of the FLAIR slice.
        array = np.stack([fl, t2_s, t1ce, t1_s], axis=-1).astype(fl.dtype, copy=False)

        counter = counter + 1
        name_scheme = patient_id + '_' + str(counter)
        dest_path = os.path.join(H5_PATH_VALID, name_scheme + '.hdf5')

        # The context manager closes the file even if a write fails.
        with h5py.File(dest_path, 'w') as hp:
            hp.create_dataset('Sequence', data=array)
            hp.create_dataset('label', data=sege)
            hp.create_dataset('mask', data=maske)

  


In [None]:
# Collect the IDs stored past the training + validation positions of each ID
# list (LGG first, then HGG) and save them to a CSV in the working directory.
# NOTE(review): if the ID lists hold only training and validation patients,
# both ranges below are empty and the CSV will contain just the header -
# confirm where the test patients are supposed to come from.
first_unused_lgg = no_of_LGG_patients_to_train + no_of_LGG_patients_to_valid
first_unused_hgg = no_of_HGG_patients_to_train + no_of_HGG_patients_to_valid

patiend_id = [lgg_id_list[pos] for pos in tqdm(range(first_unused_lgg, lgg_total_number))]
patiend_id.extend(hgg_id_list[pos] for pos in tqdm(range(first_unused_hgg, hgg_total_number)))

# Single-column frame; the default integer index is written as the first CSV column.
df = pd.DataFrame()
df['patient_id'] = patiend_id
df.to_csv('Testing_patient_slices_test.csv')