In [9]:
%pip install pydicom
%pip install matplotlib
%pip install pandas
%pip install opencv-python

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Dataset Description

- Modalities - MR
- Number of Participants - 346
- Number of Studies - 349
- Number of Series - 18,321
- Number of Images - 309,251
- Images Size (GB) - 15.1

Of these, we will use only the T2-weighted transverse (`t2_tse_tra`) MR images.

In [10]:
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import re
import pydicom
import numpy as np

import matplotlib.patches as patches
import cv2

In [11]:
def join_resize_labels(img_labels, findings_labels, target_size=320):
    """Join findings labels with their ``t2_tse_tra`` image labels.

    Keeps only the image-label rows whose ``DCMSerDescr`` equals
    ``'t2_tse_tra'``, rescales each lesion's in-plane coordinates to a
    ``target_size`` x ``target_size`` image, attaches the filename used for the
    lesion crop, and inner-joins the result onto ``findings_labels``.

    Parameters
    ----------
    img_labels : pandas.DataFrame
        Image-level labels. Must contain ``ProxID``, ``fid``, ``pos``,
        ``DCMSerDescr``, ``ijk`` (three whitespace-separated numbers) and
        ``Dim`` (four ``x``-separated numbers).
    findings_labels : pandas.DataFrame
        Finding-level labels. Must contain ``ProxID``, ``fid`` and ``pos``.
    target_size : int, default 320
        Edge length, in pixels, of the square image the lesion coordinates are
        rescaled to.

    Returns
    -------
    pandas.DataFrame
        ``findings_labels`` inner-joined on ``ProxID``/``fid``/``pos`` with the
        filtered image labels plus the added columns ``resized_lesion_col``,
        ``resized_lesion_row``, ``resized_lesion_slice`` and
        ``lesion_crop_filename``.
    """
    t2tsetra_labels = img_labels[img_labels['DCMSerDescr'] == 't2_tse_tra'].copy()

    # 'ijk' holds three whitespace-separated numbers, taken here as
    # (column, row, slice).
    resized_lesion_coordinates = t2tsetra_labels['ijk'].str.split(expand=True).apply(pd.to_numeric)
    resized_lesion_coordinates.columns = ['resized_lesion_col', 'resized_lesion_row', 'resized_lesion_slice']

    # NOTE(review): the first 'Dim' field is treated as the height and the
    # second as the width; this only matters for non-square images -- confirm
    # against the dataset documentation.
    image_dims = t2tsetra_labels['Dim'].str.split('x', expand=True).astype(float)
    image_dims.columns = ['height', 'width', 'num_slices', 'depth']

    # Scale the in-plane coordinates from the native image size to target_size.
    resized_lesion_coordinates['resized_lesion_col'] *= (target_size / image_dims['width'])
    resized_lesion_coordinates['resized_lesion_row'] *= (target_size / image_dims['height'])
    # NOTE(review): this overwrites the slice index parsed from 'ijk' with the
    # constant 2 -- presumably the lesion's position inside a 5-slice crop;
    # confirm against the code that loads the slices.
    resized_lesion_coordinates['resized_lesion_slice'] = 2

    t2tsetra_labels = pd.concat([t2tsetra_labels, resized_lesion_coordinates], axis=1)

    # The filename embeds the row's index label from img_labels next to ProxID.
    t2tsetra_labels['lesion_crop_filename'] = [
        f'{patient}_{i}_cropped.npy'
        for i, patient in zip(t2tsetra_labels.index, t2tsetra_labels['ProxID'])
    ]

    # Keep a single image-label row per finding so the join cannot fan out.
    t2tsetra_labels = t2tsetra_labels.drop_duplicates(subset=['ProxID', 'fid', 'pos'], keep='first')

    joined_labels = findings_labels.merge(t2tsetra_labels, on=['ProxID', 'fid', 'pos'], how='inner')

    return joined_labels

In [12]:
def load_dicom_slices(label, start_slice, num_slices, image_dataset_path):
    """Load a contiguous window of DICOM slices for one labelled series.

    The series folder is located under ``image_dataset_path / label['ProxID']``
    by taking the first sub-folder and, inside it, the first folder matching
    ``'<DCMSerNum>.000000-<DCMSerDescr without underscores>-*'``.

    Parameters
    ----------
    label : mapping
        Row with ``ProxID``, ``DCMSerNum``, ``DCMSerDescr`` and ``Dim``.
    start_slice : int
        Index of the first slice of the window within the sorted ``*.dcm``
        files. May be negative or run past the end of the series.
    num_slices : int
        Number of slices in the returned stack.
    image_dataset_path : pathlib.Path
        Root folder of the image dataset.

    Returns
    -------
    numpy.ndarray
        ``uint16`` array of shape ``(num_slices, width, height)``, where
        ``height`` and ``width`` are the first two fields of ``label['Dim']``.
        Positions of the window that fall outside the series are zero-filled.

    Raises
    ------
    StopIteration
        If the patient sub-folder or the series folder cannot be found.
    """
    file_pattern = f"{label['DCMSerNum']}.000000-{label['DCMSerDescr'].replace('_', '')}-*"
    study_dir = next((image_dataset_path / label['ProxID']).glob('*/'))
    image_path = next(study_dir.glob(file_pattern))

    height, width, _, _ = map(int, label['Dim'].split('x'))

    # Zero-initialised so slots without a matching file hold a defined value
    # instead of whatever happened to be in memory.
    mri = np.zeros((num_slices, width, height), np.uint16)

    # NOTE(review): assumes the lexicographic filename order matches the slice
    # order -- confirm for this dataset.
    slice_files = sorted(image_path.glob('*.dcm'))

    first = int(start_slice)
    for ix in range(num_slices):
        source_ix = first + ix
        # Explicit range test: a negative index must not wrap around to the
        # end of the file list.
        if 0 <= source_ix < len(slice_files):
            mri[ix] = pydicom.dcmread(slice_files[source_ix]).pixel_array

    return mri

def crop_center(img, r, c, size: int):
    """Cut a ``size`` x ``size`` window centred on ``(r, c)`` from every slice.

    Parameters
    ----------
    img : numpy.ndarray
        Stack indexed as ``(slice, row, column)``.
    r, c : float
        Row and column of the window centre.
    size : int
        Edge length of the square window.

    Returns
    -------
    numpy.ndarray
        ``img[:, sr:sr + size, sc:sc + size]`` with ``sr = floor(r - size / 2)``
        and ``sc = floor(c - size / 2)``.

    Raises
    ------
    ValueError
        If the window does not lie entirely inside the row/column extent of
        ``img``.
    """
    # Floor (rather than truncate toward zero) so a slightly negative start is
    # detected as out of bounds instead of silently becoming 0.
    sr = int(np.floor(r - size / 2))
    sc = int(np.floor(c - size / 2))
    er = sr + size
    ec = sc + size

    num_rows, num_cols = img.shape[1], img.shape[2]

    # Check each axis against its own extent; a single shared limit would let
    # an overrun on the shorter axis through and yield a truncated crop.
    if sr < 0 or sc < 0 or er > num_rows or ec > num_cols:
        raise ValueError('Lesion crop out of bounds')

    return img[:, sr: er, sc: ec]

In [13]:
def load_dicom(label, image_dataset_path):
    """Load every slice of the series described by ``label``.

    The slice count is the third field of ``label['Dim']``; the actual file
    lookup and reading is delegated to ``load_dicom_slices`` starting at
    slice 0, so both loaders resolve the series folder the same way.

    Parameters
    ----------
    label : mapping
        Row with ``ProxID``, ``DCMSerNum``, ``DCMSerDescr`` and ``Dim``.
    image_dataset_path : pathlib.Path
        Root folder of the image dataset.

    Returns
    -------
    numpy.ndarray
        ``uint16`` array of shape ``(num_slices, width, height)``.
    """
    _, _, num_slices, _ = map(int, label['Dim'].split('x'))

    return load_dicom_slices(label, 0, num_slices, image_dataset_path)

def draw_bounds(r, c, bounding_box_size, ax=None):
    """Add a red, unfilled square outline centred on ``(r, c)`` to a plot.

    The rectangle is anchored at ``(c - bounding_box_size / 2,
    r - bounding_box_size / 2)`` and is ``bounding_box_size`` wide and high.
    It is added to ``ax`` when one is given, otherwise to the current axes.
    """
    half_size = bounding_box_size / 2
    anchor = (c - half_size, r - half_size)

    outline = patches.Rectangle(
        anchor,
        bounding_box_size,
        bounding_box_size,
        linewidth=1,
        edgecolor='r',
        facecolor='none',
    )

    target_axes = ax if ax else plt.gca()
    target_axes.add_patch(outline)

In [14]:
def save_cropped_lesions(image_dataset_path, labels, dst_path):
    """Save a 5 x 64 x 64 crop around every labelled lesion as a ``.npy`` file.

    For each row of ``labels`` the five slices centred on the lesion's slice
    are loaded, each slice is resized to 320 x 320, and a 64 x 64 window
    centred on (``resized_lesion_row``, ``resized_lesion_col``) is cut out and
    written to ``dst_path / lesion_crop_filename``.

    The lesion's slice index is the third number of the row's ``ijk`` field, so
    the lesion's slice lands at index 2 of the saved stack. Slices of the
    window that fall outside the volume (lesion close to the first or last
    slice) are left as zeros.

    Parameters
    ----------
    image_dataset_path : pathlib.Path
        Root folder of the image dataset.
    labels : pandas.DataFrame
        Rows with ``ijk``, ``Dim``, ``resized_lesion_row``,
        ``resized_lesion_col``, ``lesion_crop_filename`` and the columns
        ``load_dicom_slices`` needs to find the series.
    dst_path : str or pathlib.Path
        Output folder; created if it does not exist.

    Raises
    ------
    ValueError
        If a lesion's five-slice window lies entirely outside its volume.
    """
    resized_size = 320
    crop_size = 64
    window_size = 5
    half_window = window_size // 2

    dst_path = Path(dst_path)
    dst_path.mkdir(parents=True, exist_ok=True)

    for _, label in labels.iterrows():
        lesion_row = label['resized_lesion_row']
        lesion_col = label['resized_lesion_col']

        # 'resized_lesion_slice' is not the lesion's slice in the volume, so
        # the real slice index is taken from the third field of 'ijk'.
        lesion_slice = int(float(str(label['ijk']).split()[2]))
        _, _, total_slices, _ = map(int, label['Dim'].split('x'))

        # Clip the window to the volume and remember where the clipped part
        # belongs inside the five-slice stack.
        window_start = lesion_slice - half_window
        first = max(window_start, 0)
        last = min(window_start + window_size, total_slices)
        if first >= last:
            raise ValueError(
                f"Lesion slice {lesion_slice} is outside the volume for {label['lesion_crop_filename']}"
            )

        mri = load_dicom_slices(label, first, last - first, image_dataset_path)

        resized_mri = np.zeros((window_size, resized_size, resized_size))
        for stack_ix, mri_slice in enumerate(mri, start=first - window_start):
            resized_mri[stack_ix] = cv2.resize(mri_slice, (resized_size, resized_size))

        cropped_lesion = crop_center(resized_mri, lesion_row, lesion_col, crop_size)

        with open(dst_path / label['lesion_crop_filename'], 'wb') as f:
            np.save(f, cropped_lesion)

    print('Done')

In [15]:
# Root folder of the PROSTATEx DICOM images.
image_dataset_path = Path('../data/PROSTATEx-v1-doiJNLP/PROSTATEx/')

# Image-level and finding-level label tables for the training set.
img_labels_train = pd.read_csv('./ProstateX-TrainingLesionInformationv2/ProstateX-Images-Train.csv')
findings_labels_train = pd.read_csv('ProstateX-TrainingLesionInformationv2/ProstateX-Findings-Train.csv')

# Join the two tables on the t2_tse_tra rows and add the rescaled lesion coordinates.
train_labels = join_resize_labels(img_labels_train, findings_labels_train)

In [16]:
# Display the joined training labels for inspection.
train_labels

Unnamed: 0,ProxID,fid,pos,zone,ClinSig,Name,WorldMatrix,ijk,TopLevel,SpacingBetweenSlices,VoxelSpacing,Dim,DCMSerDescr,DCMSerNum,resized_lesion_col,resized_lesion_row,resized_lesion_slice,lesion_crop_filename
0,ProstateX-0000,1,25.7457 31.8707 -38.511,PZ,True,t2_tse_tra0,"0.5,1.00168e-010,0.00377059,-57.9373,-0.000199...",167 224 9,1.0,3.0,"0.5,0.5,3",384x384x19x1,t2_tse_tra,4,139.166667,186.666667,2,ProstateX-0000_7_cropped.npy
1,ProstateX-0001,1,-40.5367071921656 29.320722668457 -16.70766907...,AS,False,t2_tse_tra0,"0.5,1.02552e-010,-4.64581e-017,-119.205,-1.019...",157 186 10,1.0,3.0,"0.5,0.5,3",384x384x19x1,t2_tse_tra,6,130.833333,155.000000,2,ProstateX-0001_19_cropped.npy
2,ProstateX-0002,1,-27.0102 41.5467 -26.0469,PZ,True,t2_tse_tra0,"0.5,1.02552e-010,7.98512e-017,-103.784,-9.9634...",154 217 12,1.0,3.0,"0.5,0.5,3",384x384x19x1,t2_tse_tra,4,128.333333,180.833333,2,ProstateX-0002_41_cropped.npy
3,ProstateX-0002,2,-2.058 38.6752 -34.6104,PZ,False,t2_tse_tra0,"0.5,1.02552e-010,7.98512e-017,-103.784,-9.9634...",203 216 10,1.0,3.0,"0.5,0.5,3",384x384x19x1,t2_tse_tra,4,169.166667,180.000000,2,ProstateX-0002_40_cropped.npy
4,ProstateX-0003,1,22.1495 31.2717 -2.45933,TZ,False,t2_tse_tra0,"0.5,1.02552e-010,3.21757e-017,-98.0005,-8.5615...",240 205 12,1.0,3.0,"0.5,0.5,3",384x384x21x1,t2_tse_tra,3,200.000000,170.833333,2,ProstateX-0003_60_cropped.npy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,ProstateX-0201,1,10.1826 -10.0427 20.9151,AS,True,t2_tse_tra0,"0.56178,-0.0284585,0,-92.6812,0.0284585,0.5617...",191 163 16,,3.0,"0.5625,0.5625,3",320x320x23x1,t2_tse_tra,4,191.000000,163.000000,2,ProstateX-0201_3654_cropped.npy
325,ProstateX-0202,1,-0.02085 -44.5506 16.7349,AS,True,t2_tse_tra0,"0.5625,1.15371e-010,-1.51273e-017,-94.1693,-1....",167 148 10,,3.0,"0.5625,0.5625,3",320x320x21x1,t2_tse_tra,4,167.000000,148.000000,2,ProstateX-0202_3706_cropped.npy
326,ProstateX-0202,2,-12.8649 -21.7307 7.76273,PZ,False,t2_tse_tra0,"0.5625,1.15371e-010,-1.51273e-017,-94.1693,-1....",145 190 9,,3.0,"0.5625,0.5625,3",320x320x21x1,t2_tse_tra,4,145.000000,190.000000,2,ProstateX-0202_3705_cropped.npy
327,ProstateX-0203,1,-10.02681 -18.5905 -75.1691,AS,True,t2_tse_tra0,"0.5625,1.15371e-010,-7.55701e-017,-91.2778,-1....",144 132 11,,3.0,"0.5625,0.5625,3",320x320x19x1,t2_tse_tra,4,144.000000,132.000000,2,ProstateX-0203_3795_cropped.npy


In [18]:
# Image-level and finding-level label tables for the test set.
img_labels_test = pd.read_csv('./ProstateX-TestLesionInformation/ProstateX-Images-Test.csv')
findings_labels_test = pd.read_csv('./ProstateX-TestLesionInformation/ProstateX-Findings-Test.csv')

# Same join and coordinate rescaling as for the training labels.
test_labels = join_resize_labels(img_labels_test, findings_labels_test)

In [20]:
# Stack the training and test labels row-wise into one table (each frame keeps its own index labels).
labels = pd.concat([train_labels, test_labels], axis=0)

In [24]:
# Sanity check: number of repeated crop filenames; any repeat would overwrite an earlier crop on disk.
labels['lesion_crop_filename'].duplicated().sum()

0

In [25]:
# Folder that receives one .npy crop per lesion.
cropped_lesions_save_path = Path('cropped_lesions/')

# Write the combined label table, then the lesion crops it refers to.
labels.to_csv('cropped_lesion_labels.csv')
save_cropped_lesions(image_dataset_path, labels, cropped_lesions_save_path)

Done
