In [None]:
%pip install pydicom
%pip install matplotlib
%pip install pandas
%pip install opencv-python

Dataset Description

- Modalities - MR
- Number of Participants - 346
- Number of Studies - 349
- Number of Series - 18,321
- Number of Images - 309,251
- Images Size (GB) - 15.1

Of these, we will only use the `t2_tse_tra` MR image series.

In [3]:
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import re
import pydicom
import numpy as np

# Training split: the image-level table (read below for ProxID, DCMSerDescr,
# DCMSerNum, Dim and ijk) and the findings table.
labels_train = pd.read_csv('./ProstateX-TrainingLesionInformationv2/ProstateX-Images-Train.csv')
findings_train = pd.read_csv('./ProstateX-TrainingLesionInformationv2/ProstateX-Findings-Train.csv')

# Test split: the same two tables.
labels_test = pd.read_csv('./ProstateX-TestLesionInformation/ProstateX-Images-Test.csv')
findings_test = pd.read_csv('./ProstateX-TestLesionInformation/ProstateX-Findings-Test.csv')

In [168]:
# Root directory of the DICOM files; the loader functions look for one
# sub-directory per ProxID underneath it.
images_path = Path('../data/PROSTATEx-v1-doiJNLP/PROSTATEx/')

In [4]:
def load_dicom(label):
    """Load every slice of the DICOM series described by ``label``.

    Parameters
    ----------
    label : mapping or pandas.Series
        One row of the image-label table. The keys ``ProxID``, ``DCMSerNum``,
        ``DCMSerDescr`` and ``Dim`` are read. ``Dim`` is an ``'x'``-separated
        string of four integers.

    Returns
    -------
    numpy.ndarray
        ``uint16`` array of shape ``(num_slices, width, height)``, where
        ``height``, ``width`` and ``num_slices`` are the first three fields of
        ``Dim``. Slices are ordered by sorted file name.

    Raises
    ------
    FileNotFoundError
        If no matching series directory exists under ``images_path``.
    ValueError
        If the number of ``*.dcm`` files differs from ``num_slices``.
    """
    patient_dir = images_path / label['ProxID']

    # Series folders are named '<DCMSerNum>.000000-<description without
    # underscores>-<suffix>'. Built with str.format rather than an f-string
    # that reuses the enclosing quote character, which only parses on
    # Python 3.12+.
    series_pattern = '{}.000000-{}-*'.format(
        label['DCMSerNum'], label['DCMSerDescr'].replace('_', '')
    )

    # Look inside every study directory of the patient, not only whichever
    # one the file system happens to list first.
    series_dir = next(patient_dir.glob('*/' + series_pattern), None)
    if series_dir is None:
        raise FileNotFoundError(
            'No series matching {!r} found under {}'.format(series_pattern, patient_dir)
        )

    height, width, num_slices, _ = map(int, label['Dim'].split('x'))

    slice_files = sorted(series_dir.glob('*.dcm'))
    if len(slice_files) != num_slices:
        # Fail loudly instead of returning uninitialised rows (too few files)
        # or hitting an IndexError mid-way (too many files).
        raise ValueError(
            'Expected {} slices in {} but found {} .dcm files'.format(
                num_slices, series_dir, len(slice_files)
            )
        )

    # NOTE(review): pydicom's pixel_array is (rows, columns), so this layout
    # treats the first Dim field as the column count despite the local name
    # `height` - confirm against the dataset description.
    mri = np.empty((num_slices, width, height), np.uint16)

    for ix, dicom_file in enumerate(slice_files):
        mri[ix] = pydicom.dcmread(dicom_file).pixel_array

    return mri


# Keep only the label rows whose series description is exactly 't2_tse_tra'.
# .copy() makes this an independent frame, so columns added later do not
# touch labels_train.
t2tsetra_labels = labels_train[labels_train['DCMSerDescr'] == 't2_tse_tra'].copy()

In [8]:
import matplotlib.patches as patches

In [5]:
# Lesion voxel position per label row. 'ijk' holds three whitespace-separated
# numbers, interpreted here as (column, row, slice).
resized_lesion_coordinates = (
    t2tsetra_labels['ijk']
    .str.split(expand=True)
    .apply(pd.to_numeric)
)
resized_lesion_coordinates.columns = ['lesion_col', 'lesion_row', 'lesion_slice']

# Image dimensions per label row, parsed from the 'x'-separated 'Dim' string.
# astype(float) already yields numeric columns, so no further conversion is
# needed.
image_dims = t2tsetra_labels['Dim'].str.split('x', expand=True).astype(float)
image_dims.columns = ['height', 'width', 'num_slices', 'depth']

# The loaders allocate each slice as (width, height) and fill it from a DICOM
# pixel array, which is (rows, columns). The number of pixel columns is
# therefore the first Dim field (labelled 'height' above) and the number of
# pixel rows is the second (labelled 'width'). Scale each coordinate by its
# own axis so the result stays correct for non-square images; for square
# images this is identical to the previous behaviour.
source_cols = image_dims['height']
source_rows = image_dims['width']

# 320 is the side length the slices are resized to in the export cell below.
resized_lesion_coordinates['lesion_col'] = (
    resized_lesion_coordinates['lesion_col'] * (320 / source_cols)
)
resized_lesion_coordinates['lesion_row'] = (
    resized_lesion_coordinates['lesion_row'] * (320 / source_rows)
)

In [9]:
import cv2

In [10]:
def load_dicom_slices(label, start_slice, num_slices):
    """Load a window of consecutive slices from the DICOM series of ``label``.

    Parameters
    ----------
    label : mapping or pandas.Series
        One row of the image-label table. The keys ``ProxID``, ``DCMSerNum``,
        ``DCMSerDescr`` and ``Dim`` are read.
    start_slice : int
        Index (in sorted file-name order) of the first slice of the window.
        May be negative or extend past the last slice; see Returns.
    num_slices : int
        Number of slices in the window.

    Returns
    -------
    numpy.ndarray
        ``uint16`` array of shape ``(num_slices, width, height)``, where
        ``height`` and ``width`` are the first two fields of ``Dim``. Entry
        ``k`` holds slice ``start_slice + k``. Positions that fall outside the
        series are left as zeros, so the window stays aligned with
        ``start_slice``.

    Raises
    ------
    FileNotFoundError
        If no matching series directory exists under ``images_path``.
    """
    patient_dir = images_path / label['ProxID']

    # Series folders are named '<DCMSerNum>.000000-<description without
    # underscores>-<suffix>'. Built with str.format rather than an f-string
    # that reuses the enclosing quote character, which only parses on
    # Python 3.12+.
    series_pattern = '{}.000000-{}-*'.format(
        label['DCMSerNum'], label['DCMSerDescr'].replace('_', '')
    )

    # Look inside every study directory of the patient, not only whichever
    # one the file system happens to list first.
    series_dir = next(patient_dir.glob('*/' + series_pattern), None)
    if series_dir is None:
        raise FileNotFoundError(
            'No series matching {!r} found under {}'.format(series_pattern, patient_dir)
        )

    height, width, _, _ = map(int, label['Dim'].split('x'))

    slice_files = sorted(series_dir.glob('*.dcm'))
    first_index = int(start_slice)

    # Zero-initialised so out-of-range positions are blank instead of
    # whatever happened to be in memory.
    mri = np.zeros((num_slices, width, height), np.uint16)

    for ix in range(num_slices):
        file_index = first_index + ix
        # Explicit range check: a plain list slice would treat a negative
        # start as "count from the end" and silently truncate at the far end.
        if 0 <= file_index < len(slice_files):
            mri[ix] = pydicom.dcmread(slice_files[file_index]).pixel_array

    return mri

In [11]:
def draw_bounds(r, c, bounding_box_size, ax=None):
    """Draw a red, unfilled square centred on row ``r`` and column ``c``.

    Parameters
    ----------
    r, c : float
        Centre of the square, given as (row, column).
    bounding_box_size : float
        Side length of the square.
    ax : matplotlib axes, optional
        Axes to draw on. When ``None`` the current axes (``plt.gca()``) are
        used.
    """
    # Compare against None explicitly rather than relying on the truthiness
    # of an arbitrary axes object.
    if ax is None:
        ax = plt.gca()

    half_size = bounding_box_size / 2

    # Rectangle takes its anchor as (x, y), i.e. (column, row).
    bounding_box = patches.Rectangle(
        (c - half_size, r - half_size),
        bounding_box_size,
        bounding_box_size,
        linewidth=1,
        edgecolor='r',
        facecolor='none',
    )
    ax.add_patch(bounding_box)

In [12]:
# One output file name per label row, built from the row's index value
# ('<index>_cropped.npy'). This adds a column to t2tsetra_labels in place.
t2tsetra_labels['lesion_crop_filename'] = [f'{i}_cropped.npy' for i in t2tsetra_labels.index]

In [13]:
def crop_center(img, r, c, size: int):
    """Cut a ``size`` x ``size`` window centred on (``r``, ``c``) from every slice.

    Parameters
    ----------
    img : array-like
        Image stack indexed as ``img[slice, row, column]``.
    r, c : float
        Row and column of the window centre.
    size : int
        Side length of the window.

    Returns
    -------
    The sub-array ``img[:, sr:er, sc:ec]`` where ``sr``/``sc`` are the
    truncated start row/column and ``er``/``ec`` the truncated end
    row/column.

    Raises
    ------
    ValueError
        If the window does not lie completely inside the row/column extent of
        ``img``. ``ValueError`` is a subclass of ``Exception``, so existing
        ``except Exception`` handlers keep working.
    """
    row_start = r - size / 2
    col_start = c - size / 2
    row_stop = row_start + size
    col_stop = col_start + size

    # Test the un-truncated starts: int() rounds towards zero, so a start of
    # e.g. -0.5 would otherwise become 0 and yield a window one pixel short.
    if row_start < 0 or col_start < 0:
        raise ValueError('Lesion crop out of bounds')

    sr, sc, er, ec = map(int, [row_start, col_start, row_stop, col_stop])

    # Check each axis against its own extent. Comparing against
    # max(img.shape) misses overflow on the shorter axis and is wrong
    # whenever the slice count is the largest dimension.
    n_rows, n_cols = img.shape[1], img.shape[2]
    if er > n_rows or ec > n_cols:
        raise ValueError('Lesion crop out of bounds')

    return img[:, sr: er, sc: ec]

In [184]:
cropped_lesion_files_path = Path('cropped_lesions/')
# Opening a file for writing does not create missing directories, so make
# sure the output folder exists on a fresh checkout.
cropped_lesion_files_path.mkdir(parents=True, exist_ok=True)

for ix, label in t2tsetra_labels.iterrows():
    lesion_slice = resized_lesion_coordinates.loc[ix, 'lesion_slice']

    # Five-slice window centred on the lesion slice. The window start can be
    # negative or run past the last slice for lesions near either end of the
    # stack; how that is handled is up to load_dicom_slices.
    mri = load_dicom_slices(label, lesion_slice - 2, 5)

    # Resize every slice to 320 x 320. The target array keeps numpy's default
    # float64 dtype, so the saved files are float64.
    resized_mri = np.empty((5, 320, 320))
    for slice_ix, mri_slice in enumerate(mri):
        # Separate loop names: do not rebind the outer `ix` or shadow the
        # builtin `slice`.
        resized_mri[slice_ix] = cv2.resize(mri_slice, (320, 320))

    # NOTE: despite the '_cropped' file name, the saved array is the full
    # resized five-slice stack; crop_center is not applied here.
    with open(cropped_lesion_files_path / label['lesion_crop_filename'], 'wb') as f:
        np.save(f, resized_mri)

print('Done')

Done


In [185]:
# Write the t2_tse_tra label table, as it stands at this point, to CSV.
# The file is opened in binary mode and the open handle is passed to to_csv.
with open('labeled_cropped_lesions.csv', 'wb+') as f:
    t2tsetra_labels.to_csv(f)

In [14]:
# Training findings table. This reads the same file that findings_train was
# loaded from earlier (the path is written without the leading './').
findings = pd.read_csv('ProstateX-TrainingLesionInformationv2/ProstateX-Findings-Train.csv')

In [190]:
# Keep only the first label row per (ProxID, fid, pos). Duplicate keys on the
# right-hand side would repeat finding rows in the merge below.
# This rebinds t2tsetra_labels; the CSV written in the earlier cell still
# contains the duplicates.
t2tsetra_labels = t2tsetra_labels.drop_duplicates(subset=['ProxID', 'fid', 'pos'], keep='first')

In [194]:
# Left merge on (ProxID, fid, pos): every findings row is kept, and findings
# without a matching t2_tse_tra label row get NaN in the label columns.
joined_labels = findings.merge(t2tsetra_labels, on=['ProxID', 'fid', 'pos'], how='left')

In [196]:
# Write the merged table to CSV. to_csv also writes the DataFrame index as
# the first column by default.
joined_labels.to_csv('final_training_labels.csv')