## Imports

In [33]:
import os
import re
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
from pydicom import dcmread

## Read Data

In [5]:
# Load the three semicolon-separated metadata tables (boxes, file paths, labels)
train_boxes, files_path_train, labels = (
    pd.read_csv(csv_name, sep=';')
    for csv_name in ('boxes.csv', 'paths.csv', 'labels.csv')
)

In [3]:
# Restrict all three metadata tables to the hand-picked patient IDs below
desired_images = [
    'DBT-P00013', 'DBT-P00024', 'DBT-P00060', 'DBT-P00225',
    'DBT-P00107', 'DBT-P00194', 'DBT-P00303', 'DBT-P00538',
    'DBT-P00029', 'DBT-P00032', 'DBT-P00045', 'DBT-P00052',
    'DBT-P00023', 'DBT-P00161', 'DBT-P00183', 'DBT-P00259',
]
train_boxes = train_boxes.loc[train_boxes['PatientID'].isin(desired_images)]
files_path_train = files_path_train.loc[files_path_train['PatientID'].isin(desired_images)]
labels = labels.loc[labels['PatientID'].isin(desired_images)]

In [4]:
# Inspect the bounding boxes that remain after the patient filter
train_boxes

Unnamed: 0,PatientID,StudyUID,View,Subject,Slice,X,Y,Width,Height,Class,AD,VolumeSlices
0,DBT-P00013,DBT-S00163,rmlo,0,16,1116,1724,218,105,benign,0,49
1,DBT-P00024,DBT-S03255,lcc,0,19,267,488,64,64,benign,0,73
2,DBT-P00024,DBT-S03255,lcc,0,37,169,384,93,85,benign,0,73
3,DBT-P00024,DBT-S03255,lmlo,0,11,471,1060,67,56,benign,0,81
4,DBT-P00024,DBT-S03255,lmlo,0,12,640,673,93,80,benign,0,81
5,DBT-P00060,DBT-S00787,rcc,0,21,1276,672,228,219,benign,1,65
6,DBT-P00107,DBT-S05365,lcc,0,34,0,1148,205,449,cancer,0,66
7,DBT-P00107,DBT-S05365,lmlo,0,45,13,729,629,665,cancer,0,73
8,DBT-P00194,DBT-S00645,rmlo,0,35,1347,1730,268,255,cancer,0,80
9,DBT-P00225,DBT-S02346,lcc,0,54,266,1180,323,254,benign,1,86


## Create Masks For Malignant Images

In [5]:
# Create the output folder for malignant image/mask pairs.
# exist_ok=True lets the cell be re-run without raising FileExistsError.
MALIGN_FOLDER='malign_images_masks'
os.makedirs(MALIGN_FOLDER, exist_ok=True)

In [6]:
# Keep only the boxes whose class is 'cancer', working on an independent copy
is_cancer = train_boxes['Class'].eq('cancer')
malign_train_boxes = train_boxes.copy(deep=True).loc[is_cancer]

In [12]:
# Keep only the hand-picked cancer boxes, addressed by position within the
# cancer-only rows. Selecting from `train_boxes` (instead of re-slicing
# `malign_train_boxes` in place) keeps this cell idempotent: re-running it
# yields the same rows rather than indexing into an already-reduced frame.
# NOTE(review): positions 1, 2, 5, 9 require at least 10 cancer rows after the
# patient filter - verify against the current `train_boxes`.
malign_train_boxes = train_boxes[train_boxes['Class'] == 'cancer'].iloc[[1, 2, 5, 9]]
malign_train_boxes

In [17]:
# Build an image/mask pair for every selected malignant bounding box.
# For each row: locate the slice image in ../<PatientID>/, then write
# image.png (the slice) and mask.png (filled rectangle over the box) into
# <MALIGN_FOLDER>/<PatientID>-<View>/.
for _, elem in malign_train_boxes.iterrows():

    patient_dir = os.path.join('..', str(elem['PatientID']))
    slice_no = int(elem['Slice'])

    # Find the slice image. A plain substring test is ambiguous ('1' also
    # matches '10', '16', ...) and os.listdir order is arbitrary, so prefer a
    # file whose name contains the slice number as a whole digit run and fall
    # back to the substring candidates, always in sorted (deterministic) order.
    # NOTE(review): the file naming scheme is not visible here - confirm that
    # the slice number is what the digits in the file name encode.
    candidates = sorted(i for i in os.listdir(patient_dir) if str(elem['Slice']) in i)
    exact = [
        i for i in candidates
        if slice_no in (int(tok) for tok in re.findall(r'\d+', os.path.splitext(i)[0]))
    ]
    matches = exact or candidates
    if not matches:
        raise FileNotFoundError(f'No image for slice {slice_no} in {patient_dir}')
    c_image = matches[0]

    # cv2.imread returns None (no exception) when the file cannot be decoded
    current = cv2.imread(os.path.join(patient_dir, c_image))
    if current is None:
        raise ValueError(f'Could not read image {os.path.join(patient_dir, c_image)}')

    # Create folder for the current mask (re-runnable).
    # NOTE(review): two boxes with the same PatientID and View share this
    # folder, so the later one overwrites image.png / mask.png.
    c_path = os.path.join(MALIGN_FOLDER, str(elem['PatientID']) + '-' + elem['View'])
    os.makedirs(c_path, exist_ok=True)

    # Create mask: single-channel 8-bit image, 255 inside the box, 0 elsewhere
    x, y = int(elem['X']), int(elem['Y'])
    box_w, box_h = int(elem['Width']), int(elem['Height'])
    mask = np.zeros(current.shape[:2], dtype=np.uint8)
    segmented_mask = cv2.rectangle(mask, (x, y), (x + box_w, y + box_h), 255, -1)

    # Save image and mask side by side
    cv2.imwrite(os.path.join(c_path, "image.png"), current)
    cv2.imwrite(os.path.join(c_path, "mask.png"), segmented_mask)

## Create Masks For Benign Images

In [18]:
# Create the output folder for benign image/mask pairs.
# exist_ok=True lets the cell be re-run without raising FileExistsError.
BENIGN_FOLDER='benign_images_masks'
os.makedirs(BENIGN_FOLDER, exist_ok=True)

In [19]:
# Keep only the boxes whose class is 'benign', working on an independent copy
is_benign = train_boxes['Class'].eq('benign')
benign_train_boxes = train_boxes.copy(deep=True).loc[is_benign]

In [21]:
# Keep only the hand-picked benign boxes, addressed by position within the
# benign-only rows. Selecting from `train_boxes` (instead of re-slicing
# `benign_train_boxes` in place) keeps this cell idempotent: re-running it
# yields the same rows rather than indexing into an already-reduced frame.
benign_train_boxes = train_boxes[train_boxes['Class'] == 'benign'].iloc[[0, 3, 5, 6]]
benign_train_boxes

Unnamed: 0,PatientID,StudyUID,View,Subject,Slice,X,Y,Width,Height,Class,AD,VolumeSlices
0,DBT-P00013,DBT-S00163,rmlo,0,16,1116,1724,218,105,benign,0,49
3,DBT-P00024,DBT-S03255,lmlo,0,11,471,1060,67,56,benign,0,81
5,DBT-P00060,DBT-S00787,rcc,0,21,1276,672,228,219,benign,1,65
9,DBT-P00225,DBT-S02346,lcc,0,54,266,1180,323,254,benign,1,86


In [22]:
# Build an image/mask pair for every selected benign bounding box.
# For each row: locate the slice image in ../<PatientID>/, then write
# image.png (the slice) and mask.png (filled rectangle over the box) into
# <BENIGN_FOLDER>/<PatientID>-<View>/.
for _, elem in benign_train_boxes.iterrows():

    patient_dir = os.path.join('..', str(elem['PatientID']))
    slice_no = int(elem['Slice'])

    # Find the slice image. A plain substring test is ambiguous ('1' also
    # matches '10', '16', ...) and os.listdir order is arbitrary, so prefer a
    # file whose name contains the slice number as a whole digit run and fall
    # back to the substring candidates, always in sorted (deterministic) order.
    # NOTE(review): the file naming scheme is not visible here - confirm that
    # the slice number is what the digits in the file name encode.
    candidates = sorted(i for i in os.listdir(patient_dir) if str(elem['Slice']) in i)
    exact = [
        i for i in candidates
        if slice_no in (int(tok) for tok in re.findall(r'\d+', os.path.splitext(i)[0]))
    ]
    matches = exact or candidates
    if not matches:
        raise FileNotFoundError(f'No image for slice {slice_no} in {patient_dir}')
    c_image = matches[0]

    # cv2.imread returns None (no exception) when the file cannot be decoded
    current = cv2.imread(os.path.join(patient_dir, c_image))
    if current is None:
        raise ValueError(f'Could not read image {os.path.join(patient_dir, c_image)}')

    # Create folder for the current mask (re-runnable).
    # NOTE(review): two boxes with the same PatientID and View share this
    # folder, so the later one overwrites image.png / mask.png.
    c_path = os.path.join(BENIGN_FOLDER, str(elem['PatientID']) + '-' + elem['View'])
    os.makedirs(c_path, exist_ok=True)

    # Create mask: single-channel 8-bit image, 255 inside the box, 0 elsewhere
    x, y = int(elem['X']), int(elem['Y'])
    box_w, box_h = int(elem['Width']), int(elem['Height'])
    mask = np.zeros(current.shape[:2], dtype=np.uint8)
    segmented_mask = cv2.rectangle(mask, (x, y), (x + box_w, y + box_h), 255, -1)

    # Save image and mask side by side
    cv2.imwrite(os.path.join(c_path, "image.png"), current)
    cv2.imwrite(os.path.join(c_path, "mask.png"), segmented_mask)

## Deal With DICOM Files Directly

In [45]:
# Reload the full (unfiltered) semicolon-separated tables for the DICOM workflow
train_boxes, files_path_train, labels = (
    pd.read_csv(csv_name, sep=';')
    for csv_name in ('boxes.csv', 'paths.csv', 'labels.csv')
)

# The download metadata file uses the default comma separator
meta = pd.read_csv('metadata.csv')

In [4]:
# Class sub-folder names, shared by both output trees
MALIGN_FOLDER='malign_images'          # cancer images
BENIGN_FOLDER='benign_images'          # benign images
ACTIONABLE_FOLDER='actionable_images'  # actionable images
NORMAL_FOLDER='normal_images'          # normal images

# Root folders: images without a bounding box vs. images saved with a mask
MASKLESS_IMAGES = 'maskless_images'
MASK_IMAGES = 'mask_images'

# Create <root>/<class> for every combination. os.makedirs creates the root
# implicitly, and exist_ok=True lets the cell be re-run on an existing tree
# (os.mkdir raised FileExistsError on the second run).
for root_folder in (MASKLESS_IMAGES, MASK_IMAGES):
    for class_folder in (MALIGN_FOLDER, BENIGN_FOLDER, ACTIONABLE_FOLDER, NORMAL_FOLDER):
        os.makedirs(os.path.join(root_folder, class_folder), exist_ok=True)

In [5]:
# Attach bounding boxes to the file paths; the outer join keeps series without
# boxes (their box columns are NaN) as well as boxes without a path
join_keys = ['PatientID', 'StudyUID', 'View']
aux = files_path_train.merge(train_boxes, how='outer', on=join_keys)
aux.head()

Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path,Subject,Slice,X,Y,Width,Height,Class,AD,VolumeSlices
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...,0.0,16.0,1116.0,1724.0,218.0,105.0,benign,0.0,49.0
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,
2,DBT-P00023,DBT-S04378,lmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,
3,DBT-P00023,DBT-S04378,rcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,
4,DBT-P00023,DBT-S04378,rmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,


In [6]:
# Add the per-view label columns, again with an outer join on patient, study
# and view
all_data = aux.merge(labels, how='outer', on=['PatientID', 'StudyUID', 'View'])
all_data.head()

Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path,Subject,Slice,X,Y,Width,Height,Class,AD,VolumeSlices,Normal,Actionable,Benign,Cancer
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...,0.0,16.0,1116.0,1724.0,218.0,105.0,benign,0.0,49.0,0,0,1,0
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,,0,1,0,0
2,DBT-P00023,DBT-S04378,lmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,,0,1,0,0
3,DBT-P00023,DBT-S04378,rcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,,0,1,0,0
4,DBT-P00023,DBT-S04378,rmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,,0,1,0,0


In [61]:
# Create column for series UID: the second-to-last '/'-separated component of
# `classic_path`. The vectorised `.str` accessor propagates missing paths
# (possible after the outer merges) as NaN instead of raising on a float, and
# also yields NaN for a path with fewer than two components.
all_data['Series UID'] = all_data['classic_path'].str.split('/').str[-2]
all_data.head()

Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path,Subject,Slice,X,Y,Width,Height,Class,AD,VolumeSlices,Normal,Actionable,Benign,Cancer,Series UID
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...,0.0,16.0,1116.0,1724.0,218.0,105.0,benign,0.0,49.0,0,0,1,0,1.2.826.0.1.3680043.8.498.97979602815077649368...
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,,0,1,0,0,1.2.826.0.1.3680043.8.498.12136582480949936067...
2,DBT-P00023,DBT-S04378,lmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,,0,1,0,0,1.2.826.0.1.3680043.8.498.90045035130681803298...
3,DBT-P00023,DBT-S04378,rcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,,0,1,0,0,1.2.826.0.1.3680043.8.498.10822555886306795549...
4,DBT-P00023,DBT-S04378,rmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,,,,,0,1,0,0,1.2.826.0.1.3680043.8.498.29938515490857039234...


In [66]:
# Align the patient column name with the download metadata, then outer-join
# that metadata on subject and series UID
all_data = (
    all_data
    .rename(columns={'PatientID': 'Subject ID'})
    .merge(meta, how='outer', on=['Subject ID', 'Series UID'])
)
all_data.head()

Unnamed: 0,Subject ID,StudyUID,View,descriptive_path,classic_path,Subject,Slice,X,Y,Width,...,Study Date,Series Description,Manufacturer,Modality,SOP Class Name,SOP Class UID,Number of Images,File Size,File Location,Download Timestamp
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...,0.0,16.0,1116.0,1724.0,218.0,...,01-01-2000,,HOLOGIC Inc.,MG,Breast Tomosynthesis Image Storage,1.2.840.10008.5.1.4.1.1.13.1.3,1.0,51.67 MB,./Breast-Cancer-Screening-DBT/DBT-P00013/01-01...,2021-12-28T16:22:34.158
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,...,01-01-2000,,HOLOGIC Inc.,MG,Breast Tomosynthesis Image Storage,1.2.840.10008.5.1.4.1.1.13.1.3,1.0,72.68 MB,./Breast-Cancer-Screening-DBT/DBT-P00023/01-01...,2021-12-28T16:22:36.875
2,DBT-P00023,DBT-S04378,lmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,...,01-01-2000,,HOLOGIC Inc.,MG,Breast Tomosynthesis Image Storage,1.2.840.10008.5.1.4.1.1.13.1.3,1.0,109.66 MB,./Breast-Cancer-Screening-DBT/DBT-P00023/01-01...,2021-12-28T16:22:46.611
3,DBT-P00023,DBT-S04378,rcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,...,01-01-2000,,HOLOGIC Inc.,MG,Breast Tomosynthesis Image Storage,1.2.840.10008.5.1.4.1.1.13.1.3,1.0,55.58 MB,./Breast-Cancer-Screening-DBT/DBT-P00023/01-01...,2021-12-28T16:22:35.993
4,DBT-P00023,DBT-S04378,rmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,,,,,,...,01-01-2000,,HOLOGIC Inc.,MG,Breast Tomosynthesis Image Storage,1.2.840.10008.5.1.4.1.1.13.1.3,1.0,101.12 MB,./Breast-Cancer-Screening-DBT/DBT-P00023/01-01...,2021-12-28T16:23:05.802


In [72]:
def _to_uint8(frame, lo, hi):
    """Linearly rescale `frame` from the range [lo, hi] to [0, 255] as uint8.

    A constant input (hi <= lo) maps to an all-zero image.
    """
    if hi <= lo:
        return np.zeros(frame.shape, dtype=np.uint8)
    scaled = (frame.astype(np.float32) - lo) * (255.0 / (hi - lo))
    return np.clip(scaled, 0, 255).astype(np.uint8)


# Export every DICOM volume listed in `all_data` as PNG:
#   - rows with a bounding box -> MASK_IMAGES/<class>/: the boxed slice plus a
#     filled-rectangle mask
#   - rows without a box       -> MASKLESS_IMAGES/<class>/: every slice
for _, elem in all_data.iterrows():

    # The outer merges can leave rows without a downloaded series; there is
    # nothing to read for those (and NaN + '/1-1.dcm' would raise TypeError).
    if pd.isna(elem['File Location']):
        continue

    # Pick the class sub-folder from the label columns. Rows with no positive
    # label (e.g. NaN after the outer merges) are skipped instead of silently
    # falling through to the malign folder.
    if elem['Normal'] == 1:
        class_folder = NORMAL_FOLDER
    elif elem['Actionable'] == 1:
        class_folder = ACTIONABLE_FOLDER
    elif elem['Benign'] == 1:
        class_folder = BENIGN_FOLDER
    elif elem['Cancer'] == 1:
        class_folder = MALIGN_FOLDER
    else:
        continue

    # Rows with a box go to the mask tree, the rest to the maskless tree
    has_box = not pd.isna(elem['X'])
    path = os.path.join(MASK_IMAGES if has_box else MASKLESS_IMAGES, class_folder)

    # Read and prepare image
    ds = dcmread(elem['File Location'] + '/1-1.dcm')
    arr = ds.pixel_array
    if arr.ndim == 2:
        # Treat a single-frame image as a one-slice volume
        arr = arr[np.newaxis]

    # `arr * 255` overflows when the pixel data is an integer dtype (assumed
    # here - TODO confirm the dtype pydicom returns for these files). Rescale
    # with the volume-wide min/max instead so every slice shares one mapping.
    lo, hi = float(arr.min()), float(arr.max())

    if has_box:

        # Read the slice the box refers to.
        # NOTE(review): `Slice - 1` assumes 1-based slice numbers in boxes.csv;
        # if they are 0-based this picks the neighbouring slice (and Slice 0
        # wraps to the last frame) - confirm against the dataset documentation.
        # NOTE(review): box coordinates are applied to the volume as read;
        # confirm no orientation flip is required before drawing.
        c_image = _to_uint8(arr[int(elem['Slice'])-1], lo, hi)
        # NOTE(review): several boxes on the same Subject ID and View share
        # these file names, so later rows overwrite earlier ones.
        image_save_path = os.path.join(path, elem['Subject ID'] + '_' + elem['View'] + '.png')
        cv2.imwrite(image_save_path, c_image)

        # Create mask: single-channel 8-bit image, 255 inside the box
        x, y = int(elem['X']), int(elem['Y'])
        box_w, box_h = int(elem['Width']), int(elem['Height'])
        mask = np.zeros(c_image.shape[:2], dtype=np.uint8)
        segmented_mask = cv2.rectangle(mask, (x, y), (x + box_w, y + box_h), 255, -1)

        # Save mask
        mask_save_path = os.path.join(path, elem['Subject ID'] + '_' + elem['View'] + '_mask.png')
        cv2.imwrite(mask_save_path, segmented_mask)

    else:
        # No box: dump every slice, suffixed with its 0-based index
        for i in range(arr.shape[0]):
            c_image = _to_uint8(arr[i], lo, hi)
            image_save_path = os.path.join(path, elem['Subject ID'] + '_' + elem['View'] + '_' + str(i) + '.png')
            cv2.imwrite(image_save_path, c_image)
