## Imports

In [None]:
import os
import pandas as pd
import cv2
import numpy as np
from pydicom import dcmread
from skimage.exposure import rescale_intensity
import shutil
import random

In [None]:
# Compares the summed pixel values of the first and last image columns and
# reports the side whose edge has the larger sum
# Required to flip images when necessary
def _get_image_laterality(image):
    """Return "R" when the last column sums higher than the first, else "L".

    A tie between the two column sums yields "L".
    """
    first_column_total = np.sum(image[:, 0])
    last_column_total = np.sum(image[:, -1])
    if first_column_total < last_column_total:
        return "R"
    return "L"

# Get image window center
def _get_window_center(ds):
    """Return the Window Center (0028,1050) of ``ds`` as ``np.float32``.

    The value is read from the first item of the Frame VOI LUT Sequence
    (0028,9132), itself taken from the first item of the Shared Functional
    Groups Sequence (5200,9229).
    """
    shared_groups = ds[0x5200, 0x9229]
    frame_voi_lut = shared_groups[0][0x0028, 0x9132]
    center_element = frame_voi_lut[0][0x0028, 0x1050]
    return np.float32(center_element.value)

# Get image window width
def _get_window_width(ds):
    """Return the Window Width (0028,1051) of ``ds`` as ``np.float32``.

    The value is read from the first item of the Frame VOI LUT Sequence
    (0028,9132), itself taken from the first item of the Shared Functional
    Groups Sequence (5200,9229).
    """
    shared_groups = ds[0x5200, 0x9229]
    frame_voi_lut = shared_groups[0][0x0028, 0x9132]
    width_element = frame_voi_lut[0][0x0028, 0x1051]
    return np.float32(width_element.value)

# Save a single-channel image as a 3-channel (grayscale) image file
def saveToRGBGrayScale(image, output_name):
    """Expand ``image`` to three channels and write it to ``output_name``.

    Args:
        image: Single-channel image array accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_GRAY2RGB``.
        output_name: Destination path of the image file.

    Raises:
        OSError: If OpenCV reports that the file could not be written.
    """
    three_channel = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    # cv2.imwrite reports a failed write by returning False instead of
    # raising, so the result is checked to avoid silently missing images.
    if not cv2.imwrite(output_name, three_channel):
        raise OSError(f"Could not write image to {output_name!r}")

def get_views_images(images):
    """Group image names by the view tag contained in their name.

    Returns a tuple of four lists, in the order (lmlo, rmlo, lcc, rcc). A name
    is placed in every list whose tag it contains; names containing none of
    the tags are left out.
    """
    return tuple(
        [name for name in images if tag in name]
        for tag in ("lmlo", "rmlo", "lcc", "rcc")
    )

def get_individuals_per_view_type(lmlos, rmlos, lccs, rccs):
    """Return the distinct "individuals" found in each list of image names.

    An individual is an image name with its last ``_``-separated token
    removed. Each result list keeps the individuals in first-seen order.

    Args:
        lmlos: Image names of the LMLO view.
        rmlos: Image names of the RMLO view.
        lccs: Image names of the LCC view.
        rccs: Image names of the RCC view.

    Returns:
        A tuple of four lists of individuals, in the same order as the
        arguments.
    """

    def _unique_individuals(names):
        # dict.fromkeys drops duplicates in a single pass while keeping the
        # first-seen order, replacing the repeated `not in list` scans.
        return list(dict.fromkeys("_".join(name.split("_")[:-1]) for name in names))

    return (
        _unique_individuals(lmlos),
        _unique_individuals(rmlos),
        _unique_individuals(lccs),
        _unique_individuals(rccs),
    )


def get_individuals(images):
    """Return the distinct leading ``_``-separated tokens of ``images``.

    Args:
        images: Iterable of image names.

    Returns:
        A list with the text before the first ``_`` of every name, without
        duplicates and in first-seen order.
    """
    # dict.fromkeys de-duplicates in one pass and preserves insertion order,
    # avoiding a list scan for every image.
    return list(dict.fromkeys(name.split("_")[0] for name in images))

## Read Metadata

In [None]:
# Load the metadata tables; the first three files are ';'-separated
train_boxes = pd.read_csv('boxes.csv', sep=';')
files_path_train = pd.read_csv('paths.csv', sep=';')
labels = pd.read_csv('labels.csv', sep=';')
meta = pd.read_csv('metadata.csv')

In [None]:
# Folder for cancer images
MALIGN_FOLDER = 'CN_malign_images'

# Folder for benign images
BENIGN_FOLDER = 'CN_benign_images'

# Folder for actionable images
ACTIONABLE_FOLDER = 'CN_actionable_images'

# Folder for normal images
NORMAL_FOLDER = 'CN_normal_images'

# Root folder for maskless images
MASKLESS_IMAGES = 'CN_images'

# Create the root folder and one sub-folder per class.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folders already exist.
os.makedirs(MASKLESS_IMAGES, exist_ok=True)
for class_folder in (MALIGN_FOLDER, BENIGN_FOLDER, ACTIONABLE_FOLDER, NORMAL_FOLDER):
    os.makedirs(os.path.join(MASKLESS_IMAGES, class_folder), exist_ok=True)

In [None]:
# Join the file paths with the box annotations; the outer join keeps rows
# that are present in only one of the two tables
aux = files_path_train.merge(
    train_boxes,
    how='outer',
    on=['PatientID', 'StudyUID', 'View'],
)
aux.head()

In [None]:
# Add the label columns, again keeping unmatched rows from both sides
all_data = aux.merge(
    labels,
    how='outer',
    on=['PatientID', 'StudyUID', 'View'],
)
all_data.head()

In [None]:
# Create column for series UID: the second-to-last component of the
# '/'-separated classic_path. The .str accessor leaves missing paths (which
# the outer merges above can introduce) as NaN instead of raising
# AttributeError on a float.
all_data['Series UID'] = all_data['classic_path'].str.split('/').str[-2]
all_data.head()

In [None]:
# Rename the patient column to the name used by `meta`, then attach the
# metadata rows on (Subject ID, Series UID) with an outer join
all_data = (
    all_data
    .rename(columns={'PatientID': 'Subject ID'})
    .merge(meta, how='outer', on=['Subject ID', 'Series UID'])
)
all_data.head()

## Construct Masks

In [None]:
# Keep only the rows for which we have the DICOM file location
has_location = all_data['File Location'].notna()
all_data = all_data[has_location]
all_data.head()

In [None]:
# Export every slice of each DICOM file as a PNG inside the folder of its
# class (no masks are generated here)
for index, elem in all_data.iterrows():

    # The outer merges used to build all_data can leave rows without label
    # information. Skip them: they would otherwise reach the final `else`
    # below and be stored as malign.
    if elem[['Normal', 'Actionable', 'Benign']].isna().any():
        continue

    # Grabs path where to insert image; rows with none of the three flags
    # set to 1 go to the malign folder
    if elem['Normal'] == 1:
        path = os.path.join(MASKLESS_IMAGES, NORMAL_FOLDER)
    elif elem['Actionable'] == 1:
        path = os.path.join(MASKLESS_IMAGES, ACTIONABLE_FOLDER)
    elif elem['Benign'] == 1:
        path = os.path.join(MASKLESS_IMAGES, BENIGN_FOLDER)
    else:
        path = os.path.join(MASKLESS_IMAGES, MALIGN_FOLDER)

    # Read and prepare image
    ds = dcmread(elem['File Location'] + '/1-1.dcm')
    arr = ds.pixel_array

    # Map the intensity window [center - width/2, center + width/2] onto the
    # full range of the array dtype
    window_center = _get_window_center(ds)
    window_width = _get_window_width(ds)
    low = (2 * window_center - window_width) / 2
    high = (2 * window_center + window_width) / 2
    arr = rescale_intensity(
        arr, in_range=(low, high), out_range="dtype"
    )

    # Part of the file name shared by every slice of this row
    name_prefix = elem['Subject ID'] + '_' + elem['View'] + '_' + str(elem['Series UID'])

    # One PNG per index along the first axis of the pixel array
    # (presumably one per frame of the volume -- TODO confirm)
    for i, c_image in enumerate(arr):
        image_save_path = os.path.join(path, name_prefix + '_' + str(i) + '.png')
        saveToRGBGrayScale(image=c_image, output_name=image_save_path)


# Split Dataset for Training and Test

In [None]:
def extract_given(train_samples, images, individuals, train_folder, test_folder, root_folder):
    """Copy each individual's images into the train and test folders.

    Every individual receives the same training quota,
    ``int(train_samples / len(individuals))``. Its images are shuffled, the
    first ``quota`` are copied to ``train_folder`` and the remaining ones to
    ``test_folder``.

    Args:
        train_samples: Total number of training images to distribute.
        images: File names (relative to ``root_folder``) to choose from.
        individuals: Name prefixes identifying the individuals; an image
            belongs to an individual when its name starts with
            ``individual + "_"``.
        train_folder: Destination folder for training images.
        test_folder: Destination folder for test images.
        root_folder: Folder that currently holds the images.
    """
    if not individuals:
        return

    # The quota is the same for every individual, so compute it once.
    quota = int(train_samples / len(individuals))

    for individual in individuals:
        # Match on the complete "<individual>_" prefix. A plain substring
        # test would also pick up images of another individual whose name
        # merely begins with this one (e.g. "..._1.2" vs "..._1.23"), which
        # could place the same image in both train and test.
        prefix = individual + "_"
        own_images = [name for name in images if name.startswith(prefix)]
        random.shuffle(own_images)

        for name in own_images[:quota]:
            shutil.copy(os.path.join(root_folder, name), os.path.join(train_folder, name))
        for name in own_images[quota:]:
            shutil.copy(os.path.join(root_folder, name), os.path.join(test_folder, name))

In [None]:
def split_dataset(train_percentage, base_path):
    """Split the exported images into ``CN_data/train`` and ``CN_data/test``.

    Images are read from the ``CN_normal_images``, ``CN_malign_images`` and
    ``CN_benign_images`` folders under ``base_path`` (names containing
    ``_mask`` or starting with ``.`` are ignored; actionable images are not
    part of the split). For each class and each view (rcc, lcc, lmlo, rmlo)
    a training quota is derived from ``train_percentage`` and handed to
    ``extract_given``, which copies the files.

    Args:
        train_percentage: Fraction of the samples requested for training
            (for example ``0.7``).
        base_path: Folder containing the three per-class image folders.

    Raises:
        FileExistsError: If ``CN_data`` (or one of its sub-folders) already
            exists. This is kept on purpose: copying into an existing split
            would mix files of two different random splits.

    Notes:
        * Class and view shares are truncated to whole percentages and each
          quota is truncated with ``int``, so the effective training share
          is at most, and usually below, ``train_percentage``.
        * Images whose name contains none of the four view tags count
          towards the totals but are never copied.
        * A class without images is skipped instead of raising
          ``ZeroDivisionError``.
    """
    source_folders = {
        "normal": "CN_normal_images",
        "malign": "CN_malign_images",
        "benign": "CN_benign_images",
    }

    ##########################################
    # GET THE IMAGES INFO
    ##########################################
    images_by_class = {}
    views_by_class = {}
    for class_name, source_name in source_folders.items():
        class_images = [
            im
            for im in os.listdir(os.path.join(base_path, source_name))
            if "_mask" not in im and not im.startswith(".")
        ]
        lmlo, rmlo, lcc, rcc = get_views_images(class_images)
        lmlo_ids, rmlo_ids, lcc_ids, rcc_ids = get_individuals_per_view_type(lmlo, rmlo, lcc, rcc)

        images_by_class[class_name] = class_images
        # Stored in the order in which the views are extracted below.
        views_by_class[class_name] = (
            (rcc, rcc_ids),
            (lcc, lcc_ids),
            (lmlo, lmlo_ids),
            (rmlo, rmlo_ids),
        )

    total_samples = sum(len(class_images) for class_images in images_by_class.values())

    ##########################################
    # CREATES FOLDERS
    ##########################################
    base_folder = "CN_data"
    split_folders = ("train", "test")

    # os.mkdir (not makedirs/exist_ok) so that an existing split is never
    # silently reused.
    os.mkdir(base_folder)
    for split_name in split_folders:
        os.mkdir(os.path.join(base_folder, split_name))
    for split_name in split_folders:
        for class_name in ("malign", "benign", "normal"):
            os.mkdir(os.path.join(base_folder, split_name, class_name))

    ##########################################
    # SPLIT SAMPLES ACCORDING TO RATE
    ##########################################
    train_samples = int(train_percentage * total_samples)

    for class_name in ("benign", "malign", "normal"):
        class_images = images_by_class[class_name]
        if not class_images:
            # Nothing to copy, and the view shares below would divide by zero.
            continue

        # Share of this class among all samples, as a whole percentage.
        class_percent = int((len(class_images) / total_samples) * 100)
        class_train_samples = int(class_percent / 100 * train_samples)

        source_dir = os.path.join(base_path, source_folders[class_name])
        train_dir = os.path.join(base_folder, "train", class_name)
        test_dir = os.path.join(base_folder, "test", class_name)

        for view_images, view_individuals in views_by_class[class_name]:
            # Share of this view inside the class, as a whole percentage.
            view_percentage = int((len(view_images) / len(class_images)) * 100)
            view_train_samples = int(view_percentage / 100 * class_train_samples)
            extract_given(
                view_train_samples,
                view_images,
                view_individuals,
                train_dir,
                test_dir,
                source_dir,
            )

In [None]:
# Request a 70% training share for the exported images
split_dataset(
    train_percentage=0.7,
    base_path="/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images",
)

# Benign Images Analysis

First, let's just get how many benign images we have globally

In [14]:
# List the benign image files, leaving out mask files and hidden files
benign_images = [
    im
    for im in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_benign_images")
    if not ("_mask" in im or im.startswith("."))
]
print(f"We have {len(benign_images)} GT benign images")

We have 449 GT benign images


Now let's see how many images of each view type we have among these benign images. There are four different view types, so they should also be taken into consideration when feeding the classifier

In [15]:
# Group the benign images by view and report the size of each group
lmlo_benign, rmlo_benign, lcc_benign, rcc_benign = get_views_images(benign_images)

print(
    "Benign Images per View Type",
    f"LMLO: {len(lmlo_benign)} Images",
    f"RMLO: {len(rmlo_benign)} Images",
    f"LCC: {len(lcc_benign)} Images",
    f"RCC: {len(rcc_benign)} Images",
    sep="\n",
)

Benign Images per View Type
LMLO: 176 Images
RMLO: 49 Images
LCC: 159 Images
RCC: 65 Images


We also want to see, per view type, how many "individuals" we have, because many of the mammograms are the same image varying only in contrast, detail or colours. We want to make sure the split includes at least one sample of each individual

In [16]:
# Distinct individuals (image name without its last "_" token) per view
(
    lmlo_benign_individuals,
    rmlo_benign_individuals,
    lcc_benign_individuals,
    rcc_benign_individuals,
) = get_individuals_per_view_type(lmlo_benign, rmlo_benign, lcc_benign, rcc_benign)

print(
    "Benign Images Individuals per View Type",
    f"LMLO: {len(lmlo_benign_individuals)} Individuals",
    f"RMLO: {len(rmlo_benign_individuals)} Individuals",
    f"LCC: {len(lcc_benign_individuals)} Individuals",
    f"RCC: {len(rcc_benign_individuals)} Individuals",
    sep="\n",
)

Benign Images Individuals per View Type
LMLO: 2 Individuals
RMLO: 1 Individuals
LCC: 2 Individuals
RCC: 1 Individuals


Okay so we have very few individuals, let's see how many we actually have

In [17]:
# Distinct leading "_"-separated tokens of the benign image names
benign_individuals = get_individuals(benign_images)
print(f"Number of Benign Individuals: {len(benign_individuals)}")

Number of Benign Individuals: 4


# Malign Images Analysis

In [18]:
# List the malign image files, leaving out mask files and hidden files
malign_images = [
    im
    for im in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_malign_images")
    if not ("_mask" in im or im.startswith("."))
]
print(f"We have {len(malign_images)} GT malign images")

We have 616 GT malign images


In [19]:
# Group the malign images by view and report the size of each group
lmlo_malign, rmlo_malign, lcc_malign, rcc_malign = get_views_images(malign_images)

print(
    "Malign Images per View Type",
    f"LMLO: {len(lmlo_malign)} Images",
    f"RMLO: {len(rmlo_malign)} Images",
    f"LCC: {len(lcc_malign)} Images",
    f"RCC: {len(rcc_malign)} Images",
    sep="\n",
)

Malign Images per View Type
LMLO: 142 Images
RMLO: 214 Images
LCC: 136 Images
RCC: 124 Images


In [20]:
# Distinct individuals (image name without its last "_" token) per view
(
    lmlo_malign_individuals,
    rmlo_malign_individuals,
    lcc_malign_individuals,
    rcc_malign_individuals,
) = get_individuals_per_view_type(lmlo_malign, rmlo_malign, lcc_malign, rcc_malign)

print(
    "Malign Images Individuals per View Type",
    f"LMLO: {len(lmlo_malign_individuals)} Individuals",
    f"RMLO: {len(rmlo_malign_individuals)} Individuals",
    f"LCC: {len(lcc_malign_individuals)} Individuals",
    f"RCC: {len(rcc_malign_individuals)} Individuals",
    sep="\n",
)

Malign Images Individuals per View Type
LMLO: 2 Individuals
RMLO: 3 Individuals
LCC: 2 Individuals
RCC: 2 Individuals


In [21]:
# Distinct leading "_"-separated tokens of the malign image names
malign_individuals = get_individuals(malign_images)
print(f"Number of Malign Individuals: {len(malign_individuals)}")

Number of Malign Individuals: 4


# Normal Images Analysis

In [22]:
# List the normal image files, leaving out mask files and hidden files
normal_images = [
    im
    for im in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_normal_images")
    if not ("_mask" in im or im.startswith("."))
]
print(f"We have {len(normal_images)} GT normal images")

We have 978 GT normal images


In [23]:
# Group the normal images by view and report the size of each group
lmlo_normal, rmlo_normal, lcc_normal, rcc_normal = get_views_images(normal_images)

print(
    "Normal Images per View Type",
    f"LMLO: {len(lmlo_normal)} Images",
    f"RMLO: {len(rmlo_normal)} Images",
    f"LCC: {len(lcc_normal)} Images",
    f"RCC: {len(rcc_normal)} Images",
    sep="\n",
)

Normal Images per View Type
LMLO: 256 Images
RMLO: 265 Images
LCC: 230 Images
RCC: 227 Images


In [24]:
# Distinct individuals (image name without its last "_" token) per view
(
    lmlo_normal_individuals,
    rmlo_normal_individuals,
    lcc_normal_individuals,
    rcc_normal_individuals,
) = get_individuals_per_view_type(lmlo_normal, rmlo_normal, lcc_normal, rcc_normal)

print(
    "Normal Images Individuals per View Type",
    f"LMLO: {len(lmlo_normal_individuals)} Individuals",
    f"RMLO: {len(rmlo_normal_individuals)} Individuals",
    f"LCC: {len(lcc_normal_individuals)} Individuals",
    f"RCC: {len(rcc_normal_individuals)} Individuals",
    sep="\n",
)

Normal Images Individuals per View Type
LMLO: 4 Individuals
RMLO: 4 Individuals
LCC: 4 Individuals
RCC: 4 Individuals


In [25]:
# Distinct leading "_"-separated tokens of the normal image names
normal_individuals = get_individuals(normal_images)
print(f"Number of Normal Individuals: {len(normal_individuals)}")

Number of Normal Individuals: 4


# File For Tableau Analysis

In [None]:
# Export to file for analysing in Tableau
with open("images.txt", "a") as file:

    for elem in benign_images:
        file.write(elem + "_benign\n")

    for elem in malign_images:
        file.write(elem + "_malign\n")

    for elem in normal_images:
        file.write(elem + "_normal\n")