# Imports

In [2]:
import os

# Utils

In [3]:
def get_views_images(images):
    """
    Group image names by mammography view type.

    A name is assigned to a view when the view tag ("lmlo", "rmlo", "lcc" or
    "rcc") occurs anywhere in it as a substring. Matching is case-sensitive,
    and names that contain none of the tags are left out of every group.

    @param images: list of image file names
    @return: tuple of four lists (lmlo, rmlo, lcc, rcc), each keeping the
             order in which the names appear in ``images``
    """
    view_tags = ("lmlo", "rmlo", "lcc", "rcc")
    # One filtered list per tag, built in the same order the tags are returned
    return tuple([name for name in images if tag in name] for tag in view_tags)

In [4]:
def _unique_individuals(image_names):
    """Return the distinct per-view individual ids of ``image_names`` in first-seen order."""
    # The id is the file name with its last "_"-separated token removed.
    # dict.fromkeys deduplicates in linear time while keeping insertion order.
    return list(dict.fromkeys("_".join(name.split("_")[:-1]) for name in image_names))


def get_individuals_per_view_type(lmlos, rmlos, lccs, rccs):
    """
    Collect the distinct individuals present in each view type.

    An individual id is obtained by dropping the last "_"-separated token of
    an image name (a name without any "_" therefore yields the empty string).
    Ids are returned without duplicates, in order of first appearance.

    @param lmlos: list of LMLO image file names
    @param rmlos: list of RMLO image file names
    @param lccs: list of LCC image file names
    @param rccs: list of RCC image file names
    @return: tuple of four lists of individual ids, in the order
             (lmlo, rmlo, lcc, rcc)
    """
    return (
        _unique_individuals(lmlos),
        _unique_individuals(rmlos),
        _unique_individuals(lccs),
        _unique_individuals(rccs),
    )

In [5]:
def get_individuals(images):
    """
    Collect the distinct individuals found in a list of image names.

    The individual id is the part of the file name before the first "_"
    (the whole name when it contains no "_"). Ids are returned without
    duplicates, in order of first appearance.

    @param images: list of image file names
    @return: list of distinct individual ids
    """
    # dict.fromkeys keeps insertion order and avoids the quadratic
    # "not in list" membership checks.
    return list(dict.fromkeys(name.split("_")[0] for name in images))

# Benign Images Analysis

First, let's just get how many benign images we have globally

In [6]:
# Ground-truth benign images: skip mask files and hidden files
benign_images = [
    name
    for name in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_benign_images")
    if "_mask" not in name and not name.startswith(".")
]
print(f"We have {len(benign_images)} GT benign images")

We have 449 GT benign images


Now let's see how many images of each view type we have among these benign images. There are four different view types (LMLO, RMLO, LCC and RCC), so they should be taken into consideration too when feeding the classifier

In [7]:
# Split the benign image names into the four view types
(
    lmlo_benign,
    rmlo_benign,
    lcc_benign,
    rcc_benign,
) = get_views_images(benign_images)

# One print call; sep="\n" puts each entry on its own line
print(
    "Benign Images per View Type",
    f"LMLO: {len(lmlo_benign)} Images",
    f"RMLO: {len(rmlo_benign)} Images",
    f"LCC: {len(lcc_benign)} Images",
    f"RCC: {len(rcc_benign)} Images",
    sep="\n",
)

Benign Images per View Type
LMLO: 176 Images
RMLO: 49 Images
LCC: 159 Images
RCC: 65 Images


We also want to see, per view type, how many "individuals" we have, because a lot of the mammographies are the same image varying only in contrast, detail or colours. We want to make sure the division includes at least one sample of each individual

In [8]:
# Distinct individuals per view type among the benign images
(
    lmlo_benign_individuals,
    rmlo_benign_individuals,
    lcc_benign_individuals,
    rcc_benign_individuals,
) = get_individuals_per_view_type(lmlo_benign, rmlo_benign, lcc_benign, rcc_benign)

# One print call; sep="\n" puts each entry on its own line
print(
    "Benign Images Individuals per View Type",
    f"LMLO: {len(lmlo_benign_individuals)} Individuals",
    f"RMLO: {len(rmlo_benign_individuals)} Individuals",
    f"LCC: {len(lcc_benign_individuals)} Individuals",
    f"RCC: {len(rcc_benign_individuals)} Individuals",
    sep="\n",
)

Benign Images Individuals per View Type
LMLO: 2 Individuals
RMLO: 1 Individuals
LCC: 2 Individuals
RCC: 1 Individuals


Okay, so we have very few individuals per view type. Let's see how many we actually have overall

In [9]:
# Distinct individuals across all benign images, regardless of view type
# (the id is the part of the file name before the first "_")
benign_individuals = get_individuals(benign_images)
print(f"Number of Benign Individuals: {len(benign_individuals)}")

Number of Benign Individuals: 4


# Malign Images Analysis

In [10]:
# Ground-truth malign images: skip mask files and hidden files
malign_images = [
    name
    for name in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_malign_images")
    if "_mask" not in name and not name.startswith(".")
]
print(f"We have {len(malign_images)} GT malign images")

We have 616 GT malign images


In [11]:
# Split the malign image names into the four view types
(
    lmlo_malign,
    rmlo_malign,
    lcc_malign,
    rcc_malign,
) = get_views_images(malign_images)

# One print call; sep="\n" puts each entry on its own line
print(
    "Malign Images per View Type",
    f"LMLO: {len(lmlo_malign)} Images",
    f"RMLO: {len(rmlo_malign)} Images",
    f"LCC: {len(lcc_malign)} Images",
    f"RCC: {len(rcc_malign)} Images",
    sep="\n",
)

Malign Images per View Type
LMLO: 142 Images
RMLO: 214 Images
LCC: 136 Images
RCC: 124 Images


In [12]:
# Distinct individuals per view type among the malign images
(
    lmlo_malign_individuals,
    rmlo_malign_individuals,
    lcc_malign_individuals,
    rcc_malign_individuals,
) = get_individuals_per_view_type(lmlo_malign, rmlo_malign, lcc_malign, rcc_malign)

# One print call; sep="\n" puts each entry on its own line
print(
    "Malign Images Individuals per View Type",
    f"LMLO: {len(lmlo_malign_individuals)} Individuals",
    f"RMLO: {len(rmlo_malign_individuals)} Individuals",
    f"LCC: {len(lcc_malign_individuals)} Individuals",
    f"RCC: {len(rcc_malign_individuals)} Individuals",
    sep="\n",
)

Malign Images Individuals per View Type
LMLO: 2 Individuals
RMLO: 3 Individuals
LCC: 2 Individuals
RCC: 2 Individuals


In [13]:
# Distinct individuals across all malign images, regardless of view type
# (the id is the part of the file name before the first "_")
malign_individuals = get_individuals(malign_images)
print(f"Number of Malign Individuals: {len(malign_individuals)}")

Number of Malign Individuals: 4


# Normal Images Analysis

In [14]:
# Ground-truth normal images: skip mask files and hidden files
normal_images = [
    name
    for name in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_normal_images")
    if "_mask" not in name and not name.startswith(".")
]
print(f"We have {len(normal_images)} GT normal images")

We have 978 GT normal images


In [15]:
# Split the normal image names into the four view types
(
    lmlo_normal,
    rmlo_normal,
    lcc_normal,
    rcc_normal,
) = get_views_images(normal_images)

# One print call; sep="\n" puts each entry on its own line
print(
    "Normal Images per View Type",
    f"LMLO: {len(lmlo_normal)} Images",
    f"RMLO: {len(rmlo_normal)} Images",
    f"LCC: {len(lcc_normal)} Images",
    f"RCC: {len(rcc_normal)} Images",
    sep="\n",
)

Normal Images per View Type
LMLO: 256 Images
RMLO: 265 Images
LCC: 230 Images
RCC: 227 Images


In [16]:
# Distinct individuals per view type among the normal images
(
    lmlo_normal_individuals,
    rmlo_normal_individuals,
    lcc_normal_individuals,
    rcc_normal_individuals,
) = get_individuals_per_view_type(lmlo_normal, rmlo_normal, lcc_normal, rcc_normal)

# One print call; sep="\n" puts each entry on its own line
print(
    "Normal Images Individuals per View Type",
    f"LMLO: {len(lmlo_normal_individuals)} Individuals",
    f"RMLO: {len(rmlo_normal_individuals)} Individuals",
    f"LCC: {len(lcc_normal_individuals)} Individuals",
    f"RCC: {len(rcc_normal_individuals)} Individuals",
    sep="\n",
)

Normal Images Individuals per View Type
LMLO: 4 Individuals
RMLO: 4 Individuals
LCC: 4 Individuals
RCC: 4 Individuals


In [17]:
# Distinct individuals across all normal images, regardless of view type
# (the id is the part of the file name before the first "_")
normal_individuals = get_individuals(normal_images)
print(f"Number of Normal Individuals: {len(normal_individuals)}")

Number of Normal Individuals: 4


# Split Dataset for Training and Test

In [18]:
# Overall sample count: benign + malign + normal ground-truth images
print(f"Total Samples: {sum(len(group) for group in (benign_images, malign_images, normal_images))}")

Total Samples: 2043


In [None]:
def split_dataset(train_percentage, base_path):
    """
    Compute how many samples belong to the training split.

    Counts the images found in the ``CN_normal_images``, ``CN_malign_images``
    and ``CN_benign_images`` sub-folders of ``base_path`` (mask files and
    hidden files are ignored) and applies ``train_percentage`` to the total.

    NOTE(review): assigning individual files to train/test sets is not
    implemented here; only the training sample count is produced.

    @param train_percentage: fraction of all samples to use for training (e.g. 0.7)
    @param base_path: directory that contains the three ``CN_*_images`` folders
    @return: number of training samples, truncated to an integer
    """
    total_samples = 0
    # Same folder order as before, so a missing folder fails at the same point
    for class_dir in ("CN_normal_images", "CN_malign_images", "CN_benign_images"):
        total_samples += sum(
            1
            for im in os.listdir(os.path.join(base_path, class_dir))
            if "_mask" not in im and not im.startswith(".")
        )

    # Previously this value was computed and then discarded
    return int(train_percentage * total_samples)

In [None]:
# split_dataset needs the dataset root that holds the CN_*_images folders;
# calling it without base_path raises a TypeError.
split_dataset(0.7, "/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images")

# File For Tableau Analysis

In [38]:
# Export to file for analysing in Tableau.
# Each line is "<image file name>_<class label>".
# The file is opened with "w" (not "a") so that re-running this cell
# rewrites the export instead of appending duplicate rows.
with open("images.txt", "w") as file:
    file.writelines(elem + "_benign\n" for elem in benign_images)
    file.writelines(elem + "_malign\n" for elem in malign_images)
    file.writelines(elem + "_normal\n" for elem in normal_images)