# Imports

In [1]:
import os
import shutil
import random

# Utils

In [2]:
def get_views_images(images):
    """
    Group image file names by the view tag they contain.

    A name lands in a group when the group's lowercase tag occurs anywhere
    in it; names containing none of the tags are left out, and a name
    containing several tags is placed in every matching group.

    @param images: list of image file names
    @return: four lists of names, in the order (lmlo, rmlo, lcc, rcc)
    """
    view_tags = ("lmlo", "rmlo", "lcc", "rcc")
    return tuple([name for name in images if tag in name] for tag in view_tags)

In [3]:
def _unique_individual_keys(names):
    """Return the distinct keys of `names` (each name minus its last "_" token), in first-seen order."""
    # dict.fromkeys keeps insertion order and deduplicates in O(n), unlike
    # repeated `in` checks against a growing list.
    return list(dict.fromkeys("_".join(name.split("_")[:-1]) for name in names))


def get_individuals_per_view_type(lmlos, rmlos, lccs, rccs):
    """
    List the distinct individuals found in each view's image names.

    The key of an individual is the image name with its last
    underscore-separated token removed (e.g. "7_lmlo_2.png" -> "7_lmlo").
    A name without any underscore yields the empty key "".

    @param lmlos: image names of the LMLO view
    @param rmlos: image names of the RMLO view
    @param lccs: image names of the LCC view
    @param rccs: image names of the RCC view
    @return: four lists of unique keys, in first-seen order, for
             (lmlo, rmlo, lcc, rcc) respectively
    """
    return (
        _unique_individual_keys(lmlos),
        _unique_individual_keys(rmlos),
        _unique_individual_keys(lccs),
        _unique_individual_keys(rccs),
    )

In [4]:
def get_individuals(images):
    """
    List the distinct individuals present in a collection of image names.

    The individual is identified by the part of the file name that precedes
    the first underscore (the whole name when it has no underscore).

    @param images: iterable of image file names
    @return: list of unique identifiers, in first-seen order
    """
    # dict.fromkeys deduplicates in O(n) while preserving insertion order.
    return list(dict.fromkeys(name.split("_")[0] for name in images))

# Benign Images Analysis

First, let's count how many benign images we have in total.

In [None]:
# Keep every file of the benign folder except hidden files and "_mask" files.
benign_images = [
    name
    for name in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_benign_images")
    if not ("_mask" in name or name.startswith("."))
]
print(f"We have {len(benign_images)} GT benign images")

Now let's see how many images of each view type we have among these benign images. There are four different view types (LMLO, RMLO, LCC and RCC), so they should also be taken into consideration when feeding the classifier.

In [None]:
# Split the benign image names into the four view groups.
lmlo_benign, rmlo_benign, lcc_benign, rcc_benign = get_views_images(images=benign_images)

# Report the size of each view group, one line per view.
print("Benign Images per View Type")
print(
    f"LMLO: {len(lmlo_benign)} Images",
    f"RMLO: {len(rmlo_benign)} Images",
    f"LCC: {len(lcc_benign)} Images",
    f"RCC: {len(rcc_benign)} Images",
    sep="\n",
)

We also want to see, per view type, how many "individuals" we have, because many of the mammographies are the same image varying only in contrast, detail or colours. We want to make sure the split includes at least one sample of each individual.

In [None]:
# Unique individuals of each benign view group.
(
    lmlo_benign_individuals,
    rmlo_benign_individuals,
    lcc_benign_individuals,
    rcc_benign_individuals,
) = get_individuals_per_view_type(
    lmlos=lmlo_benign,
    rmlos=rmlo_benign,
    lccs=lcc_benign,
    rccs=rcc_benign,
)

print("Benign Images Individuals per View Type")
print(
    f"LMLO: {len(lmlo_benign_individuals)} Individuals",
    f"RMLO: {len(rmlo_benign_individuals)} Individuals",
    f"LCC: {len(lcc_benign_individuals)} Individuals",
    f"RCC: {len(rcc_benign_individuals)} Individuals",
    sep="\n",
)

Okay, so we have very few individuals per view type. Let's see how many distinct individuals we actually have overall.

In [None]:
# Distinct individuals across all benign images, regardless of view.
benign_individuals = get_individuals(images=benign_images)
print(f"Number of Benign Individuals: {len(benign_individuals)}")

# Malign Images Analysis

In [None]:
# Keep every file of the malign folder except hidden files and "_mask" files.
malign_images = [
    name
    for name in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_malign_images")
    if not ("_mask" in name or name.startswith("."))
]
print(f"We have {len(malign_images)} GT malign images")

In [None]:
# Split the malign image names into the four view groups.
lmlo_malign, rmlo_malign, lcc_malign, rcc_malign = get_views_images(images=malign_images)

# Report the size of each view group, one line per view.
print("Malign Images per View Type")
print(
    f"LMLO: {len(lmlo_malign)} Images",
    f"RMLO: {len(rmlo_malign)} Images",
    f"LCC: {len(lcc_malign)} Images",
    f"RCC: {len(rcc_malign)} Images",
    sep="\n",
)

In [None]:
# Unique individuals of each malign view group.
(
    lmlo_malign_individuals,
    rmlo_malign_individuals,
    lcc_malign_individuals,
    rcc_malign_individuals,
) = get_individuals_per_view_type(
    lmlos=lmlo_malign,
    rmlos=rmlo_malign,
    lccs=lcc_malign,
    rccs=rcc_malign,
)

print("Malign Images Individuals per View Type")
print(
    f"LMLO: {len(lmlo_malign_individuals)} Individuals",
    f"RMLO: {len(rmlo_malign_individuals)} Individuals",
    f"LCC: {len(lcc_malign_individuals)} Individuals",
    f"RCC: {len(rcc_malign_individuals)} Individuals",
    sep="\n",
)

In [None]:
# Distinct individuals across all malign images, regardless of view.
malign_individuals = get_individuals(images=malign_images)
print(f"Number of Malign Individuals: {len(malign_individuals)}")

# Normal Images Analysis

In [None]:
# Keep every file of the normal folder except hidden files and "_mask" files.
normal_images = [
    name
    for name in os.listdir("/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images/CN_normal_images")
    if not ("_mask" in name or name.startswith("."))
]
print(f"We have {len(normal_images)} GT normal images")

In [None]:
# Split the normal image names into the four view groups.
lmlo_normal, rmlo_normal, lcc_normal, rcc_normal = get_views_images(images=normal_images)

# Report the size of each view group, one line per view.
print("Normal Images per View Type")
print(
    f"LMLO: {len(lmlo_normal)} Images",
    f"RMLO: {len(rmlo_normal)} Images",
    f"LCC: {len(lcc_normal)} Images",
    f"RCC: {len(rcc_normal)} Images",
    sep="\n",
)

In [None]:
# Unique individuals of each normal view group.
(
    lmlo_normal_individuals,
    rmlo_normal_individuals,
    lcc_normal_individuals,
    rcc_normal_individuals,
) = get_individuals_per_view_type(
    lmlos=lmlo_normal,
    rmlos=rmlo_normal,
    lccs=lcc_normal,
    rccs=rcc_normal,
)

print("Normal Images Individuals per View Type")
print(
    f"LMLO: {len(lmlo_normal_individuals)} Individuals",
    f"RMLO: {len(rmlo_normal_individuals)} Individuals",
    f"LCC: {len(lcc_normal_individuals)} Individuals",
    f"RCC: {len(rcc_normal_individuals)} Individuals",
    sep="\n",
)

In [None]:
# Distinct individuals across all normal images, regardless of view.
normal_individuals = get_individuals(images=normal_images)
print(f"Number of Normal Individuals: {len(normal_individuals)}")

In [None]:
# Display the list of normal individual identifiers as the cell output.
normal_individuals

# Split Dataset for Training and Test

In [5]:
def extract_given(train_samples, images, individuals, train_folder, test_folder, root_folder):
    """
    Copy each individual's images from root_folder into the train/test folders.

    The training quota is shared evenly: every individual contributes
    ``int(train_samples / len(individuals))`` randomly chosen images to
    train_folder, and all of its remaining images go to test_folder.

    An image belongs to an individual when its name equals the identifier or
    starts with the identifier followed by "_". Matching whole
    underscore-delimited prefixes (rather than any substring) keeps e.g.
    "1_rcc" from also claiming "11_rcc_a.png", which would let the same image
    be copied to both the train and the test folder.

    @param train_samples: total number of training images wanted for this group
    @param images: image file names (relative to root_folder) to distribute
    @param individuals: identifiers of the individuals present in images
    @param train_folder: existing destination folder for training images
    @param test_folder: existing destination folder for test images
    @param root_folder: folder the images are copied from
    @return: None
    """
    if not individuals:
        # Nothing to distribute (also avoids dividing by zero below).
        return

    # The per-individual quota does not depend on the individual: compute once.
    quota = int(train_samples / len(individuals))

    for individual in individuals:
        prefix = individual + "_"
        own_images = [
            name for name in images
            if name == individual or name.startswith(prefix)
        ]
        random.shuffle(own_images)

        for name in own_images[:quota]:
            shutil.copy(os.path.join(root_folder, name), os.path.join(train_folder, name))
        for name in own_images[quota:]:
            shutil.copy(os.path.join(root_folder, name), os.path.join(test_folder, name))

In [6]:
def split_dataset(train_percentage, base_path):
    """
    Build a train/test copy of the benign, malign and normal image folders.

    Reads the folders ``CN_benign_images``, ``CN_malign_images`` and
    ``CN_normal_images`` under base_path (ignoring hidden files and names
    containing "_mask"), creates ``CN_data/{train,test}/{malign,benign,normal}``
    in the current working directory and distributes the images with one
    extract_given call per (class, view) pair.

    The training quota of a (class, view) group is
    ``int(train_percentage * len(view_images))``. This is the value the chain
    "share of the class in the dataset x share of the view in the class x
    total training samples" reduces to, computed in one step so that no
    samples are lost to repeated integer truncation and so that an empty
    class folder does not cause a division by zero.

    @param train_percentage: fraction of the images used for training (e.g. 0.7)
    @param base_path: folder containing the three ``CN_*_images`` folders
    @return: None
    @raise FileNotFoundError: if one of the source folders does not exist
    @raise FileExistsError: if the ``CN_data`` output tree already exists
    """
    base_folder = "CN_data"
    train_folder = "train"
    test_folder = "test"
    # Order in which classes (and, below, views) are handed to extract_given.
    class_names = ("benign", "malign", "normal")

    ##########################################
    # GET THE IMAGES INFO
    ##########################################
    # Everything is read before any folder is created, so a missing source
    # folder fails without leaving a half-built output tree behind.
    groups = []
    for class_name in class_names:
        source_folder = os.path.join(base_path, f"CN_{class_name}_images")
        images = [
            im for im in os.listdir(source_folder)
            if "_mask" not in im and not im.startswith(".")
        ]
        lmlo, rmlo, lcc, rcc = get_views_images(images)
        lmlo_ind, rmlo_ind, lcc_ind, rcc_ind = get_individuals_per_view_type(lmlo, rmlo, lcc, rcc)
        for view_images, view_individuals in (
            (rcc, rcc_ind),
            (lcc, lcc_ind),
            (lmlo, lmlo_ind),
            (rmlo, rmlo_ind),
        ):
            groups.append((class_name, source_folder, view_images, view_individuals))

    ##########################################
    # CREATES FOLDERS
    ##########################################
    # os.mkdir (not makedirs with exist_ok) on purpose: an existing output
    # tree makes the call fail instead of mixing two different random splits.
    os.mkdir(base_folder)
    for split_folder in (train_folder, test_folder):
        os.mkdir(os.path.join(base_folder, split_folder))
        for class_name in class_names:
            os.mkdir(os.path.join(base_folder, split_folder, class_name))

    ##########################################
    # SPLIT SAMPLES ACCORDING TO RATE
    ##########################################
    for class_name, source_folder, view_images, view_individuals in groups:
        extract_given(
            int(train_percentage * len(view_images)),
            view_images,
            view_individuals,
            os.path.join(base_folder, train_folder, class_name),
            os.path.join(base_folder, test_folder, class_name),
            source_folder,
        )

In [7]:
# Build the CN_data train/test tree using 70% of the images for training.
split_dataset(
    train_percentage=0.7,
    base_path="/Users/josedaviddomingues/Desktop/Datasets/segmentation_dataset/CN_images",
)

# File For Tableau Analysis

In [None]:
# Export to file for analysing in Tableau: one line per image, made of the
# file name followed by its class suffix.
# NOTE(review): mode "a" appends, so re-running this cell adds the same lines
# again to an existing images.txt - confirm this is intended.
with open("images.txt", "a") as file:
    file.writelines(f"{name}_benign\n" for name in benign_images)
    file.writelines(f"{name}_malign\n" for name in malign_images)
    file.writelines(f"{name}_normal\n" for name in normal_images)