# Data splitting

Split all images into a folder structure suited for further analysis: the top-level folders *train*, *validation* and *test*,
each containing the class subfolders *Benign* and *Malignant*.

The number of images assigned to each folder can be configured.

In [7]:
import pandas as pd
import numpy as np
import shutil
import os     

In [8]:
# Root folder of the project data. Set the ISIC_PROJECT_DIR environment variable
# to run the notebook on another machine; otherwise the original location is used.
PROJECT_DIR = os.environ.get(
    "ISIC_PROJECT_DIR",
    "/home/dwx/Documents/Studium/DTU/11/DL Deep Learning/project",
)

# The trailing "" makes every path end with a separator, which is required
# because later cells build file paths by plain string concatenation.
dir_images = os.path.join(PROJECT_DIR, "ISIC_MSK-2_1_images", "")
dir_images_split = os.path.join(PROJECT_DIR, "ISIC_MSK-2_1_1100", "")
dir_metadata = os.path.join(PROJECT_DIR, "git", "ISIC", "csv_metadata", "")

In [9]:
def get_image_class(metadata_file):
    """Load the image name and class label columns from a metadata CSV.

    Parameters
    ----------
    metadata_file : str
        Path of the CSV file holding the image metadata.

    Returns
    -------
    pandas.DataFrame
        Two columns, in this order: "name" and "benign_malignant".

    Raises
    ------
    ValueError
        If the CSV does not contain both required columns (raised by
        ``pd.read_csv``), instead of silently returning all-NaN columns.
    """
    columns = ["name", "benign_malignant"]
    # usecols keeps the file's own column order, so reorder explicitly.
    data = pd.read_csv(metadata_file, usecols=columns)
    return data[columns]

# Read the name/label table of the MSK-2_1 dataset and collect every image
# name that the metadata lists.
metadata_file = os.path.join(dir_metadata, "ISIC_MSK-2_1.csv")
df = get_image_class(metadata_file)

name_set_meta = {image_name for image_name in df["name"]}

In [23]:
# Base names (file name without the ".jpg" extension) of all images in dir_images.
# os.listdir returns entries in arbitrary order, so the listing is sorted to keep
# the index -> image mapping used by the following cells stable between runs.
# Despite its name this is a list, because later cells index into it by position.
name_set_image = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(dir_images)
    if file_name.endswith(".jpg")
)

In [11]:
# Split sizes: fixed train and validation counts, every remaining image goes to test.
image_count = len(name_set_image)
n_train, n_validate = 1100, 300
n_test = image_count - n_train - n_validate
if n_test < 0:
    # A negative size would make the later slicing silently produce wrong splits.
    raise ValueError(
        "Need at least %d images for the train and validation splits, found %d"
        % (n_train + n_validate, image_count)
    )
amount = [n_train, n_validate, n_test]
print("train, validate, test", amount)

train, validate, test [1100, 300, 118]


In [14]:
# Random permutation of all image indices. A fixed seed makes the
# train/validation/test assignment reproducible across kernel restarts.
SPLIT_SEED = 42
choice_array = np.random.RandomState(SPLIT_SEED).choice(
    image_count, size=image_count, replace=False
)

### Ratio benign/total

In [1]:
def get_image_to_metadata(index):
    """Return the rows of the global `df` whose "name" matches the image at
    position `index` in the global `name_set_image`."""
    image_name = name_set_image[index]
    is_match = df["name"] == image_name
    return df[is_match]

The fraction of benign images in each split is printed below, in the order train, validation, test.

In [57]:
# Cut the shuffled index array into consecutive train / validation / test parts.
train_end = amount[0]
val_end = train_end + amount[1]
choice_train = choice_array[:train_end]
choice_val = choice_array[train_end:val_end]
choice_test = choice_array[val_end:]

# Print the fraction of images labelled "benign" for each split.
for split in (choice_train, choice_val, choice_test):
    benign_total = sum(
        1
        for image_index in split
        if get_image_to_metadata(image_index).iloc[0]["benign_malignant"] == "benign"
    )
    print(benign_total / len(split))

0.7809090909090909
0.7366666666666667
0.7372881355932204


In [58]:
# Sanity check: print the number of distinct indices in each split
# (train, validation, test).
for split_indices in (choice_train, choice_val, choice_test):
    distinct_indices = set(split_indices)
    print(len(distinct_indices))

1100
300
118


Copying files to each subfolder.

In [None]:
train_data_dir = "train/"
validation_data_dir = "validation/"
test_data_dir = "test/"

# Metadata label -> class subfolder. Only these two exact labels are copied;
# any other value (e.g. a missing or otherwise unexpected label) is skipped
# so that it cannot end up mislabelled in one of the class folders.
class_subfolders = {"benign": "Benign/", "malignant": "Malignant/"}

for a, path in [(choice_train, train_data_dir),
                (choice_val, validation_data_dir),
                (choice_test, test_data_dir)]:
    cur_path_from = dir_images
    cur_path_to = dir_images_split + path
    print(cur_path_from, cur_path_to)

    # shutil.copy does not create missing directories, so create them up front.
    for subfolder in class_subfolders.values():
        os.makedirs(cur_path_to + subfolder, exist_ok=True)

    for cur_id in a:
        cur_image = get_image_to_metadata(cur_id)
        cur_class = cur_image.iloc[0]["benign_malignant"]
        file_suffix = cur_image.iloc[0]["name"] + ".jpg"

        subfolder = class_subfolders.get(cur_class)
        if subfolder is None:
            print("skipping", file_suffix, "- unexpected label:", cur_class)
            continue
        shutil.copy(cur_path_from + file_suffix, cur_path_to + subfolder + file_suffix)