In [None]:
import pandas as pd
import numpy as np

In [11]:
ground_truth_labels = pd.read_csv(
    "ISIC_2024_Permissive_Training_GroundTruth.csv", header=0
)
ground_truth_labels["malignant"].value_counts()

malignant
0.0    217183
1.0       294
Name: count, dtype: int64

In [12]:
# Number of benign images to sample
benign_sample_num = 10000 - 294

In [13]:
ground_truth_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217477 entries, 0 to 217476
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   isic_id    217477 non-null  object 
 1   malignant  217477 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.3+ MB


In [14]:
complete_ground_truth_labels = pd.read_csv(
    "ISIC_2024_Training_GroundTruth.csv", header=0
)
complete_ground_truth_labels["malignant"].value_counts()

malignant
0.0    400666
1.0       393
Name: count, dtype: int64

In [15]:
# Find if there are duplicates between SLICE-3D and SLICE-3D Permissive datasets
pd.Series(
    np.intersect1d(
        ground_truth_labels["isic_id"].values,
        complete_ground_truth_labels["isic_id"].values,
    )
)

0         ISIC_0015670
1         ISIC_0015845
2         ISIC_0015864
3         ISIC_0015902
4         ISIC_0024200
              ...     
217472    ISIC_9999854
217473    ISIC_9999919
217474    ISIC_9999951
217475    ISIC_9999960
217476    ISIC_9999967
Length: 217477, dtype: object

As shown above, SLICE-3D (400k samples) are inclusive of SLICE-3D Permissive. So if we were to sample test data from SLICE-3D, we start from the entries indexed at 217477 and onward to avoid using the same data for training.

In [16]:
isic_ids = ground_truth_labels.loc[ground_truth_labels["malignant"] == 0, "isic_id"]
sampled_benign_ids = isic_ids.sample(n=benign_sample_num, random_state=1, replace=False)
sampled_benign_ids[:10]

93394     ISIC_4349311
126189    ISIC_5855526
33972     ISIC_1616913
26090     ISIC_1255517
203592    ISIC_9367679
101538    ISIC_4724653
208683    ISIC_9596614
33472     ISIC_1594138
202809    ISIC_9330731
197920    ISIC_9109189
Name: isic_id, dtype: object

In [17]:
malignant_ids = ground_truth_labels.loc[
    ground_truth_labels["malignant"] == 1, "isic_id"
]
len(malignant_ids)

294

In [None]:
df_benign = pd.DataFrame({"isic_id": sampled_benign_ids, "label": 0})
df_malignant = pd.DataFrame({"isic_id": malignant_ids, "label": 1})
df_all = pd.concat([df_benign, df_malignant], ignore_index=True)
df_all.to_csv("labels.csv", index=False)

In [2]:
import shutil
from pathlib import Path
import os

In [None]:
print("Current working directory:", os.getcwd())

image_dir = Path(
    r"ISIC_2024_Permissive_Training_Input/ISIC_2024_Permissive_Training_Input"
)
output_dir = Path("filtered_benign_images")

output_dir.mkdir(exist_ok=True)

for isic_id in sampled_benign_ids:
    src = image_dir / f"{isic_id}.jpg"
    dst = output_dir / f"{isic_id}.jpg"
    if src.exists():
        shutil.copy(src, dst)

In [None]:
output_dir = Path("filtered_malignant_images")

output_dir.mkdir(exist_ok=True)

for isic_id in malignant_ids:
    src = image_dir / f"{isic_id}.jpg"
    dst = output_dir / f"{isic_id}.jpg"
    if src.exists():
        shutil.copy(src, dst)

In [None]:
# isic_ids from the rest of the full SLICE-3D dataset
test_sampling = complete_ground_truth_labels[217477:]

test_sampling["malignant"].value_counts()

malignant
0.0    183402
1.0       180
Name: count, dtype: int64

In [None]:
testing_ids = test_sampling["isic_id"]
expected_file_names = [f"{isic_id}.jpg" for isic_id in testing_ids]
len(expected_file_names)

183582

In [None]:
# Copy over the non-overlapping images from SLICE-3D image archive to a separate folder
# for further sampling for validation and test sets

archive_root = r".\ISIC_2024_Training_Input"
dest_folder = r".\val_and_test_samples"


def bulk_copy(needed_files, archive_root, dest_folder):
    needed = set(needed_files) - set(os.listdir(dest_folder))
    os.makedirs(dest_folder, exist_ok=True)

    for root, _, files in os.walk(archive_root):
        for f in files:
            if f in needed:
                src = os.path.join(archive_root, f)
                dst = os.path.join(dest_folder, f)
                if not os.path.exists(dst):
                    shutil.copy(src, dst)
                    needed.remove(f)

                if not needed:
                    return


bulk_copy(
    expected_file_names,
    archive_root,
    dest_folder,
)

In [None]:
# Sample 1000 images for validation and test sets respectively. Since there are 180 malignant images
# from SLICE-3D file that are not used in the train set, split these 180 samples 50/50 for validation
# and test

validation_num = 1_000
validation_malignant = 90

test_num = 1_000
test_malignant = 90

benign_sample_ids = test_sampling.loc[test_sampling["malignant"] == 0]
malignant_sample_ids = test_sampling.loc[test_sampling["malignant"] == 1]

# Sample validation set for benign and malignant classes
benign_val_samples = benign_sample_ids.sample(
    n=validation_num - validation_malignant, random_state=1, replace=False
)
malignant_val_samples = malignant_sample_ids.sample(
    n=validation_malignant, random_state=1, replace=False
)


# Sample test set for benign and malignant classes
remaining_benign_ids = benign_sample_ids[
    ~benign_sample_ids["isic_id"].isin(benign_val_samples["isic_id"])
]
benign_test_samples = remaining_benign_ids.sample(
    n=test_num - test_malignant, random_state=1, replace=False
)
malignant_test_samples = malignant_sample_ids[
    ~malignant_sample_ids["isic_id"].isin(malignant_val_samples["isic_id"])
]

Unnamed: 0,isic_id,malignant
400260,ISIC_9980921,0.0
368269,ISIC_9186636,0.0
295236,ISIC_7388916,0.0
254399,ISIC_6389483,0.0
398494,ISIC_9937229,0.0
...,...,...
256567,ISIC_6441176,0.0
291166,ISIC_7286349,0.0
351189,ISIC_8769948,0.0
318190,ISIC_7954681,0.0


In [3]:
# print("Current working directory:", os.getcwd())
new_path = str(Path.home() / "Downloads")
os.chdir(new_path)

In [None]:
# Copy over the samples images into validation and test folders

# Copy over benign samples for validation set
image_dir = Path("val_and_test_samples")
output_dir = Path("skin_cancer_val/benign")

output_dir.mkdir(exist_ok=True)

for isic_id in benign_val_samples["isic_id"]:
    src = image_dir / f"{isic_id}.jpg"
    dst = output_dir / f"{isic_id}.jpg"
    if src.exists():
        shutil.copy(src, dst)

# Copy over malignant samples for validation set
output_dir = Path("skin_cancer_val/malignant")

output_dir.mkdir(exist_ok=True)

for isic_id in malignant_val_samples["isic_id"]:
    src = image_dir / f"{isic_id}.jpg"
    dst = output_dir / f"{isic_id}.jpg"
    if src.exists():
        shutil.copy(src, dst)

In [None]:
# Copy over benign samples for test set

image_dir = Path("val_and_test_samples")
output_dir = Path("skin_cancer_test/benign")

output_dir.mkdir(exist_ok=True)

for isic_id in benign_test_samples["isic_id"]:
    src = image_dir / f"{isic_id}.jpg"
    dst = output_dir / f"{isic_id}.jpg"
    if src.exists():
        shutil.copy(src, dst)

# Copy over malignant samples for test set
output_dir = Path("skin_cancer_test/malignant")

output_dir.mkdir(exist_ok=True)

for isic_id in malignant_test_samples["isic_id"]:
    src = image_dir / f"{isic_id}.jpg"
    dst = output_dir / f"{isic_id}.jpg"
    if src.exists():
        shutil.copy(src, dst)