In [21]:
import pandas as pd
import shutil
import os
import numpy as np

Combine metadata files

In [3]:
def combine_metadata(metadata1_filename, metadata2_filename):
    """Read two metadata CSV files and stack them into one de-duplicated frame.

    The unique values of the 'meta.clinical.diagnosis' column are printed
    before duplicates are removed.

    Parameters
    ----------
    metadata1_filename, metadata2_filename
        Locations of the two CSV files, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows of both files with fully identical rows dropped. The index is
        not reset, so row labels from the two files may repeat.
    """
    frames = [pd.read_csv(path) for path in (metadata1_filename, metadata2_filename)]
    stacked = pd.concat(frames)
    print('Lesions on this dataset: ', stacked['meta.clinical.diagnosis'].unique())
    return stacked.drop_duplicates()

In [4]:
# Directory holding the raw inputs read by this notebook (metadata CSVs, the
# training split CSV and one sub-folder per source dataset). Relative path with
# a trailing slash, because later cells build paths by string concatenation.
SOURCE_DATA_PATH = '../datasets/raw/ISIC/'

In [33]:
# Merge the two metadata exports into a single de-duplicated frame.
metadata1_path = os.path.join(SOURCE_DATA_PATH, 'metadata1.csv')
metadata2_path = os.path.join(SOURCE_DATA_PATH, 'metadata2.csv')
metadata = combine_metadata(metadata1_path, metadata2_path)

# Preview the first rows of the combined frame.
metadata.head()

Lesions on this dataset:  ['nevus' 'melanoma' 'solar lentigo' 'actinic keratosis' 'vascular lesion'
 'seborrheic keratosis' 'lentigo NOS']


Unnamed: 0,_id,name,meta.clinical.age_approx,meta.clinical.anatom_site_general,meta.clinical.benign_malignant,meta.clinical.clin_size_long_diam_mm,meta.clinical.diagnosis,meta.clinical.diagnosis_confirm_type,meta.clinical.family_hx_mm,meta.clinical.lesion_id,...,meta.clinical.sex,meta.acquisition.acquisition_day,meta.acquisition.blurry,meta.acquisition.color_tint,meta.acquisition.dermoscopic_type,meta.acquisition.hairy,meta.acquisition.image_type,meta.acquisition.marker_pen,meta.acquisition.pixelsX,meta.acquisition.pixelsY
0,5c95461adb6d7c19d0bc1319,ISIC_0057321,10.0,palms/soles,benign,,nevus,,,,...,male,,,,,,,,1024,1024
1,5c95461adb6d7c19d0bc131c,ISIC_0057324,45.0,palms/soles,malignant,,melanoma,histopathology,,,...,male,,,,,,,,1024,1024
2,5c95461bdb6d7c19d0bc1345,ISIC_0057365,70.0,palms/soles,benign,,nevus,histopathology,,,...,female,,,,,,,,1024,1024
3,5c954624db6d7c19d0bc1445,ISIC_0057621,55.0,palms/soles,malignant,,melanoma,histopathology,,,...,female,,,,,,,,1024,1024
4,5c954626db6d7c19d0bc1487,ISIC_0057687,60.0,palms/soles,malignant,,melanoma,histopathology,,,...,male,,,,,,,,1024,1024


Remove lesions used for training the model

In [34]:
# Training split; its `image` column is compared against metadata names below.
train_split_path = os.path.join(SOURCE_DATA_PATH, 'train_split_1.csv')
df_train = pd.read_csv(train_split_path)

In [35]:
# Count training-split rows whose `image` value also appears as a metadata `name`.
in_metadata = df_train['image'].isin(metadata['name'])
print('Number of acral images used for training: ', int(in_metadata.sum()))

Number of acral images used for training:  251


In [36]:
# Keep only the rows whose name does not appear in the training split.
# .copy() makes the result an independent frame: later cells add columns to
# `metadata`, and writing into a boolean-indexed slice of the combined frame
# triggers pandas' SettingWithCopyWarning.
test_isic = metadata[~metadata.name.isin(df_train.image)].copy()
# From here on `metadata` refers to the held-out subset only.
metadata = test_isic
print('Total of acral images for testing: ', test_isic.shape[0])

Total of acral images for testing:  149


In [37]:
# Split the held-out rows into melanoma vs. everything else using one mask.
is_melanoma = test_isic['meta.clinical.diagnosis'] == 'melanoma'
melanoma_imgs = test_isic[is_melanoma]
benign_imgs = test_isic[~is_melanoma]

In [38]:
# Report how many rows fell into each class.
for message, class_frame in (
    ('Total of MELANOMA images for testing: ', melanoma_imgs),
    ('Total of BENIGN images for testing: ', benign_imgs),
):
    print(message, len(class_frame))

Total of MELANOMA images for testing:  72
Total of BENIGN images for testing:  77


Save filtered images and metadata

In [39]:
# Sub-folders of SOURCE_DATA_PATH that are scanned for image files.
datasets_isic = [
    'MSK-1',
    'ISIC 2020 Challenge - MSKCC contribution',
    'BCN_20000',
    'UDA-2',
    'UDA-1',
    'MSK-4',
    'MSK-2',
    'BCN_2020_Challenge',
    'HAM10000',
]

In [40]:
# Expected on-disk filenames: each metadata name followed by '.jpg'.
melanoma_img_filenames = [f'{image_name}.jpg' for image_name in melanoma_imgs.name]
benign_img_filenames = [f'{image_name}.jpg' for image_name in benign_imgs.name]

In [41]:
# Output directory for the filtered images and their metadata CSV. Relative
# path with a trailing slash, because later cells build paths by concatenation.
DESTINATION_DATASET_PATH = '../datasets/processed/ISIC/'

# Create the directory together with any missing parents. exist_ok=True
# already makes this a no-op when the directory exists, so a separate
# os.path.exists() check is unnecessary.
os.makedirs(DESTINATION_DATASET_PATH, exist_ok=True)

In [42]:
# One sub-folder per class. exist_ok=True makes re-running this cell harmless,
# so no prior existence check is needed.
for class_dir in ('melanoma', 'benign'):
    os.makedirs(DESTINATION_DATASET_PATH + class_dir, exist_ok=True)

In [56]:
# Map every expected image filename to the class folder it is copied into.
# Melanoma entries are applied last so they win if a name is in both lists,
# matching the original if/elif order. The dict also replaces repeated list
# membership tests inside the nested loop.
class_dir_by_filename = {name: 'benign' for name in benign_img_filenames}
class_dir_by_filename.update({name: 'melanoma' for name in melanoma_img_filenames})

for dataset in datasets_isic:
    dataset_dir = os.path.join(SOURCE_DATA_PATH, dataset)
    for img in os.listdir(dataset_dir):
        class_dir = class_dir_by_filename.get(img)
        if class_dir is None:
            # Not one of the selected images: nothing to copy, and the metadata
            # must not be touched (previously a stale or undefined `filename`
            # was used here).
            continue

        filename = DESTINATION_DATASET_PATH + class_dir + '/' + img
        shutil.copy(os.path.join(dataset_dir, img), filename)

        # Store the copied file's path without the leading '../' of
        # DESTINATION_DATASET_PATH.
        image_name = os.path.splitext(img)[0]
        metadata.loc[metadata.name == image_name, 'lesion_filename'] = filename[len('../'):]

In [57]:
# Binary target: 1 where the diagnosis is 'melanoma', 0 for anything else.
diagnosis_is_melanoma = metadata['meta.clinical.diagnosis'] == 'melanoma'
metadata['label'] = diagnosis_is_melanoma.astype('int64')
# Constant flag written on every row.
metadata['is_dermoscopic'] = True

In [58]:
# Write the filtered metadata next to the copied images; the row index is not saved.
metadata_csv_path = os.path.join(DESTINATION_DATASET_PATH, 'metadata.csv')
metadata.to_csv(metadata_csv_path, index=False)

In [59]:
# Display the final metadata frame, including the lesion_filename, label and
# is_dermoscopic columns added above.
metadata

Unnamed: 0,_id,name,meta.clinical.age_approx,meta.clinical.anatom_site_general,meta.clinical.benign_malignant,meta.clinical.clin_size_long_diam_mm,meta.clinical.diagnosis,meta.clinical.diagnosis_confirm_type,meta.clinical.family_hx_mm,meta.clinical.lesion_id,...,meta.acquisition.color_tint,meta.acquisition.dermoscopic_type,meta.acquisition.hairy,meta.acquisition.image_type,meta.acquisition.marker_pen,meta.acquisition.pixelsX,meta.acquisition.pixelsY,lesion_filename,label,is_dermoscopic
2,5c95461bdb6d7c19d0bc1345,ISIC_0057365,70.0,palms/soles,benign,,nevus,histopathology,,,...,,,,,,1024,1024,datasets/processed/ISIC/benign/ISIC_0057365.jpg,0,True
4,5c954626db6d7c19d0bc1487,ISIC_0057687,60.0,palms/soles,malignant,,melanoma,histopathology,,,...,,,,,,1024,1024,datasets/processed/ISIC/melanoma/ISIC_0057687.jpg,1,True
6,5c95462adb6d7c19d0bc1509,ISIC_0057817,75.0,palms/soles,benign,,nevus,histopathology,,,...,,,,,,1024,1024,datasets/processed/ISIC/benign/ISIC_0057817.jpg,0,True
7,5c95462cdb6d7c19d0bc1549,ISIC_0057881,55.0,palms/soles,malignant,,melanoma,histopathology,,,...,,,,,,1024,1024,datasets/processed/ISIC/melanoma/ISIC_0057881.jpg,1,True
15,5c954637db6d7c19d0bc1692,ISIC_0058210,25.0,palms/soles,benign,,nevus,,,,...,,,,,,1024,1024,datasets/processed/ISIC/benign/ISIC_0058210.jpg,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,5c954602db6d7c19d0bc1094,ISIC_0056676,45.0,palms/soles,malignant,,melanoma,histopathology,,,...,,,,,,1024,1024,datasets/processed/ISIC/melanoma/ISIC_0056676.jpg,1,True
88,5c954604db6d7c19d0bc10c8,ISIC_0056728,70.0,palms/soles,malignant,,melanoma,histopathology,,,...,,,,,,1024,1024,datasets/processed/ISIC/melanoma/ISIC_0056728.jpg,1,True
90,5c954609db6d7c19d0bc1157,ISIC_0056871,70.0,palms/soles,malignant,,melanoma,histopathology,,,...,,,,,,1024,1024,datasets/processed/ISIC/melanoma/ISIC_0056871.jpg,1,True
91,5c95460ddb6d7c19d0bc11ad,ISIC_0056957,85.0,palms/soles,malignant,,melanoma,histopathology,,,...,,,,,,1024,1024,datasets/processed/ISIC/melanoma/ISIC_0056957.jpg,1,True


In [60]:
# Inspect the stored image paths written by the copy loop.
metadata.lesion_filename

2       datasets/processed/ISIC/benign/ISIC_0057365.jpg
4     datasets/processed/ISIC/melanoma/ISIC_0057687.jpg
6       datasets/processed/ISIC/benign/ISIC_0057817.jpg
7     datasets/processed/ISIC/melanoma/ISIC_0057881.jpg
15      datasets/processed/ISIC/benign/ISIC_0058210.jpg
                            ...                        
86    datasets/processed/ISIC/melanoma/ISIC_0056676.jpg
88    datasets/processed/ISIC/melanoma/ISIC_0056728.jpg
90    datasets/processed/ISIC/melanoma/ISIC_0056871.jpg
91    datasets/processed/ISIC/melanoma/ISIC_0056957.jpg
96      datasets/processed/ISIC/benign/ISIC_0057098.jpg
Name: lesion_filename, Length: 149, dtype: object