In [1]:
import pandas as pd
import shutil
import os

In [2]:
# Raw and processed copies of the dataset live in folders with the same name,
# under '../datasets/raw/' and '../datasets/processed/' respectively.
DATASET_NAME = 'ddidiversedermatologyimages'
SOURCE_DATA_PATH = '../datasets/raw/' + DATASET_NAME + '/'
DESTINATION_DATASET_PATH = '../datasets/processed/' + DATASET_NAME + '/'

In [3]:
# Create the destination folder together with one sub-folder per class.
# os.makedirs creates missing parent directories, and exist_ok=True makes the
# call a no-op when the directory is already there, so no separate
# os.path.exists() check is needed (such a check would also hide the case
# where the path exists but is a regular file).
for class_dir in ('melanoma', 'benign'):
    os.makedirs(os.path.join(DESTINATION_DATASET_PATH, class_dir), exist_ok=True)

Exploratory data analysis (EDA)

In [5]:
# Load the dataset's metadata table from the raw dataset folder.
metadata_csv_path = SOURCE_DATA_PATH + 'ddi_metadata.csv'
df = pd.read_csv(metadata_csv_path)

In [6]:
# Discard every column whose name starts with 'Unnamed', then preview the
# first rows of what is left.
is_unnamed_column = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed_column]
df.head()

Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease
0,1,000001.png,56,True,melanoma-in-situ
1,2,000002.png,56,True,melanoma-in-situ
2,3,000003.png,56,True,mycosis-fungoides
3,4,000004.png,56,True,squamous-cell-carcinoma-in-situ
4,5,000005.png,12,True,basal-cell-carcinoma


In [8]:
# Show the column names of the loaded metadata table.
df.columns

Index(['DDI_ID', 'DDI_file', 'skin_tone', 'malignant', 'disease'], dtype='object')

In [7]:
# List the distinct diagnoses among the rows flagged as malignant.
df[df.malignant].disease.unique()

array(['melanoma-in-situ', 'mycosis-fungoides',
       'squamous-cell-carcinoma-in-situ', 'basal-cell-carcinoma',
       'squamous-cell-carcinoma', 'melanoma-acral-lentiginous',
       'basal-cell-carcinoma-superficial',
       'squamous-cell-carcinoma-keratoacanthoma',
       'subcutaneous-t-cell-lymphoma', 'basal-cell-carcinoma-nodular',
       'kaposi-sarcoma', 'metastatic-carcinoma', 'melanoma',
       'atypical-spindle-cell-nevus-of-reed', 'nodular-melanoma-(nm)',
       'leukemia-cutis', 'sebaceous-carcinoma',
       'blastic-plasmacytoid-dendritic-cell-neoplasm'], dtype=object)

In [9]:
# Binary target: 1 when the diagnosis string contains 'melanoma', else 0.
# The match is a plain substring test (regex=False); na=False makes a missing
# diagnosis count as "not melanoma" instead of raising a TypeError, which is
# what a Python-level `'melanoma' in x` does on NaN.
is_melanoma = df.disease.str.contains('melanoma', regex=False, na=False)
df['label'] = is_melanoma.astype(int)

In [10]:
# Keep only melanoma and non-malignant lesions: every row that is flagged
# malignant but did not receive the melanoma label is discarded.
is_other_malignancy = (df.label == 0) & (df.malignant == True)
malignant_not_mel = df[is_other_malignancy].index
df = df.drop(index=malignant_not_mel)

In [12]:
# Preview the first rows of the table, which now carries the `label` column.
df.head()

Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease,label
0,1,000001.png,56,True,melanoma-in-situ,1
1,2,000002.png,56,True,melanoma-in-situ,1
6,7,000007.png,56,True,melanoma-acral-lentiginous,1
7,8,000008.png,56,True,melanoma-in-situ,1
8,9,000009.png,56,True,melanoma-acral-lentiginous,1


In [11]:
# (rows, columns) of the table at this point.
df.shape

(506, 6)

Copying filtered data

In [13]:
# Image file names per class, as NumPy arrays: label 0 -> benign,
# label 1 -> melanoma.
benign_imgs = df.loc[df.label == 0, 'DDI_file'].values
mel_imgs = df.loc[df.label == 1, 'DDI_file'].values

In [15]:
# Copy each image from the raw dataset folder into the destination sub-folder
# of its class (benign first, then melanoma). The source files are left in
# place.
for class_dir, file_names in (('benign/', benign_imgs), ('melanoma/', mel_imgs)):
    target_dir = DESTINATION_DATASET_PATH + class_dir
    for file_name in file_names:
        shutil.copy(SOURCE_DATA_PATH + file_name, target_dir)

In [16]:
# Flag every remaining row as non-dermoscopic.
df['is_dermoscopic'] = False

# The stored image path is the destination path without its leading '../'.
# Strip that prefix explicitly rather than slicing a fixed number of
# characters, so a differently configured destination path is not silently
# truncated.
PARENT_DIR_PREFIX = '../'
if DESTINATION_DATASET_PATH.startswith(PARENT_DIR_PREFIX):
    destination_without_parent = DESTINATION_DATASET_PATH[len(PARENT_DIR_PREFIX):]
else:
    destination_without_parent = DESTINATION_DATASET_PATH

# A set gives constant-time membership tests; testing against the NumPy array
# directly rescans the whole array for every row.
mel_file_names = set(mel_imgs)
df['lesion_filename'] = df.DDI_file.apply(
    lambda file_name: destination_without_parent
    + ('melanoma' if file_name in mel_file_names else 'benign')
    + '/'
    + file_name
)

In [19]:
# Spot-check 15 random rows; the fixed random_state keeps the displayed
# sample identical across re-runs of the notebook.
df.sample(15, random_state=42)

Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease,label,is_dermoscopic,lesion_filename
570,571,000571.png,34,False,melanocytic-nevi,0,False,datasets/processed/ddidiversedermatologyimages...
623,624,000624.png,34,False,melanocytic-nevi,0,False,datasets/processed/ddidiversedermatologyimages...
355,356,000356.png,12,False,epidermal-cyst,0,False,datasets/processed/ddidiversedermatologyimages...
233,234,000234.png,56,False,lipoma,0,False,datasets/processed/ddidiversedermatologyimages...
279,280,000280.png,12,False,melanocytic-nevi,0,False,datasets/processed/ddidiversedermatologyimages...
42,43,000043.png,12,True,melanoma-acral-lentiginous,1,False,datasets/processed/ddidiversedermatologyimages...
205,206,000206.png,56,False,melanocytic-nevi,0,False,datasets/processed/ddidiversedermatologyimages...
61,62,000062.png,34,False,seborrheic-keratosis-irritated,0,False,datasets/processed/ddidiversedermatologyimages...
513,514,000514.png,34,False,angioma,0,False,datasets/processed/ddidiversedermatologyimages...
0,1,000001.png,56,True,melanoma-in-situ,1,False,datasets/processed/ddidiversedermatologyimages...


In [20]:
# Write the final metadata table into the destination dataset folder;
# index=False leaves the DataFrame index out of the CSV.
metadata_out_path = DESTINATION_DATASET_PATH + 'metadata.csv'
df.to_csv(metadata_out_path, index=False)