## Organizing the Fitzpatrick17k dataset by diagnosis labels.

The current data dump (ZIP file) contains all images in a single directory, with the MD5 hashes of the files as the filenames. As such, they are difficult to understand without the accompanying metadata CSV file.

Therefore, we organize the images in directories with each directory containing all the images of one diagnosis label.

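The directories are named as `<AbbrvLabel>_N/`, where `AbbrvLabel` is a short abbreviation of the diagnosis label (e.g., `dr-in-pi-ch` for `drug induced pigmentary changes`) and `N` is the number of images with that diagnosis label.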

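Inside each directory, the images are named as `<AbbrvLabel>_fX_i_<hash>.jpg`, where `AbbrvLabel` is the abbreviated diagnosis label, `X` is the Fitzpatrick skin tone label of the image, `i` denotes the index of the image within the images of that diagnosis label and goes from `0` to `(N - 1)`, and `<hash>` is the first 8 characters of the image's MD5 hash (e.g., `ps_f1_1_a39ec3b1.jpg`).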

Since 565 images in the Fitzpatrick17k dataset do not have Fitzpatrick skin tone labels (these are recorded as `-1` in the metadata CSV), their skin tone is denoted by `0` in the new filenames, which therefore contain `f0`.

In [1]:
import os
import pandas as pd
from shutil import copyfile

In [2]:
# Directory holding the original image dump; the files there are named by MD5 hash.
# Both paths end with a trailing slash -- keep it if you edit them, since code that
# consumes these values may build file paths by plain string concatenation.
orig_img_dir = "/local-scratch2/Datasets/Fitzpatrick17k/data/finalfitz17k/"
# Root directory under which the per-diagnosis directories are created.
new_img_dir = "/local-scratch2/Datasets/Fitzpatrick17k/Categorized_AbbrvName/"

In [3]:
# Load the metadata CSV (path is relative to the working directory); the first,
# unnamed column of the file is used as the DataFrame index.
all_df = pd.read_csv("fitzpatrick17k.csv", index_col=0, header="infer")
all_df.head()

Unnamed: 0,md5hash,fitzpatrick,label,nine_partition_label,three_partition_label,qc,url,url_alphanum
0,5e82a45bc5d78bd24ae9202d194423f8,3,drug induced pigmentary changes,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicmminoc...
1,fa2911a9b13b6f8af79cb700937cc14f,1,photodermatoses,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicpphoto...
2,d2bac3c9e4499032ca8e9b07c7d3bc40,2,dermatofibroma,benign dermal,benign,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicdderma...
3,0a94359e7eaacd7178e06b2823777789,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...
4,a39ec3b1f22c08a421fa20535e037bba,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...


In [4]:
# Distinct diagnosis labels, in order of first appearance in the metadata.
all_labels = all_df["label"].unique()
all_labels

array(['drug induced pigmentary changes', 'photodermatoses',
       'dermatofibroma', 'psoriasis', 'kaposi sarcoma',
       'neutrophilic dermatoses', 'granuloma annulare',
       'nematode infection', 'allergic contact dermatitis',
       'necrobiosis lipoidica', 'hidradenitis', 'melanoma',
       'acne vulgaris', 'sarcoidosis', 'xeroderma pigmentosum',
       'actinic keratosis', 'scleroderma', 'syringoma', 'folliculitis',
       'pityriasis lichenoides chronica', 'porphyria',
       'dyshidrotic eczema', 'seborrheic dermatitis', 'prurigo nodularis',
       'acne', 'neurofibromatosis', 'eczema', 'pediculosis lids',
       'basal cell carcinoma', 'pityriasis rubra pilaris',
       'pityriasis rosea', 'livedo reticularis',
       'stevens johnson syndrome', 'erythema multiforme',
       'acrodermatitis enteropathica', 'epidermolysis bullosa',
       'dermatomyositis', 'urticaria', 'basal cell carcinoma morpheiform',
       'vitiligo', 'erythema nodosum', 'lupus erythematosus',
       '

In [5]:
# Mapping from full diagnosis label to the short name used for directories and files.
#
# Hand-picked abbreviations come first: these are single-word labels whose default
# two-letter prefix would clash with another label (e.g. "dermatofibroma" and
# "dermatomyositis" would both become "de").
abbrv_diseases = {
    "dermatofibroma": "defi",
    "dermatomyositis": "demy",
    "scleroderma": "scde",
    "scabies": "scab",
    "scleromyxedema": "scmy",
    "neurofibromatosis": "nefi",
    "neurodermatitis": "nede",
}

# Every remaining label: first two letters of each word (skipping "of"/"and"),
# joined by "-", e.g. "drug induced pigmentary changes" -> "dr-in-pi-ch".
for disease in all_labels:
    if disease not in abbrv_diseases:
        abbrv_diseases[disease] = "-".join(
            word[:2] for word in disease.split() if word not in ("of", "and")
        )

# A size mismatch means a hand-picked key above is not an actual dataset label
# (for instance a misspelling), since every real label was added by the loop.
assert len(abbrv_diseases) == len(all_labels), (
    f"Abbreviation keys that are not dataset labels: "
    f"{sorted(set(abbrv_diseases) - set(all_labels))}"
)

# The abbreviations become directory/file name prefixes, so two labels sharing one
# would make the organized dataset ambiguous. Fail loudly instead.
_labels_by_abbrv = {}
for _label, _abbrv in abbrv_diseases.items():
    _labels_by_abbrv.setdefault(_abbrv, []).append(_label)
_abbrv_clashes = {
    _abbrv: _labels for _abbrv, _labels in _labels_by_abbrv.items() if len(_labels) > 1
}
assert not _abbrv_clashes, f"Abbreviations shared by several labels: {_abbrv_clashes}"

In [6]:
# Extended metadata table (original columns plus the old/new image file names).
# It starts out empty and is assigned further down in this cell.
all_df_extended = pd.DataFrame()

def get_orig_img_name(x):
    """Return the file name of the row's image in the original dump: ``<md5hash>.jpg``."""
    return "{}.jpg".format(x["md5hash"])

def get_new_img_name(x):
    """Build the descriptive file name for one metadata row.

    The result has the form ``<abbrv>_f<tone>_<row index>_<md5 prefix>.jpg``, where
    ``<abbrv>`` is looked up in ``abbrv_diseases`` by the row's ``label``, ``<tone>``
    is the ``fitzpatrick`` value (``-1`` is written as ``0``), and ``<md5 prefix>``
    is the first 8 characters of ``md5hash``.
    """
    skin_tone = x["fitzpatrick"]
    tone_tag = "f0" if skin_tone == -1 else f"f{skin_tone}"

    # x.name is the row's index label when called through DataFrame.apply(axis=1);
    # see https://stackoverflow.com/a/26658301
    row_position = str(x.name)

    name_parts = [abbrv_diseases[x["label"]], tone_tag, row_position, x["md5hash"][:8]]
    return "_".join(name_parts) + ".jpg"

def _with_img_names(label):
    """Return the rows of ``all_df`` for ``label`` plus the old/new image name columns.

    ``reset_index()`` moves the original row index into an ``index`` column and
    renumbers the rows from 0; that fresh numbering is the per-label image index
    which ``get_new_img_name`` reads through ``x.name``.
    """
    label_df = all_df.loc[all_df["label"] == label].reset_index()
    label_df["orig_img_name"] = label_df.apply(get_orig_img_name, axis=1)
    label_df["new_img_name"] = label_df.apply(get_new_img_name, axis=1)
    return label_df


# Build the extended table unconditionally: it only depends on the metadata, and
# later cells need it even when the images were already copied by an earlier run.
label_frames = [(label, _with_img_names(label)) for label in all_labels]

# Concatenate once (instead of growing a frame inside the loop), then restore the
# original row order via the preserved "index" column.
all_df_extended = (
    pd.concat([label_df for _, label_df in label_frames])
    .set_index("index")
    .sort_index()
)

if os.path.exists(new_img_dir):
    # Never touch an existing output tree; only the in-memory table is rebuilt.
    print("Parent directory already exists!")
else:
    os.makedirs(new_img_dir)

    for label, label_df in label_frames:
        # One directory per diagnosis: <abbreviation>_<number of images>.
        label_dir = os.path.join(new_img_dir, f"{abbrv_diseases[label]}_{len(label_df)}")
        os.makedirs(label_dir)

        for orig_name, new_name in zip(label_df["orig_img_name"], label_df["new_img_name"]):
            copyfile(os.path.join(orig_img_dir, orig_name), os.path.join(label_dir, new_name))

    all_df_extended.to_csv("fitzpatrick17k_detailed_abbreviated_names.csv")

In [7]:
# Preview the extended table, including the orig_img_name / new_img_name columns.
all_df_extended.head()

Unnamed: 0_level_0,md5hash,fitzpatrick,label,nine_partition_label,three_partition_label,qc,url,url_alphanum,orig_img_name,new_img_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5e82a45bc5d78bd24ae9202d194423f8,3,drug induced pigmentary changes,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicmminoc...,5e82a45bc5d78bd24ae9202d194423f8.jpg,dr-in-pi-ch_f3_0_5e82a45b.jpg
1,fa2911a9b13b6f8af79cb700937cc14f,1,photodermatoses,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicpphoto...,fa2911a9b13b6f8af79cb700937cc14f.jpg,ph_f1_0_fa2911a9.jpg
2,d2bac3c9e4499032ca8e9b07c7d3bc40,2,dermatofibroma,benign dermal,benign,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicdderma...,d2bac3c9e4499032ca8e9b07c7d3bc40.jpg,defi_f2_0_d2bac3c9.jpg
3,0a94359e7eaacd7178e06b2823777789,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...,0a94359e7eaacd7178e06b2823777789.jpg,ps_f1_0_0a94359e.jpg
4,a39ec3b1f22c08a421fa20535e037bba,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...,a39ec3b1f22c08a421fa20535e037bba.jpg,ps_f1_1_a39ec3b1.jpg


In [8]:
# Number of images per Fitzpatrick skin tone value; -1 marks images without a label.
all_df_extended.groupby("fitzpatrick").size()

fitzpatrick
-1     565
 1    2947
 2    4808
 3    3308
 4    2781
 5    1533
 6     635
dtype: int64