In [3]:
import pandas as pd

# Read the HAM10000 metadata table from the working directory and preview it.
metadata_csv = "HAM10000_metadata.csv"
meta = pd.read_csv(metadata_csv)
meta.head()



Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [4]:
def age_group(age):
    """Bucket an age into "child", "adult", "elderly" or "unknown".

    Values that ``pd.isna`` reports as missing map to "unknown". Any other
    value is truncated with ``int`` and compared against the cut-offs:
    below 16 is "child", below 60 is "adult", everything else is "elderly".
    """
    if pd.isna(age):
        return "unknown"

    years = int(age)
    if years >= 60:
        return "elderly"
    if years >= 16:
        return "adult"
    return "child"

# Label every row with its age bucket (element-wise over the `age` column).
meta['age_group'] = meta['age'].map(age_group)
meta.head()


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,age_group
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,elderly
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,elderly
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,elderly
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,elderly
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,elderly


In [6]:
# Human-readable names for the diagnosis codes found in the `dx` column.
label_map = dict(
    akiec='Actinic Keratoses',
    bcc='Basal Cell Carcinoma',
    bkl='Benign Keratosis',
    df='Dermatofibroma',
    mel='Melanoma',
    nv='Nevus (Mole)',
    vasc='Vascular Lesion',
)

# Codes that are absent from label_map come out as NaN in `disease_name`.
disease_names = meta['dx'].map(label_map)
meta['disease_name'] = disease_names
meta.head()



Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,age_group,disease_name
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,elderly,Benign Keratosis
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,elderly,Benign Keratosis
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,elderly,Benign Keratosis
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,elderly,Benign Keratosis
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,elderly,Benign Keratosis


In [7]:
# Images per diagnosis. dropna=False keeps a NaN row in the tally, so any `dx`
# code that was missing from label_map (and therefore mapped to NaN) is
# visible here instead of silently disappearing from the class counts.
meta['disease_name'].value_counts(dropna=False)


disease_name
Nevus (Mole)            6705
Melanoma                1113
Benign Keratosis        1099
Basal Cell Carcinoma     514
Actinic Keratoses        327
Vascular Lesion          142
Dermatofibroma           115
Name: count, dtype: int64

In [8]:
# Keep only the melanoma rows and save them (without the index) for later use.
is_melanoma = meta['dx'] == 'mel'
mel = meta[is_melanoma]
mel.to_csv("melanoma_list.csv", index=False)



In [9]:
import pandas as pd
import os
import subprocess
from tqdm import tqdm

# Melanoma rows selected earlier in the notebook; on a fresh kernel the same
# table can be reloaded with: df = pd.read_csv("melanoma_list.csv")
df = mel

save_folder = "datasets/melanoma"
os.makedirs(save_folder, exist_ok=True)

dataset_name = "kmader/skin-cancer-mnist-ham10000"

# Folders inside the Kaggle dataset that are searched for each image, in order.
image_parts = ("HAM10000_images_part_1", "HAM10000_images_part_2")


def is_downloaded(img_id):
    """Return True if a file for ``img_id`` is already present in ``save_folder``.

    Looks for ``<img_id>.jpg`` and ``<img_id>.jpg.zip``.
    NOTE(review): the zipped name is an assumption about how the Kaggle CLI may
    store single-file downloads; confirm against the installed CLI version.
    """
    return any(
        os.path.exists(os.path.join(save_folder, f"{img_id}{suffix}"))
        for suffix in (".jpg", ".jpg.zip")
    )


def download_image(img_id):
    """Fetch one image through the Kaggle CLI, trying each part folder in turn.

    The command is passed as an argument list (no shell), so no quoting of the
    interpolated values is needed. Success is judged by the file being present
    afterwards rather than by the CLI's exit status.

    Returns:
        True if the image is present in ``save_folder`` after the attempts,
        False otherwise.

    Raises:
        FileNotFoundError: if the ``kaggle`` executable is not on PATH.
    """
    for part in image_parts:
        subprocess.run(
            [
                "kaggle", "datasets", "download",
                "-d", dataset_name,
                "-f", f"{part}/{img_id}.jpg",
                "-p", save_folder,
            ],
            capture_output=True,
            text=True,
        )
        # Stop as soon as the file shows up; the next folder is only tried
        # when this one did not yield the image.
        if is_downloaded(img_id):
            return True
    return False


failed_ids = []
for img_id in tqdm(df["image_id"]):
    # Skipping files that already exist makes re-running this slow cell cheap.
    if is_downloaded(img_id):
        continue
    if not download_image(img_id):
        failed_ids.append(img_id)

# Report failures explicitly instead of letting them pass unnoticed.
if failed_ids:
    print(f"{len(failed_ids)} image(s) could not be downloaded, e.g. {failed_ids[:5]}")




100%|██████████| 1113/1113 [45:31<00:00,  2.45s/it]   
