In [None]:
import os
import pandas as pd

# Base path to dataset (adjust if needed)
import kagglehub
# Human-readable names for the short diagnosis codes found in the "dx" column.
dx_map = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

# Fetch the dataset via kagglehub; the returned value is used below as the
# dataset's root directory.
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")

# Read the metadata table located directly under the dataset root.
df = pd.read_csv(os.path.join(path, "HAM10000_metadata.csv"))
print(f"Initial number of rows: {len(df)}")

# Attach the readable label for each row; codes missing from dx_map map to NaN.
df = df.assign(dx_full=df["dx"].map(dx_map))

# Function to resolve correct image path
def find_image_path(
    image_id,
    base_path,
    *,
    subfolders=("HAM10000_images_part_1", "HAM10000_images_part_2"),
    extension=".jpg",
):
    """Return the path of the image file for *image_id*, or None if absent.

    Each entry of *subfolders* is checked in order under *base_path* for a
    file named ``<image_id><extension>``; the first existing candidate wins.

    Args:
        image_id: Image identifier, i.e. the file name without its extension.
        base_path: Directory that contains the image sub-folders.
        subfolders: Sub-folder names to search, in priority order. Defaults
            to the two HAM10000 image parts.
        extension: File extension including the leading dot. Defaults to
            ``".jpg"``.

    Returns:
        The joined path (as built by ``os.path.join``) of the first existing
        candidate, or ``None`` when no sub-folder contains the file.
    """
    file_name = f"{image_id}{extension}"
    for subfolder in subfolders:
        candidate = os.path.join(base_path, subfolder, file_name)
        if os.path.exists(candidate):
            return candidate
    return None  # If image not found

# Look up the on-disk image for every metadata row (None when nothing is found).
df["file_path"] = [find_image_path(image_id, path) for image_id in df["image_id"]]

# Keep only the rows whose image file was located.
df = df[df["file_path"].notna()]
print(f"Samples with valid images: {len(df)}")

# Write the cleaned table to disk, creating the output directory if needed.
os.makedirs("../outputs/data", exist_ok=True)
df.to_csv("../outputs/data/ham10000_preprocessed.csv", index=False)

print("Preprocessing complete. Data saved to ../outputs/data/ham10000_preprocessed.csv")


  from .autonotebook import tqdm as notebook_tqdm


Initial number of rows: 10015
Samples with valid images: 10015
✅ Preprocessing complete. Data saved to ../outputs/data/ham10000_preprocessed.csv
