In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
from pathlib import Path

from tqdm import tqdm

## Define directories for images, metadata (CSV), and target location for NPZ files.

In [None]:
# Folder containing the HAM10000 JPEG images. The default is a machine-specific
# absolute path; set the HAM10000_IMAGES_DIR environment variable to point at a
# local copy of the images instead of editing this cell.
images_dir = Path(
    os.environ.get(
        "HAM10000_IMAGES_DIR",
        "/local-scratch/localhome/kabhishe/WorkingDir/HAM10000/images",
    )
)
# Folder holding the metadata CSV files (relative to the working directory).
metadata_dir = Path("CSV_files")
# Folder the generated NPZ files are written to (relative to the working directory).
npz_files_save_dir = Path("NPZ_files")

## Define a diagnosis-to-integer mapping.

This will be used to assign labels in the NPZ files.

In [None]:
# Diagnosis codes are listed in the order of their integer labels: the position
# of a code in the tuple is the label stored in the NPZ files.
dx_int_map = {
    dx_code: label
    for label, dx_code in enumerate(
        ("akiec", "bcc", "bkl", "df", "mel", "nv", "vasc")
    )
}

## Define a function that, given a metadata file (CSV), creates an NPZ file containing the resized images and integer labels of the train, validation, and test splits.

In [None]:
def dataset_to_npz(images_dir: Path, metadata_csv: Path, npz_save_dir: Path, npz_filename: str, size: int):
    """Write one compressed NPZ file with resized images and integer labels.

    Every row of the metadata CSV is read, the corresponding JPEG is loaded
    from ``images_dir``, converted to RGB, resized to ``size`` x ``size``
    with bicubic resampling, and appended to the split named in the row.
    The diagnosis code of the row is translated to an integer through the
    module-level ``dx_int_map``.

    Args:
        images_dir: Folder containing ``<image_id>.jpg`` files.
        metadata_csv: CSV file with (at least) the columns ``image_id``,
            ``dx`` and ``split``; ``split`` must be one of ``train``,
            ``val`` or ``test``.
        npz_save_dir: Folder the NPZ file is written to. It is created
            (including parents) if it does not exist yet.
        npz_filename: Name of the output file without the ``.npz`` suffix.
        size: Side length, in pixels, of the square output images.

    Returns:
        None. The arrays ``{split}_images`` (uint8, shape
        ``(N, size, size, 3)``) and ``{split}_labels`` (uint8, shape
        ``(N, 1)``) are saved for each of the three splits.

    Raises:
        KeyError: If a row has a ``split`` other than train/val/test or a
            ``dx`` value that is missing from ``dx_int_map``.
        FileNotFoundError: If the CSV or one of the images does not exist.
    """
    splits = ("train", "val", "test")
    images = {split: [] for split in splits}
    labels = {split: [] for split in splits}

    # Create the output folder up front so a missing directory does not make
    # the final save fail after all images have already been processed.
    npz_save_dir.mkdir(parents=True, exist_ok=True)

    metadata_df = pd.read_csv(metadata_csv, header="infer")

    # iterrows() is a generator without a length, so pass the row count
    # explicitly to let tqdm display a percentage and an ETA.
    for _, row in tqdm(metadata_df.iterrows(), total=len(metadata_df)):
        split = row["split"]

        # The context manager closes the underlying file deterministically.
        # Converting to RGB guarantees a (size, size, 3) array for every image,
        # which is required for stacking; convert() and resize() both return
        # new in-memory images, so the result stays valid after the file closes.
        with Image.open(images_dir / (row["image_id"] + ".jpg")) as image:
            resized_image = image.convert("RGB").resize((size, size), resample=Image.BICUBIC)

        images[split].append(np.asarray(resized_image))
        labels[split].append(dx_int_map[row["dx"]])

    npz_output = {}
    for split in splits:
        if images[split]:
            split_images = np.stack(images[split]).astype(np.uint8)
        else:
            # np.stack() rejects an empty list; store a correctly shaped empty
            # array so a metadata file without this split can still be exported.
            split_images = np.empty((0, size, size, 3), dtype=np.uint8)
        npz_output[f"{split}_images"] = split_images
        npz_output[f"{split}_labels"] = np.asarray(labels[split], dtype=np.uint8).reshape(-1, 1)

    # Keys are train_images, train_labels, val_images, val_labels,
    # test_images, test_labels.
    np.savez_compressed(npz_save_dir / (npz_filename + ".npz"), **npz_output)

## Create 28x28 and 224x224 versions for DermaMNIST-C.

In [None]:
# Export DermaMNIST-C at both resolutions from the same corrected metadata file.
for npz_name, side in (
    ("dermamnist_corrected_28", 28),
    ("dermamnist_corrected_224", 224),
):
    dataset_to_npz(
        images_dir=images_dir,
        metadata_csv=metadata_dir / "combined_metadata_corrected-HAM10000_corrected.csv",
        npz_save_dir=npz_files_save_dir,
        npz_filename=npz_name,
        size=side,
    )

## Create 28x28 and 224x224 versions for DermaMNIST-E.

In [None]:
# Export DermaMNIST-E at both resolutions from the same extended metadata file.
for npz_name, side in (
    ("dermamnist_extended_28", 28),
    ("dermamnist_extended_224", 224),
):
    dataset_to_npz(
        images_dir=images_dir,
        metadata_csv=metadata_dir / "combined_extended.csv",
        npz_save_dir=npz_files_save_dir,
        npz_filename=npz_name,
        size=side,
    )