#### 1. Check that every JSON file in the HAM10000 annotated datasets (in the `Datasets` folder) has a corresponding image in the ISIC2018 dataset

In [3]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import defaultdict

%matplotlib inline

In [4]:
# Root of the dataset directory; every other path below is derived from it.
dataset_base_path = '../ham_concept_dataset/'

# Sub-directories of 'Datasets' that contain the annotation JSON files.
json_parent_folders = [
    os.path.join(dataset_base_path, 'Datasets', annotation_dir)
    for annotation_dir in ('ground_truth_annotations', 'study_annotations')
]

# Directories that are searched for the JPG images.
image_folders_paths = [os.path.join(dataset_base_path, 'ISIC2018_Task3_Training_Input')]

In [5]:
# Display the configured image folder list as a quick sanity check.
image_folders_paths

['../ham_concept_dataset/ISIC2018_Task3_Training_Input']

In [6]:
# Collect the extension-less name of every .json file found (recursively)
# under each configured annotation folder.
json_basenames = set()
print("Scanning for JSON files in specified 'Datasets' subfolders:")
for annotation_root in json_parent_folders:
    print(f"  - Scanning under: {os.path.abspath(annotation_root)}")
    # os.path.isdir() is already False for a path that does not exist.
    if not os.path.isdir(annotation_root):
        print(f"Warning: JSON parent folder not found at {annotation_root}")
        continue
    for _dirpath, _dirnames, files_here in os.walk(annotation_root):
        json_basenames.update(
            os.path.splitext(entry)[0]  # e.g., "ISIC_0033928"
            for entry in files_here
            if entry.endswith('.json')
        )

print(f"Found {len(json_basenames)} unique JSON basenames from all specified locations.\n")


Scanning for JSON files in specified 'Datasets' subfolders:
  - Scanning under: /home/nqmtien/THESIS/REIT4841/ham-concept/ham_concept_dataset/Datasets/ground_truth_annotations
  - Scanning under: /home/nqmtien/THESIS/REIT4841/ham-concept/ham_concept_dataset/Datasets/study_annotations
Found 3611 unique JSON basenames from all specified locations.



In [7]:
# Collect the extension-less name of every .jpg file that sits directly
# inside each configured image folder (sub-directories are not descended into).
image_basenames = set()
print("Scanning for JPG files in ISIC2018 Task3 Input folders:")
for image_dir in image_folders_paths:
    print(f"  - Scanning: {os.path.abspath(image_dir)}")
    # os.path.isdir() is already False for a path that does not exist.
    if not os.path.isdir(image_dir):
        print(f"Warning: Image folder not found at {image_dir}")
        continue
    image_basenames |= {
        os.path.splitext(entry)[0]  # e.g., "ISIC_0033928"
        for entry in os.listdir(image_dir)
        if entry.endswith('.jpg')
    }
print(f"Found {len(image_basenames)} unique JPG basenames in total.\n")


Scanning for JPG files in ISIC2018 Task3 Input folders:
  - Scanning: /home/nqmtien/THESIS/REIT4841/ham-concept/ham_concept_dataset/ISIC2018_Task3_Training_Input
Found 10015 unique JPG basenames in total.



In [8]:
# Build the list of annotation files (reported with a ".json" suffix) whose
# basename has no matching JPG basename.
# The set difference is sorted so the report order is stable across kernel
# restarts; iterating a set of strings directly can yield a different order in
# each Python process.
if not json_basenames:
    print("No JSON files found to check.")
missing_images_for_json = [
    f"{json_basename}.json"
    for json_basename in sorted(json_basenames - image_basenames)
]


In [9]:
# Summarise the JSON -> JPG cross-check.
# Nothing is printed here when no JSON files were found: that case is already
# reported by the cell that builds `missing_images_for_json`.
if json_basenames:
    if missing_images_for_json:
        print(f"Found {len(missing_images_for_json)} unique JSON file basename(s) from the 'Datasets' subfolders without a corresponding JPG image in the ISIC2018_Task3_Input folders:")
        # Basenames were de-duplicated across the JSON folders, so each
        # missing name is listed once.
        for missing_json_name in missing_images_for_json:
            stem, _extension = os.path.splitext(missing_json_name)
            print(f"  - {missing_json_name} (basename: {stem})")
    else:
        print("All unique JSON files found in the specified 'Datasets' subfolders have a corresponding JPG image in the ISIC2018_Task3_Input folders.")

print("\nVerification complete.")

All unique JSON files found in the specified 'Datasets' subfolders have a corresponding JPG image in the ISIC2018_Task3_Input folders.

Verification complete.


#### 2. Check that every image ID in `metadata_ground_truth.csv` has an entry in the ISIC2018 Task 3 training ground truth

In [10]:
# CSV whose image IDs are checked for presence in the ground-truth CSV.
metadata_file_path = "../ham_concept_dataset/Datasets/metadata/metadata_ground_truth.csv"
# CSV that serves as the reference list of image IDs.
groundtruth_file_path = "../ham_concept_dataset/ISIC2018_Task3_Training_GroundTruth/ISIC2018_Task3_Training_GroundTruth.csv"

In [11]:
# Name of the image-identifier column in the metadata CSV.
metadata_id_column = 'image_id'
# Name of the image-identifier column in the ground-truth CSV.
groundtruth_id_column = 'image'

In [14]:
def load_image_id_set(csv_path, id_column, label, remove_substring=None):
    """Read a CSV and collect the unique values of its image-ID column.

    Validation failures raise instead of printing and carrying on, so the
    caller reports exactly one accurate error per problem.

    Args:
        csv_path: Path of the CSV file to read.
        id_column: Name of the column holding the image identifiers.
        label: Human-readable name of the file used in error messages
            (e.g. "Metadata").
        remove_substring: Optional literal text (such as ".jpg"); every
            occurrence is removed from each ID before the set is built.

    Returns:
        A ``(frame, image_ids)`` tuple: the loaded DataFrame and the set of
        IDs converted to strings.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        ValueError: If ``id_column`` is not a column of the loaded CSV.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"{label} file not found at '{csv_path}'")

    frame = pd.read_csv(csv_path)
    print(f"Successfully loaded '{csv_path}'. Shape: {frame.shape}")

    if id_column not in frame.columns:
        raise ValueError(
            f"Column '{id_column}' not found in '{csv_path}'. "
            f"Available columns: {frame.columns.tolist()}. "
            f"Please ensure '{id_column}' is the correct image identifier column name for this file."
        )

    ids = frame[id_column].astype(str)
    if remove_substring:
        # Literal (non-regex) replacement, so "." is not treated as a wildcard.
        ids = ids.str.replace(remove_substring, '', regex=False)
    return frame, set(ids)


try:
    metadata_df, metadata_image_ids = load_image_id_set(
        metadata_file_path, metadata_id_column, 'Metadata', remove_substring='.jpg'
    )
    print(f"Found {len(metadata_image_ids)} unique image IDs in '{metadata_file_path}' (after potential .jpg removal).")
    if not metadata_image_ids:
        print(f"Warning: No image IDs found in '{metadata_id_column}' of '{metadata_file_path}'.")

    groundtruth_df, groundtruth_image_ids = load_image_id_set(
        groundtruth_file_path, groundtruth_id_column, 'Groundtruth'
    )
    print(f"Found {len(groundtruth_image_ids)} unique image IDs in '{groundtruth_file_path}'.")
    if not groundtruth_image_ids:
        print(f"Warning: No image IDs found in '{groundtruth_id_column}' of '{groundtruth_file_path}'.")

    # IDs present in the metadata but absent from the ground truth.
    missing_ids = metadata_image_ids.difference(groundtruth_image_ids)

    if not missing_ids:
        print(f"\nSUCCESS: All {len(metadata_image_ids)} image_ids from '{metadata_file_path}' (column '{metadata_id_column}') "
                f"have a corresponding entry in '{groundtruth_file_path}' (column '{groundtruth_id_column}').")
    else:
        print(f"\nWARNING: {len(missing_ids)} image_id(s) from '{metadata_file_path}' (column '{metadata_id_column}') "
                f"are MISSING in '{groundtruth_file_path}' (column '{groundtruth_id_column}'):")
        # Only list the first few IDs so a long list does not flood the output.
        max_listed = 20
        ordered_missing = sorted(missing_ids)
        for img_id in ordered_missing[:max_listed]:
            print(f"  - {img_id}")
        if len(ordered_missing) > max_listed:
            print(f"  ... and {len(ordered_missing) - max_listed} more.")
        print("\nPlease verify these IDs.")

except (FileNotFoundError, ValueError) as e:
    # Expected problems (missing file, wrong column name): report once, clearly.
    print(f"Error: {e}")
except Exception as e:
    # Best-effort catch-all kept from the original cell so the notebook keeps
    # running; variables assigned above may be undefined after a failure.
    print(f"An unexpected error occurred: {e}")

Successfully loaded '../ham_concept_dataset/Datasets/metadata/metadata_ground_truth.csv'. Shape: (6498, 29)
Found 3611 unique image IDs in '../ham_concept_dataset/Datasets/metadata/metadata_ground_truth.csv' (after potential .jpg removal).
Successfully loaded '../ham_concept_dataset/ISIC2018_Task3_Training_GroundTruth/ISIC2018_Task3_Training_GroundTruth.csv'. Shape: (10015, 8)
Found 10015 unique image IDs in '../ham_concept_dataset/ISIC2018_Task3_Training_GroundTruth/ISIC2018_Task3_Training_GroundTruth.csv'.

SUCCESS: All 3611 image_ids from '../ham_concept_dataset/Datasets/metadata/metadata_ground_truth.csv' (column 'image_id') have a corresponding entry in '../ham_concept_dataset/ISIC2018_Task3_Training_GroundTruth/ISIC2018_Task3_Training_GroundTruth.csv' (column 'image').
