<a href="https://colab.research.google.com/github/josephmargaryan/Biomedical-Image-Segmentation/blob/main/ISIC_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
# Install the Kaggle library
!pip install kaggle

# Upload your Kaggle API key file if you haven't already (this will prompt you to choose a file from your local machine)
from google.colab import files
files.upload()

# Move the API key into the correct folder
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Change permissions (to avoid a warning on Kaggle tool startup)
!chmod 600 ~/.kaggle/kaggle.json




Saving kaggle.json to kaggle (1).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Download the data from Kaggle
!kaggle competitions download -c isic-2024-challenge

# Unzip the downloaded files (adjust the path and filenames as per your downloaded data structure)
!unzip -q isic-2024-challenge.zip -d isic-2024-challenge

# List the contents of the directory to verify the data is downloaded
!ls isic-2024-challenge


isic-2024-challenge.zip: Skipping, found more recently modified local copy (use --force to force download)
replace isic-2024-challenge/sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Download the dataset
!kaggle datasets download -d andrewmvd/isic-2019

# Unzip the downloaded dataset
!unzip -q isic-2019.zip -d ./isic-2019


Dataset URL: https://www.kaggle.com/datasets/andrewmvd/isic-2019
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading isic-2019.zip to /content
100% 9.08G/9.10G [02:39<00:00, 78.8MB/s]
100% 9.10G/9.10G [02:39<00:00, 61.2MB/s]
unzip:  cannot find or open andrewmvd/isic-2019.zip, andrewmvd/isic-2019.zip.zip or andrewmvd/isic-2019.zip.ZIP.


In [None]:
# Download the dataset
!kaggle datasets download -d nischaydnk/isic-2020-jpg-224x224-resized

# Unzip the downloaded dataset
!unzip -q isic-2020-jpg-224x224-resized.zip -d ./isic-2020-jpg-224x224-resized


Dataset URL: https://www.kaggle.com/datasets/nischaydnk/isic-2020-jpg-224x224-resized
License(s): CC0-1.0
Downloading isic-2020-jpg-224x224-resized.zip to /content
 97% 457M/473M [00:06<00:00, 50.5MB/s]
100% 473M/473M [00:06<00:00, 76.9MB/s]


In [None]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision.transforms.v2 as v2
import math
from collections import defaultdict
import os
import h5py
import io

### Load 2019 data

In [None]:
images_2019 = sorted(glob("/content/isic-2019/ISIC_2019_Training_Input/ISIC_2019_Training_Input/*.jpg"))
labels_2019 = pd.read_csv("/content/isic-2019/ISIC_2019_Training_GroundTruth.csv")

# Define benign and malignant categories
benign_labels = ['NV', 'BKL', 'DF', 'VASC']
malignant_labels = ['MEL', 'BCC', 'AK', 'SCC']

# Initialize the targets column with zeros
labels_2019['targets'] = 0

# Set targets to 1 for malignant cases
labels_2019.loc[labels_2019[malignant_labels].sum(axis=1) > 0, 'targets'] = 1

# Select only the necessary columns
labels_2019 = labels_2019[['image', 'targets']]

df_2019 = pd.DataFrame({'image_paths': images_2019, 'targets': labels_2019['targets']})
print(df_2019.head())

                                         image_paths  targets
0  /content/isic-2019/ISIC_2019_Training_Input/IS...        0
1  /content/isic-2019/ISIC_2019_Training_Input/IS...        0
2  /content/isic-2019/ISIC_2019_Training_Input/IS...        1
3  /content/isic-2019/ISIC_2019_Training_Input/IS...        0
4  /content/isic-2019/ISIC_2019_Training_Input/IS...        1


### Load 2020 data

In [None]:
# Load image paths using glob and ensure sorted order
image_paths_2020 = sorted(glob("/content/isic-2020-jpg-224x224-resized/train-image/image/*.jpg"))

# Load metadata
metadata_2020 = "/content/isic-2020-jpg-224x224-resized/train-metadata.csv"
metadata_2020_df = pd.read_csv(metadata_2020)

# Extract isic_id from image filenames
image_ids = [os.path.basename(path).split('.')[0] for path in image_paths_2020]

# Filter metadata to include only those with matching isic_id
filtered_metadata = metadata_2020_df[metadata_2020_df['isic_id'].isin(image_ids)]

# Ensure image paths and metadata align
aligned_image_paths = []
aligned_targets = []

for img_id in image_ids:
    if img_id in filtered_metadata['isic_id'].values:
        aligned_image_paths.append(image_paths_2020[image_ids.index(img_id)])
        aligned_targets.append(filtered_metadata[filtered_metadata['isic_id'] == img_id]['target'].values[0])

# Create DataFrame with filtered image paths, isic_id, and targets
df_2020 = pd.DataFrame({
    'image_paths': aligned_image_paths,
    'isic_id': image_ids[:len(aligned_image_paths)],  # Ensure alignment
    'targets': aligned_targets
})

# Sort by 'isic_id' and 'target' if needed
df_2020 = df_2020.sort_values(by=['isic_id', 'targets'])
df_2020 = df_2020.drop(columns=['isic_id'], axis=1)

# Display the DataFrame
print(df_2020.head())

                                          image_path       isic_id  target
0  /content/isic-2020-jpg-224x224-resized/train-i...  ISIC_0015719       0
1  /content/isic-2020-jpg-224x224-resized/train-i...  ISIC_0052212       0
2  /content/isic-2020-jpg-224x224-resized/train-i...  ISIC_0068279       0
3  /content/isic-2020-jpg-224x224-resized/train-i...  ISIC_0074268       0
4  /content/isic-2020-jpg-224x224-resized/train-i...  ISIC_0074311       0


In [None]:
metadata = pd.read_csv('/content/isic-2024-challenge/train-metadata.csv', low_memory=False)
isic_ids = metadata['isic_id'].values
targets = metadata['target'].values

# Initialize lists to store image paths and targets
image_paths = []
all_targets = []

# Open the HDF5 file
with h5py.File('/content/isic-2024-challenge/train-image.hdf5', 'r') as fp_hdf:
    for isic_id, target in zip(isic_ids, targets):
        image_data = fp_hdf[isic_id][()]

        # Save the image data as JPEG file
        image_path = f"/tmp/{isic_id}.jpg"
        with open(image_path, 'wb') as f:
            f.write(image_data)

        # Append the image path and target to lists
        image_paths.append(image_path)
        all_targets.append(target)

# Create a DataFrame with image paths and targets
df_2024 = pd.DataFrame({'image_paths': image_paths, 'targets': all_targets})
print(df_2024.head())

             image_paths  targets
0  /tmp/ISIC_0015670.jpg        0
1  /tmp/ISIC_0015845.jpg        0
2  /tmp/ISIC_0015864.jpg        0
3  /tmp/ISIC_0015902.jpg        0
4  /tmp/ISIC_0024200.jpg        0


In [None]:
merged_df = pd.concat((df_2024, df_2020, df_2019), axis=0).reset_index(drop=True)
merged_df

Unnamed: 0,image_paths,targets,image_path,isic_id,target
0,/tmp/ISIC_0015670.jpg,0.0,,,
1,/tmp/ISIC_0015845.jpg,0.0,,,
2,/tmp/ISIC_0015864.jpg,0.0,,,
3,/tmp/ISIC_0015902.jpg,0.0,,,
4,/tmp/ISIC_0024200.jpg,0.0,,,
...,...,...,...,...,...
459511,/content/isic-2019/ISIC_2019_Training_Input/IS...,1.0,,,
459512,/content/isic-2019/ISIC_2019_Training_Input/IS...,0.0,,,
459513,/content/isic-2019/ISIC_2019_Training_Input/IS...,1.0,,,
459514,/content/isic-2019/ISIC_2019_Training_Input/IS...,0.0,,,
