In [None]:
import pandas as pd

# load metadata
metadata = pd.read_csv('/content/young_metadata.csv')
print(metadata['file_path'].head(10))


0    ISIC-images/ISIC_0052259.jpg
1    ISIC-images/ISIC_0052310.jpg
2    ISIC-images/ISIC_0073521.jpg
3    ISIC-images/ISIC_0073522.jpg
4    ISIC-images/ISIC_0073642.jpg
5    ISIC-images/ISIC_0073665.jpg
6    ISIC-images/ISIC_0073674.jpg
7    ISIC-images/ISIC_0073865.jpg
8    ISIC-images/ISIC_0073975.jpg
9    ISIC-images/ISIC_0074413.jpg
Name: file_path, dtype: object


In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img
from shutil import copyfile

# directories
malignant_images_dir = '/content/malignant_images'  # folder containing original malignant images
augmented_images_dir = '/content/augmented_malignant_images'  # output folder for augmented images
young_images_dir = '/content/young_images'  #directory for young images
young_metadata_path = '/content/young_metadata.csv'
augmented_images_dir = '/content/augmented_malignant_images'  # augmented images output

# ensure the augmented images directory exists
os.makedirs(augmented_images_dir, exist_ok=True)

# load metadata
import pandas as pd
young_metadata = pd.read_csv(young_metadata_path)

# define the ImageDataGenerator for augmentation
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# process all malignant images in the directory and generate augmentations
for img_file in os.listdir(malignant_images_dir):
    img_path = os.path.join(malignant_images_dir, img_file)

    # load and preprocess the image
    try:
        img = load_img(img_path, target_size=(224, 224))
        img_array = img_to_array(img)
        img_array = img_array.reshape((1,) + img_array.shape)
    except Exception as e:
        print(f"Error processing image {img_file}: {e}")
        continue

    # augmentation
    i = 0
    base_filename = img_file.split('.')[0]

    for batch in datagen.flow(img_array, batch_size=1, save_to_dir=augmented_images_dir,
                              save_prefix=base_filename, save_format='jpeg'):
        i += 1
        if i >= 500:  # create 500 augmentations per image
            break

print(f"Augmentation complete. Augmented images saved to: {augmented_images_dir}")


Augmentation complete. Augmented images saved to: /content/augmented_malignant_images


In [None]:
# check number of augmented images created
augmented_images = os.listdir(augmented_images_dir)
print(f"Number of augmented malignant images: {len(augmented_images)}")

Number of augmented malignant images: 11697


In [None]:
import os

# directory with augmented images
augmented_images_dir = '/content/augmented_malignant_images'
augmented_images = os.listdir(augmented_images_dir)

print(f"Total Augmented Images: {len(augmented_images)}")
for filename in augmented_images[:10]:
    print(filename)


Total Augmented Images: 11697
ISIC_7464445_0_3770.jpeg
ISIC_0275647_0_6131.jpeg
ISIC_7464445_0_1469.jpeg
ISIC_6730784_0_4841.jpeg
ISIC_6730784_0_3075.jpeg
ISIC_6730784_0_9195.jpeg
ISIC_8569576_0_1921.jpeg
ISIC_3951022_0_5757.jpeg
ISIC_4927574_0_9009.jpeg
ISIC_7413005_0_575.jpeg


In [None]:
from collections import defaultdict

grouped_files = defaultdict(list)

for filename in augmented_images:
    original_prefix = filename.split('_0_')[0]
    grouped_files[original_prefix].append(filename)

# print the number of augmentations for each original image
for prefix, files in grouped_files.items():
    print(f"{prefix}: {len(files)} augmentations")


ISIC_7464445: 481 augmentations
ISIC_0275647: 478 augmentations
ISIC_6730784: 489 augmentations
ISIC_8569576: 488 augmentations
ISIC_3951022: 486 augmentations
ISIC_4927574: 491 augmentations
ISIC_7413005: 487 augmentations
ISIC_4883996: 489 augmentations
ISIC_1721956: 490 augmentations
ISIC_3581982: 489 augmentations
ISIC_4629159: 485 augmentations
ISIC_9726753: 490 augmentations
ISIC_5221028: 490 augmentations
ISIC_0848536: 488 augmentations
ISIC_7285904: 485 augmentations
ISIC_1645097: 482 augmentations
ISIC_8336318: 479 augmentations
ISIC_7160427: 483 augmentations
ISIC_3164288: 491 augmentations
ISIC_2480600: 491 augmentations
ISIC_0554252: 494 augmentations
ISIC_5956582: 491 augmentations
ISIC_0279372: 488 augmentations
ISIC_6025411: 492 augmentations


In [None]:
import pandas as pd

# load young metadata
metadata_path = '/content/young_metadata.csv'
metadata = pd.read_csv(metadata_path)

# add augmented images to metadata
augmented_entries = []
for prefix, files in grouped_files.items():
    for file in files:
        augmented_entries.append({
            'filename': file,
            'benign_malignant': 'malignant',
            'parent_image': f"{prefix}.jpg"
        })

# append to existing metadata
augmented_df = pd.DataFrame(augmented_entries)
metadata = pd.concat([metadata, augmented_df], ignore_index=True)

updated_metadata_path = '/content/updated_metadata.csv'
metadata.to_csv(updated_metadata_path, index=False)
print(f"Updated metadata saved to: {updated_metadata_path}")


Updated metadata saved to: /content/updated_metadata.csv


In [None]:
updated_df = pd.read_csv(updated_metadata_path)
updated_df.head()

Unnamed: 0,isic_id,age_approx,anatom_site_general,benign_malignant,file_path,target,filename,parent_image
0,ISIC_0052259,45.0,anterior torso,benign,ISIC-images/ISIC_0052259.jpg,0.0,,
1,ISIC_0052310,45.0,posterior torso,benign,ISIC-images/ISIC_0052310.jpg,0.0,,
2,ISIC_0073521,40.0,anterior torso,benign,ISIC-images/ISIC_0073521.jpg,0.0,,
3,ISIC_0073522,30.0,anterior torso,benign,ISIC-images/ISIC_0073522.jpg,0.0,,
4,ISIC_0073642,45.0,upper extremity,benign,ISIC-images/ISIC_0073642.jpg,0.0,,


In [None]:
updated_df['benign_malignant'].value_counts()

Unnamed: 0_level_0,count
benign_malignant,Unnamed: 1_level_1
benign,42257
malignant,11721
indeterminate,12


In [None]:
import zipfile

zip_path = '/content/young_images.zip'
extraction_dir = '/content/young_images'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

print(f"Extracted images to: {extraction_dir}")


Extracted images to: /content/young_images


In [None]:
import os
import shutil

# directories
original_images_dir = '/content/young_images'  # extracted original images
augmented_images_dir = '/content/augmented_malignant_images'  # augmented malignant images
combined_dir = '/content/combined_images'  # directory for combined dataset

# ensure the combined directory exists
os.makedirs(combined_dir, exist_ok=True)

# copy original images to the combined directory
for img_file in os.listdir(original_images_dir):
    src_path = os.path.join(original_images_dir, img_file)
    dst_path = os.path.join(combined_dir, img_file)
    shutil.copy(src_path, dst_path)

# copy augmented images to the combined directory
for img_file in os.listdir(augmented_images_dir):
    src_path = os.path.join(augmented_images_dir, img_file)
    dst_path = os.path.join(combined_dir, img_file)
    shutil.copy(src_path, dst_path)

print(f"Combined dataset saved to: {combined_dir}")


Combined dataset saved to: /content/combined_images


In [None]:
# count files in the combined directory
total_files = len(os.listdir(combined_dir))
print(f"Total images in combined dataset: {total_files}")


Total images in combined dataset: 53990


In [None]:
import shutil

combined_zip_path = '/content/combined_images.zip'

shutil.make_archive(combined_zip_path.replace('.zip', ''), 'zip', combined_dir)

print(f"Combined dataset compressed to: {combined_zip_path}")


Combined dataset compressed to: /content/combined_images.zip


In [None]:
import os
import zipfile
from PIL import Image
import pandas as pd

# paths
zip_file_path = "/content/old_images.zip"
csv_path = "/content/old_metadata.csv"
malignant_dir = "/content/malignant_images"

# create the directory if it doesn't exist
if not os.path.exists(malignant_dir):
    os.makedirs(malignant_dir)

# load metadata
metadata = pd.read_csv(csv_path)

# extract malignant file names (strip "ISIC-images/" prefix)
malignant_files = metadata[metadata['benign_malignant'] == 'malignant']['file_path'].str.split('/').str[-1].tolist()

# extract images from the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_files = zip_ref.namelist()
    for file in malignant_files:
        if file in zip_files:
            with zip_ref.open(file) as image_file:
                image = Image.open(image_file)
                image.save(os.path.join(malignant_dir, file))

extracted_files = os.listdir(malignant_dir)
print(f"Extracted {len(extracted_files)} malignant images.")

Extracted 217 malignant images.


In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img
from shutil import copyfile

# directories
malignant_images_dir = '/content/malignant_images'  # folder containing original malignant images
augmented_images_dir = '/content/augmented_malignant_images'  # output folder for augmented images
old_images_dir = '/content/old_images'  #directory for old images
old_metadata_path = '/content/old_metadata.csv'
augmented_images_dir = '/content/augmented_malignant_images'  # augmented images output

# ensure the augmented images directory exists
os.makedirs(augmented_images_dir, exist_ok=True)

# load metadata
import pandas as pd
old_metadata = pd.read_csv(old_metadata_path)

# define the ImageDataGenerator for augmentation
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# generate augmentations
for img_file in os.listdir(malignant_images_dir):
    img_path = os.path.join(malignant_images_dir, img_file)

    # load and preprocess the image
    try:
        img = load_img(img_path, target_size=(224, 224))
        img_array = img_to_array(img)
        img_array = img_array.reshape((1,) + img_array.shape)
    except Exception as e:
        print(f"Error processing image {img_file}: {e}")
        continue

    # augmentation
    i = 0
    base_filename = img_file.split('.')[0]

    for batch in datagen.flow(img_array, batch_size=1, save_to_dir=augmented_images_dir,
                              save_prefix=base_filename, save_format='jpeg'):
        i += 1
        if i >= 100:  # create 100 augmentations per image
            break

print(f"Augmentation complete. Augmented images saved to: {augmented_images_dir}")


Augmentation complete. Augmented images saved to: /content/augmented_malignant_images


In [None]:
# check number of images created
augmented_images = os.listdir(augmented_images_dir)
print(f"Number of augmented malignant images: {len(augmented_images)}")

Number of augmented malignant images: 21608


In [None]:
import os

augmented_images_dir = '/content/augmented_malignant_images'

augmented_images = os.listdir(augmented_images_dir)

print(f"Total Augmented Images: {len(augmented_images)}")
for filename in augmented_images[:10]:
    print(filename)

Total Augmented Images: 21608
ISIC_9090322_0_3873.jpeg
ISIC_0330452_0_18.jpeg
ISIC_3201295_0_7943.jpeg
ISIC_2235682_0_5298.jpeg
ISIC_8520464_0_3667.jpeg
ISIC_9127831_0_8293.jpeg
ISIC_3467026_0_6699.jpeg
ISIC_0293670_0_2679.jpeg
ISIC_1540514_0_6599.jpeg
ISIC_6166598_0_9414.jpeg


In [None]:
from collections import defaultdict

grouped_files = defaultdict(list)

for filename in augmented_images:
    original_prefix = filename.split('_0_')[0]
    grouped_files[original_prefix].append(filename)

# print the number of augmentations for each original image
for prefix, files in grouped_files.items():
    print(f"{prefix}: {len(files)} augmentations")

ISIC_9090322: 100 augmentations
ISIC_0330452: 98 augmentations
ISIC_3201295: 100 augmentations
ISIC_2235682: 99 augmentations
ISIC_8520464: 100 augmentations
ISIC_9127831: 100 augmentations
ISIC_3467026: 100 augmentations
ISIC_0293670: 100 augmentations
ISIC_1540514: 100 augmentations
ISIC_6166598: 100 augmentations
ISIC_7379567: 99 augmentations
ISIC_2138706: 100 augmentations
ISIC_0157834: 98 augmentations
ISIC_7400037: 99 augmentations
ISIC_6001603: 100 augmentations
ISIC_0918465: 99 augmentations
ISIC_9690994: 100 augmentations
ISIC_1480614: 100 augmentations
ISIC_3091180: 99 augmentations
ISIC_3646371: 99 augmentations
ISIC_6465391: 100 augmentations
ISIC_7499766: 100 augmentations
ISIC_5542071: 100 augmentations
ISIC_7026299: 100 augmentations
ISIC_2764227: 100 augmentations
ISIC_5419141: 100 augmentations
ISIC_3721080: 100 augmentations
ISIC_2396582: 100 augmentations
ISIC_8852993: 100 augmentations
ISIC_8899671: 100 augmentations
ISIC_4425681: 99 augmentations
ISIC_5039397: 100

In [None]:
import pandas as pd

# load metadata
metadata_path = '/content/old_metadata.csv'
metadata = pd.read_csv(metadata_path)

# add augmented images to metadata
augmented_entries = []
for prefix, files in grouped_files.items():
    for file in files:
        augmented_entries.append({
            'filename': file,
            'benign_malignant': 'malignant',
            'parent_image': f"{prefix}.jpg"
        })

# append to existing metadata
augmented_df = pd.DataFrame(augmented_entries)
metadata = pd.concat([metadata, augmented_df], ignore_index=True)

updated_metadata_path = '/content/updated_metadata.csv'
metadata.to_csv(updated_metadata_path, index=False)
print(f"Updated metadata saved to: {updated_metadata_path}")

Updated metadata saved to: /content/updated_metadata.csv


In [None]:
updated_df = pd.read_csv(updated_metadata_path)
updated_df.head()

  updated_df = pd.read_csv(updated_metadata_path)


Unnamed: 0,isic_id,age_approx,anatom_site_general,benign_malignant,file_path,target,filename,parent_image
0,ISIC_0015670,60.0,lower extremity,benign,ISIC-images/ISIC_0015670.jpg,0.0,,
1,ISIC_0015845,60.0,head/neck,benign,ISIC-images/ISIC_0015845.jpg,0.0,,
2,ISIC_0015864,60.0,posterior torso,benign,ISIC-images/ISIC_0015864.jpg,0.0,,
3,ISIC_0015902,65.0,anterior torso,benign,ISIC-images/ISIC_0015902.jpg,0.0,,
4,ISIC_0024200,55.0,anterior torso,benign,ISIC-images/ISIC_0024200.jpg,0.0,,


In [None]:
updated_df['benign_malignant'].value_counts()

Unnamed: 0_level_0,count
benign_malignant,Unnamed: 1_level_1
benign,194772
malignant,21825
indeterminate,20


In [None]:
import zipfile

zip_path = '/content/old_images.zip'
extraction_dir = '/content/old_images'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

print(f"Extracted images to: {extraction_dir}")

Extracted images to: /content/old_images


In [None]:
import os
import shutil

# directories
original_images_dir = '/content/old_images'  # extracted original images
augmented_images_dir = '/content/augmented_malignant_images'  # augmented malignant images
combined_dir = '/content/combined_images'  # directory for combined dataset

# ensure the combined directory exists
os.makedirs(combined_dir, exist_ok=True)

# copy original images to the combined directory
for img_file in os.listdir(original_images_dir):
    src_path = os.path.join(original_images_dir, img_file)
    dst_path = os.path.join(combined_dir, img_file)
    shutil.copy(src_path, dst_path)

# copy augmented images to the combined directory
for img_file in os.listdir(augmented_images_dir):
    src_path = os.path.join(augmented_images_dir, img_file)
    dst_path = os.path.join(combined_dir, img_file)
    shutil.copy(src_path, dst_path)

print(f"Combined dataset saved to: {combined_dir}")

Combined dataset saved to: /content/combined_images


In [None]:
# count files in the combined directory
total_files = len(os.listdir(combined_dir))
print(f"Total images in combined dataset: {total_files}")

Total images in combined dataset: 216617


In [None]:
import shutil

combined_zip_path = '/content/combined_images.zip'

shutil.make_archive(combined_zip_path.replace('.zip', ''), 'zip', combined_dir)

print(f"Combined dataset compressed to: {combined_zip_path}")

Combined dataset compressed to: /content/combined_images.zip
