In [20]:
import zipfile
import os
from shutil import copyfile
from sklearn.model_selection import train_test_split

# Load Original Data

In [21]:
!wget 'https://github.com/jkorzeniovski/projekt_zesp/raw/main/data/skin_diseases_dataset.zip'

--2023-11-28 13:59:33--  https://github.com/jkorzeniovski/projekt_zesp/raw/main/data/skin_diseases_dataset.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/jkorzeniovski/projekt_zesp/main/data/skin_diseases_dataset.zip [following]
--2023-11-28 13:59:33--  https://media.githubusercontent.com/media/jkorzeniovski/projekt_zesp/main/data/skin_diseases_dataset.zip
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 336248004 (321M) [application/zip]
Saving to: ‘skin_diseases_dataset.zip.1’


2023-11-28 14:00:03 (194 MB/s) - ‘skin_diseases_dataset.zip.1’ saved [336248004/336248004]



In [22]:
# Unzip data into the current working directory.
# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile('skin_diseases_dataset.zip') as zip_ref:
    zip_ref.extractall()

In [23]:
# Paths of the dataset's original test and train folders.
test_path = "/content/Dataset/test"
train_path = "/content/Dataset/train"

# Report how many entries sit directly inside each of the two folders.
for folder_path in (test_path, train_path):
    print(f"{folder_path}' : {len(os.listdir(folder_path))}")

/content/Dataset/test' : 20
/content/Dataset/train' : 19


In [24]:
# Entries that exist in the test folder but not in the train folder.
extra_classes = sorted(set(os.listdir(test_path)) - set(os.listdir(train_path)))
# Later cells refer to a single `extra_class`, so fail loudly when that
# assumption does not hold instead of picking an arbitrary set element.
assert len(extra_classes) == 1, f"expected exactly one test-only class, found {extra_classes}"
extra_class = extra_classes[0]
extra_class

'Bullous Disease Photos'

In [25]:
# Number of entries inside the test-only class folder.
extra_class_dir = os.path.join(test_path, extra_class)
len(os.listdir(extra_class_dir))

2

One class (`Bullous Disease Photos`, with only 2 entries in the test set) is present in the test set but missing from the training set. Since we cannot obtain more data for it, we will exclude it.

In [26]:
# Count every file beneath each original split, descending into subfolders.
for folder_path in (test_path, train_path):
    total_images = 0
    for _root, _dirs, filenames in os.walk(folder_path):
        total_images += len(filenames)
    print(f"{folder_path}' : {total_images}")

/content/Dataset/test' : 897
/content/Dataset/train' : 2609


Our test data makes up about 25% of all the images we have (897 of 3,506). That should suffice, so we keep the existing test split and only split the original "train" folder into a training set and a validation set.

# Splitting the Images


In [27]:
# Dataset root and the destination folders of the three splits.
DATASET_DIR = "/content/Dataset"
TEST_DIR = "/content/data/test"
VAL_DIR = "/content/data/val"
TRAIN_DIR = "/content/data/train"

# Seed passed to the train/val split.
SEED = 4

# Class labels: the sorted entry names of the original train folder.
CLASS_NAMES = os.listdir(train_path)
CLASS_NAMES.sort()
CLASSES_NUM = len(CLASS_NAMES)

In [28]:
# Create the output root and one folder per split (no-op when they already exist).
output_dirs = ("/content/data", TRAIN_DIR, VAL_DIR, TEST_DIR)
for output_dir in output_dirs:
    os.makedirs(output_dir, exist_ok=True)

In [29]:
# Split the original train folder into train and val, and copy test alongside them.
for class_name in sorted(os.listdir(test_path)):
    if class_name == extra_class:
        continue
    train_class_path = os.path.join(train_path, class_name)
    test_class_path = os.path.join(test_path, class_name)

    # Guard against stray files or any further class missing from one side;
    # os.listdir() below would otherwise abort the loop half-way through.
    if not (os.path.isdir(train_class_path) and os.path.isdir(test_class_path)):
        print(f"Skipping '{class_name}': no class folder in both train and test")
        continue
    print(train_class_path)

    # Images in class folders. The ".jpg" match is case-sensitive, so files with
    # any other extension are not copied. os.listdir() returns entries in
    # arbitrary order, so the lists are sorted to make the seeded split
    # reproducible.
    train_image_files = sorted(
        os.path.join(train_class_path, img)
        for img in os.listdir(train_class_path)
        if img.endswith(".jpg")
    )
    test_files = sorted(
        os.path.join(test_class_path, img)
        for img in os.listdir(test_class_path)
        if img.endswith(".jpg")
    )

    # Split previous train into train and val sets (90% / 10% per class).
    # NOTE: train_test_split raises ValueError if a class has fewer than 2 images.
    train_files, val_files = train_test_split(train_image_files, test_size=0.1, random_state=SEED)

    # Copy the files to new train/val/test dirs, keeping one subfolder per class.
    for files, cur_dir in zip([train_files, val_files, test_files], [TRAIN_DIR, VAL_DIR, TEST_DIR]):
        if not files:
            continue  # do not create a folder for an empty split
        dest_dir = os.path.join(cur_dir, class_name)
        os.makedirs(dest_dir, exist_ok=True)  # once per class and split, not once per file
        for file in files:
            copyfile(file, os.path.join(dest_dir, os.path.basename(file)))

/content/Dataset/train/Acne and Rosacea Photos
/content/Dataset/train/Actinic Keratosis Basal Cell Carcinoma and other Malignant Lesions
/content/Dataset/train/Atopic Dermatitis Photos
/content/Dataset/train/Cellulitis Impetigo and other Bacterial Infections
/content/Dataset/train/Eczema Photos
/content/Dataset/train/Exanthems and Drug Eruptions
/content/Dataset/train/Herpes HPV and other STDs Photos
/content/Dataset/train/Light Diseases and Disorders of Pigmentation
/content/Dataset/train/Lupus and other Connective Tissue diseases
/content/Dataset/train/Melanoma Skin Cancer Nevi and Moles
/content/Dataset/train/Poison Ivy Photos and other Contact Dermatitis
/content/Dataset/train/Psoriasis pictures Lichen Planus and related diseases
/content/Dataset/train/Seborrheic Keratoses and other Benign Tumors
/content/Dataset/train/Systemic Disease
/content/Dataset/train/Tinea Ringworm Candidiasis and other Fungal Infections
/content/Dataset/train/Urticaria Hives
/content/Dataset/train/Vascular

In [30]:
# Total number of files found under each of the train/val/test folders.
for folder_path in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    total_images = 0
    for _root, _dirs, filenames in os.walk(folder_path):
        total_images += len(filenames)
    print(f"{folder_path}' : {total_images}")

/content/data/train' : 2340
/content/data/val' : 269
/content/data/test' : 691


In [31]:
# File count of every folder visited while walking train/val/test
# (the first entry of each list belongs to the split folder itself).
for folder_path in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    total_images = []
    for _root, _dirs, filenames in os.walk(folder_path):
        total_images.append(len(filenames))
    print(f"{folder_path}' : {total_images}")

/content/data/train' : [0, 72, 90, 756, 58, 55, 223, 13, 31, 55, 18, 269, 81, 109, 72, 90, 21, 2, 289, 36]
/content/data/val' : [0, 9, 11, 84, 7, 7, 25, 2, 4, 7, 2, 30, 9, 13, 8, 10, 3, 1, 33, 4]
/content/data/test' : [0, 27, 37, 231, 14, 24, 51, 1, 2, 13, 6, 30, 25, 42, 31, 24, 5, 2, 120, 6]


# Exporting the Data

In [34]:
# Export the data to zip
def zipdir(path, ziph, arc_root="data"):
    """Add every file found under ``path`` to the open archive ``ziph``.

    Args:
        path: Directory that is walked recursively.
        ziph: Open, writable ``zipfile.ZipFile``; it is left open so the
            caller decides when to close it.
        arc_root: Folder name inside the archive under which the files are
            stored, keeping their layout relative to ``path``.

    Only files are written, so directories without files do not appear in
    the archive.
    """
    for root, dirs, files in os.walk(path):
        # Sorting in place fixes the traversal order, so the archive lists
        # its entries identically on every run.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            # Location of the file relative to the walked directory.
            rel_path = os.path.relpath(file_path, path)
            # Store the file under the archive root instead of its absolute path.
            ziph.write(file_path, os.path.join(arc_root, rel_path))
# Write the archive inside a context manager so it is finalized and closed
# even if adding a file fails part-way through.
with zipfile.ZipFile('skin_diseases_split.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir("/content/data", zipf)