In [81]:
import matplotlib.pyplot as plt
from PIL import Image
import matplotlib.image as mpimg
import numpy as np
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.model_selection import train_test_split
import os, shutil
#from tensorflow.keras import models, layers
#from tensorflow.keras import optimizers

## Explore Metadata

In [38]:
# Load the HAM10000 metadata table (one row per image).
meta = pd.read_csv(filepath_or_buffer='data/HAM10000_metadata.csv')

In [72]:
# Number of images per diagnosis class.
meta.dx.value_counts()

nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: dx, dtype: int64

#### The diagnosis classes are heavily imbalanced: `nv` alone accounts for about two thirds of all images. We have to deal with that later on.

In [7]:
# Show which columns the metadata table provides.
meta.columns

Index(['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization'], dtype='object')

### Identify lesion IDs with multiple images
Several lesions were photographed more than once. The validation set is drawn only from lesions with a single image, so that no lesion appears in both the training and the validation set.

In [39]:
# Per lesion_id, count the non-null entries of every other column.
uni = meta.groupby(by='lesion_id').count()

In [40]:
# Keep only the lesions that are represented by exactly one image.
uni = uni.loc[uni['image_id'].eq(1)]

In [50]:
# Lesion ids that have a single image, as a plain Python list.
unique_list = uni.index.tolist()

In [53]:
# define function to identify if a lesion id has duplicate images
def is_duplicate(x, unique_ids=None):
    """Label a lesion id by whether it is associated with more than one image.

    Parameters
    ----------
    x : hashable
        The lesion id to classify.
    unique_ids : container, optional
        Collection of lesion ids that have exactly one image. When omitted,
        the index of the notebook-level frame ``uni`` is used (looked up at
        call time, so ``uni`` must exist before the first call).

    Returns
    -------
    str
        ``'no_duplicates'`` if ``x`` is contained in ``unique_ids``,
        otherwise ``'has_duplicates'``.
    """
    if unique_ids is None:
        # Test membership on the Index itself instead of rebuilding a list
        # and scanning it linearly on every call (this runs once per row
        # when used with ``Series.apply``).
        unique_ids = uni.index

    return 'no_duplicates' if x in unique_ids else 'has_duplicates'

In [42]:
# Seed the new 'duplicate' column with the lesion ids.
meta['duplicate'] = meta.lesion_id

In [54]:
# Derive the label from 'lesion_id' rather than from the 'duplicate' column
# itself: reading and overwriting the same column would, on a second run of
# this cell, classify the label strings and mark every row 'has_duplicates'.
meta['duplicate'] = meta['lesion_id'].apply(is_duplicate)

In [58]:
# Rows whose lesion has exactly one image.
no_dupl = meta.loc[meta['duplicate'].eq("no_duplicates")]

In [61]:
# How many images belong to single-image vs. multi-image lesions.
meta.duplicate.value_counts()

no_duplicates     5514
has_duplicates    4501
Name: duplicate, dtype: int64

### Create a validation set with only non-duplicate IDs

In [63]:
# Diagnosis labels, used as the stratification target for the split.
y = no_dupl['dx']

In [64]:
# Split the single-image lesions and keep only the held-out 20% as the
# validation frame; the other part is discarded. Stratifying on the diagnosis
# keeps the class proportions of `no_dupl` in the validation set.
_, val_df = train_test_split(
    no_dupl,
    test_size=0.20,
    stratify=y,
    random_state=43,
)

In [70]:
# Class distribution inside the validation set.
val_df.dx.value_counts()

nv       883
bkl       88
mel       46
bcc       35
akiec     30
vasc      13
df         8
Name: dx, dtype: int64

In [76]:
# (rows, columns) of the validation frame.
val_df.shape

(1103, 8)

### Remove validation data from the training data

In [73]:
# Anti-join on the image key: keep every metadata row whose image is not in
# the validation set. This replaces concat + drop_duplicates(keep=False),
# which compares whole rows and would also silently drop any metadata rows
# that are exact duplicates of each other.
# NOTE(review): assumes 'image_id' identifies an image uniquely - confirm.
train_df = meta[~meta['image_id'].isin(val_df['image_id'])]

In [75]:
# (rows, columns) of the training frame after removing the validation rows.
train_df.shape

(8912, 8)

### Create folder structure for training and validation data and load images

In [78]:
# Image ids per split, as plain lists (`test_id` holds the validation ids).
train_id = train_df['image_id'].tolist()
test_id = val_df['image_id'].tolist()

In [84]:
# Image folder, relative to the notebook's working directory like the
# metadata path used for `pd.read_csv`, instead of an absolute path that only
# exists on one machine. The empty last component keeps a trailing path
# separator, as the previous string had, for code that concatenates onto it.
source_dir = os.path.join("data", "HAM10000_images", "")

# Target folders for the two splits, collected in `dirs`.
train_dir = os.path.join(source_dir, "train")
validation_dir = os.path.join(source_dir, "validation")
dirs = [train_dir, validation_dir]