In [1]:
import os
from glob import glob

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from src.dataset import utils as dutils

%load_ext autoreload
%autoreload 2

## Unpacking the Dataset

In [2]:
DATA_DIR = 'data'

lesion_type = {
    'akiec': 'Actinic keratoses',
    'bcc': 'Basal cell carcinoma',
    'bkl': 'Benign keratosis-like lesions ',
    'df': 'Dermatofibroma',
    'mel': 'dermatofibroma',
    'nv': 'Melanocytic nevi',
    'vasc': 'Vascular lesions',
}

image_paths = glob(os.path.join(DATA_DIR, '*', '*.jpg'))
image_paths_dict_map = {os.path.splitext(os.path.basename(p))[0]: p for p in image_paths}

df = pd.read_csv(os.path.join(DATA_DIR, 'HAM10000_metadata'))

df['path'] = df['image_id'].map(image_paths_dict_map.get)
df['lesion_type'] = df['dx'].map(lesion_type.get)
df['lesion_type_id'] = pd.Categorical(df['dx']).codes

df.tail()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,path,lesion_type,lesion_type_id
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern,data/HAM10000_images_part_2/ISIC_0033084.jpg,Actinic keratoses,0
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern,data/HAM10000_images_part_2/ISIC_0033550.jpg,Actinic keratoses,0
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern,data/HAM10000_images_part_2/ISIC_0033536.jpg,Actinic keratoses,0
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern,data/HAM10000_images_part_2/ISIC_0032854.jpg,Actinic keratoses,0
10014,HAM_0003521,ISIC_0032258,mel,histo,70.0,female,back,vidir_modern,data/HAM10000_images_part_2/ISIC_0032258.jpg,dermatofibroma,4


## Handling Duplicated Lesion Images

This dataset contains duplicated images (the same lesion may have more than one image associated to it).
Since duplicated samples have no use in the model evaluation, we are getting rid of them for the
validation and test sets.

In [3]:
images_per_lesion_id = df.groupby(['lesion_id'], as_index=False)['image_id'].count()
duplicated_images = images_per_lesion_id.query('image_id > 1')['lesion_id'].values
df['duplicated'] = df['lesion_id'].isin(duplicated_images)

unduplicated = df.query('duplicated == False')
_, df_evaluation = train_test_split(unduplicated, test_size=0.2, stratify=unduplicated['lesion_type_id'])

df_train = df.query("image_id not in @df_evaluation['image_id']").copy()

# this evaluatino "placeholder" is now split into the validation and test sets
df_valid, df_test = train_test_split(df_evaluation, test_size=0.5, stratify=df_evaluation['lesion_type_id'])

## Rebalancing classes for the training set

Fixing unbalanced classes in training by augmenting (in this case copying)
each image in the respective class by the proportion indicated in `correction_proportions`.
The ideia here is to replicate less frequent class samples until the number of
samples per class is nearly the same.

In [4]:
class_counts = df_train['lesion_type_id'].value_counts()
class_rebalance_proportions = (
    (1 / (class_counts / class_counts.max()) - 1)
    .astype(int)
    .to_dict() # initially this is returned as a pandas zseries
)

print("training set before augmentation: ", df_train.shape)

for lesion_type_id, proportion in class_rebalance_proportions.items():
    if proportion > 0:
        repeated_rows = pd.concat([df_train.query('lesion_type_id == @lesion_type_id')] * proportion)
        df_train = pd.concat([df_train, repeated_rows], ignore_index=True)

print("training set after augmentation: ", df_train.shape)

training set before augmentation:  (8912, 12)
training set after augmentation:  (39186, 12)


In [5]:
df_train['lesion_type_id'].value_counts()

lesion_type_id
5    5822
6    5805
3    5778
1    5748
0    5643
4    5335
2    5055
Name: count, dtype: int64

In [6]:
df_valid['lesion_type_id'].value_counts()

lesion_type_id
5    441
2     44
4     23
1     17
0     15
6      7
3      4
Name: count, dtype: int64

In [7]:
df_test['lesion_type_id'].value_counts()

lesion_type_id
5    442
2     44
4     23
1     18
0     15
6      6
3      4
Name: count, dtype: int64

In [8]:
if not os.path.exists('artifacts'):
    os.mkdir('./artifacts')

df_train.to_csv(os.path.join('artifacts', 'df_train.csv'), index=False)
df_valid.to_csv(os.path.join('artifacts', 'df_valid.csv'), index=False)
df_test.to_csv(os.path.join('artifacts', 'df_test.csv'), index=False)