In [0]:
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import cv2 as cv
import os



In [0]:
google_drive_dir = 'drive/My Drive/'
dataset_dir = 'skin-cancer-mnist-ham10000/'
metadata_csv_name = 'HAM10000_metadata.csv'
metadata_cleared_csv_name = 'HAM10000_metadata_preprocessed.csv'
isColab = False

### Launch this if you are working at Google Colab

In [0]:
isColab = True

### Get full path to dataset

In [174]:
if isColab:
  dataset_path = google_drive_dir + dataset_dir
else:
  dataset_path = dataset_dir

dataset_path

'drive/My Drive/skin-cancer-mnist-ham10000/'

### Read dataset

In [175]:
skin_df = pd.read_csv(dataset_path + metadata_csv_name)

skin_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [176]:
skin_df.dtypes

lesion_id        object
image_id         object
dx               object
dx_type          object
age             float64
sex              object
localization     object
dtype: object

## Data Cleaning

In [177]:
skin_df.isnull().sum()

lesion_id        0
image_id         0
dx               0
dx_type          0
age             57
sex              0
localization     0
dtype: int64

In [0]:
skin_df['age'].replace(to_replace=0, value=int(skin_df['age'].mean()))
skin_df['age'].fillna(int((skin_df['age'].mean())), inplace=True)

In [179]:
skin_df['age'].value_counts()

45.0    1299
50.0    1187
55.0    1009
40.0     985
60.0     803
70.0     756
35.0     753
65.0     731
75.0     618
30.0     464
80.0     404
85.0     290
25.0     247
20.0     169
5.0       86
15.0      77
51.0      57
10.0      41
0.0       39
Name: age, dtype: int64

In [180]:
skin_df.isnull().sum()

lesion_id       0
image_id        0
dx              0
dx_type         0
age             0
sex             0
localization    0
dtype: int64

## Encode categorical data

### Prepare One-Hot Encoded Data

In [191]:
# Diseases (dx) One-Hot DataFrame
dx_onehot = pd.get_dummies(skin_df['dx'], prefix='dx')

# Diseases Types (dx_codes) One-Hot DataFrame
dx_type_onehot = pd.get_dummies(skin_df['dx_type'], prefix='dx_type')

# Sex One-Hot DataFrame
sex_onehot = pd.get_dummies(skin_df['sex'], prefix='sex')

# Disease Localization One-Hot DataFrame
localization_onehot = pd.get_dummies(skin_df['localization'], prefix='localization')

# Create List of New DataFrames to Concatenate Them
data_frames_list = [
    skin_df,
    dx_onehot,
    dx_type_onehot,
    sex_onehot,
    localization_onehot
]

# Final DataFrame
df_preprocessed = pd.concat(data_frames_list, axis=1)
df_preprocessed.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dx_akiec,dx_bcc,dx_bkl,dx_df,dx_mel,dx_nv,dx_vasc,dx_type_confocal,dx_type_consensus,dx_type_follow_up,dx_type_histo,sex_female,sex_male,sex_unknown,localization_abdomen,localization_acral,localization_back,localization_chest,localization_ear,localization_face,localization_foot,localization_genital,localization_hand,localization_lower extremity,localization_neck,localization_scalp,localization_trunk,localization_unknown,localization_upper extremity
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [0]:
df_preprocessed.to_csv(dataset_path + metadata_cleared_csv_name)

### Just to check if everything is OK

In [195]:
cleared_df = pd.read_csv(dataset_path + metadata_cleared_csv_name)
cleared_df.head()

Unnamed: 0.1,Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dx_akiec,dx_bcc,dx_bkl,dx_df,dx_mel,dx_nv,dx_vasc,dx_type_confocal,dx_type_consensus,dx_type_follow_up,dx_type_histo,sex_female,sex_male,sex_unknown,localization_abdomen,localization_acral,localization_back,localization_chest,localization_ear,localization_face,localization_foot,localization_genital,localization_hand,localization_lower extremity,localization_neck,localization_scalp,localization_trunk,localization_unknown,localization_upper extremity
0,0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [170]:
cleared_df.dtypes

Unnamed: 0              int64
lesion_id              object
image_id               object
dx                     object
dx_type                object
age                   float64
sex                    object
localization           object
dx_codes                int64
dx_type_codes           int64
sex_codes               int64
localization_codes      int64
dtype: object