In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive/.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import pandas as pd

In [3]:
# Directory layout on the mounted Drive; every path is derived from its parent.
DRIVE_DIR = '/content/drive/MyDrive'
PROJECT_ROOT_DIR = '/'.join((DRIVE_DIR, 'src'))
DATA_DIR = '/'.join((PROJECT_ROOT_DIR, 'data'))
IMG_DIR = '/'.join((DATA_DIR, 'images'))

In [4]:
# Maps each ground-truth column abbreviation to its full diagnosis name.
FULL_COLUMN_LABELS = dict(
    MEL="Melanoma",
    NV="Melanocytic nevus",
    BCC="Basal cell carcinoma",
    AK="Actinic keratosis",
    BKL="Benign keratosis",
    DF="Dermatofibroma",
    VASC="Vascular lesion",
    SCC="Squamous cell carcinoma",
    UNK="None of the others",
)

In [5]:
# Read the ISIC 2019 training ground-truth table from the data directory.
isic_original_data = pd.read_csv(
    filepath_or_buffer=f'{DATA_DIR}/ISIC_2019_Training_GroundTruth.csv'
)

In [6]:
# Preview the first three rows of the ground-truth table.
isic_original_data.head(3)

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Preview the last three rows of the ground-truth table.
isic_original_data.tail(3)

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
25328,ISIC_0073249,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25329,ISIC_0073251,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25330,ISIC_0073254,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [8]:
# Summary statistics of the numeric class columns; each column mean is the
# fraction of rows flagged with that class.
isic_original_data.describe()

Unnamed: 0,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
count,25331.0,25331.0,25331.0,25331.0,25331.0,25331.0,25331.0,25331.0,25331.0
mean,0.178516,0.50827,0.131183,0.034227,0.103588,0.009435,0.009988,0.024792,0.0
std,0.382954,0.499941,0.337607,0.181815,0.304732,0.096677,0.09944,0.155493,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [9]:
# Column dtypes and non-null counts for the full table.
isic_original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25331 entries, 0 to 25330
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   image   25331 non-null  object 
 1   MEL     25331 non-null  float64
 2   NV      25331 non-null  float64
 3   BCC     25331 non-null  float64
 4   AK      25331 non-null  float64
 5   BKL     25331 non-null  float64
 6   DF      25331 non-null  float64
 7   VASC    25331 non-null  float64
 8   SCC     25331 non-null  float64
 9   UNK     25331 non-null  float64
dtypes: float64(9), object(1)
memory usage: 1.9+ MB


In [10]:
# Same check restricted to the two columns used for the binary task:
# the image id and the MEL (melanoma) indicator.
isic_original_data.loc[:, ['image', 'MEL']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25331 entries, 0 to 25330
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   image   25331 non-null  object 
 1   MEL     25331 non-null  float64
dtypes: float64(1), object(1)
memory usage: 395.9+ KB


In [11]:
# Number of rows flagged 1.0 for each class, reported under the full
# diagnosis names. The first column (image id) is skipped positionally.
(
    isic_original_data
    .iloc[:, 1:]
    .rename(columns=FULL_COLUMN_LABELS)
    .eq(1.0)
    .sum()
)

Melanoma                    4522
Melanocytic nevus          12875
Basal cell carcinoma        3323
Actinic keratosis            867
Benign keratosis            2624
Dermatofibroma               239
Vascular lesion              253
Squamous cell carcinoma      628
None of the others             0
dtype: int64

In [12]:
# Binary melanoma dataset: keep the image id and the MEL indicator,
# with the indicator exposed under the generic name `label`.
df = (
    isic_original_data
    .loc[:, ['image', 'MEL']]
    .copy()
    .rename(columns={'MEL': 'label'})
)

In [13]:
# Confirm the reduced frame has the two expected columns and no nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25331 entries, 0 to 25330
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   image   25331 non-null  object 
 1   label   25331 non-null  float64
dtypes: float64(1), object(1)
memory usage: 395.9+ KB


In [14]:
# Summary statistics of the label column; the mean is the melanoma fraction.
df.describe()

Unnamed: 0,label
count,25331.0
mean,0.178516
std,0.382954
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [15]:
# Display the label column (pandas truncates the output to head and tail).
df['label']

0        0.0
1        0.0
2        1.0
3        0.0
4        1.0
        ... 
25326    0.0
25327    0.0
25328    1.0
25329    0.0
25330    0.0
Name: label, Length: 25331, dtype: float64

In [16]:
# List the distinct label values present in the data.
df['label'].unique().tolist()

[0.0, 1.0]

In [17]:
# Count samples per class from the label column only. Comparing the whole
# frame would also compare every image id string against the float and would
# miscount if another numeric column were ever present.
melanoma_samples = (df['label'] == 1.0).sum()
non_melanoma_samples = (df['label'] == 0.0).sum()

print('Dataframe samples')
print('------------------')
print('Melanoma:', melanoma_samples)
print('Non-melanoma:', non_melanoma_samples)

Dataframe samples
------------------
Melanoma: 4522
Non-melanoma: 20809


In [18]:
import sklearn
# Display the installed scikit-learn version.
sklearn.__version__

'1.2.2'

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# First split: 70% of the rows go to the training set, the remaining 30% to a
# temporary set. Stratifying on the label keeps the class ratio in both parts.
train_data, temp_data = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=df['label'],
)

In [21]:
# Summary statistics of the training labels; the mean is the melanoma fraction.
train_data.describe()

Unnamed: 0,label
count,17731.0
mean,0.178501
std,0.382945
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [22]:
# Replace the shuffled original row labels with a fresh 0..n-1 index.
# Rebinding (rather than inplace=True) avoids mutating the object in place.
train_data = train_data.reset_index(drop=True)
train_data.head(3)

Unnamed: 0,image,label
0,ISIC_0032964,0.0
1,ISIC_0027839,0.0
2,ISIC_0065174,0.0


In [23]:
# Per-class counts for the training split, taken from the label column only
# (a whole-frame comparison would also test every image id string).
train_non_melanoma_samples = (train_data['label'] == 0.0).sum()
train_melanoma_samples = (train_data['label'] == 1.0).sum()

print('Train samples')
print('------------------')
print('Non-melanoma:', train_non_melanoma_samples)
print('Melanoma:', train_melanoma_samples)
print('------------------')
print('Total:', train_melanoma_samples + train_non_melanoma_samples)

Train samples
------------------
Non-melanoma: 14566
Melanoma: 3165
------------------
Total: 17731


In [24]:
# Per-class counts for the temporary split, taken from the label column only
# (a whole-frame comparison would also test every image id string).
temp_non_melanoma_samples = (temp_data['label'] == 0.0).sum()
temp_melanoma_samples = (temp_data['label'] == 1.0).sum()

print('Temp data samples')
print('------------------')
print('Non-melanoma:', temp_non_melanoma_samples)
print('Melanoma:', temp_melanoma_samples)
print('------------------')
print('Total:', temp_melanoma_samples + temp_non_melanoma_samples)

Temp data samples
------------------
Non-melanoma: 6243
Melanoma: 1357
------------------
Total: 7600


In [25]:
# Second split: divide the temporary set in half into validation and test
# sets, again stratified on the label.
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    shuffle=True,
    stratify=temp_data['label'],
)

In [26]:
# Summary statistics of the validation labels; the mean is the melanoma fraction.
val_data.describe()

Unnamed: 0,label
count,3800.0
mean,0.178684
std,0.383138
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [27]:
# Replace the shuffled original row labels with a fresh 0..n-1 index.
# Rebinding (rather than inplace=True) avoids mutating the object in place.
val_data = val_data.reset_index(drop=True)
val_data.head(3)

Unnamed: 0,image,label
0,ISIC_0056796,1.0
1,ISIC_0061312,0.0
2,ISIC_0071075,0.0


In [28]:
# Per-class counts for the validation split, taken from the label column only
# (a whole-frame comparison would also test every image id string).
val_non_melanoma_samples = (val_data['label'] == 0.0).sum()
val_melanoma_samples = (val_data['label'] == 1.0).sum()

print('Validate samples')
print('------------------')
print('Non-melanoma:', val_non_melanoma_samples)
print('Melanoma:', val_melanoma_samples)
print('------------------')
print('Total:', val_melanoma_samples + val_non_melanoma_samples)

Validate samples
------------------
Non-melanoma: 3121
Melanoma: 679
------------------
Total: 3800


In [29]:
# Replace the shuffled original row labels with a fresh 0..n-1 index.
# Rebinding (rather than inplace=True) avoids mutating the object in place.
test_data = test_data.reset_index(drop=True)
test_data.head(3)

Unnamed: 0,image,label
0,ISIC_0071752,0.0
1,ISIC_0027417,0.0
2,ISIC_0056047,1.0


In [30]:
# Per-class counts for the test split, taken from the label column only
# (a whole-frame comparison would also test every image id string).
test_non_melanoma_samples = (test_data['label'] == 0.0).sum()
test_melanoma_samples = (test_data['label'] == 1.0).sum()

print('Test samples')
print('------------------')
print('Non-melanoma:', test_non_melanoma_samples)
print('Melanoma:', test_melanoma_samples)
print('------------------')
print('Total:', test_melanoma_samples + test_non_melanoma_samples)

Test samples
------------------
Non-melanoma: 3122
Melanoma: 678
------------------
Total: 3800


In [31]:
# Spot-check the row at position 12 of the test split: column 0 is the
# image id and column 1 is the label.
test_data.iloc[12, 0], test_data.iloc[12, 1]

('ISIC_0033612', 1.0)

In [32]:
# Same spot-check, going through the row Series first. The row Series is
# labelled by column name, so the positional lookups use .iloc explicitly;
# plain `series[0]` on a string-labelled Series relies on a positional
# fallback that pandas has deprecated.
test_data.iloc[12].iloc[0], test_data.iloc[12].iloc[1]

('ISIC_0033612', 1.0)

In [33]:
import os

In [34]:
# Persist the three splits as CSV files under DATA_DIR/initial_datasets.
# The target directory is created first: to_csv does not create missing
# parent directories and would raise on a fresh Drive.
initial_datasets_dir = f'{DATA_DIR}/initial_datasets'
os.makedirs(initial_datasets_dir, exist_ok=True)

train_data_path = f'{initial_datasets_dir}/train_data.csv'
val_data_path = f'{initial_datasets_dir}/val_data.csv'
test_data_path = f'{initial_datasets_dir}/test_data.csv'

# An existing file is never overwritten, so a previously saved split stays
# untouched when the notebook is re-run.
for split_frame, split_path in (
    (train_data, train_data_path),
    (val_data, val_data_path),
    (test_data, test_data_path),
):
    if not os.path.exists(split_path):
        split_frame.to_csv(split_path, sep=',', index=False)