In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from glob import glob
import os

#pytorch
#import torch
#import torchvision
#import torchvision.transforms as transforms
#import torch.nn as nn
#import torch.nn.functional as F
#import torch.optim as optim


## Read in Metadata

In [12]:
# Load the HAM10000 metadata CSV into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index -- consider index_col=0 after confirming
# nothing downstream relies on that column.
skin_df = pd.read_csv(filepath_or_buffer='data/HAM10000_metadata.csv')

In [13]:
# Preview the first five rows to sanity-check the loaded columns.
skin_df.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


Let's take a look at the 7 types of skin lesions

In [14]:
# Number of rows per diagnosis label, most frequent first.
skin_df.loc[:, 'dx'].value_counts()

nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: dx, dtype: int64

For later processing, we need to convert the string labels in `dx` into integer codes.

In [15]:
# Encode the string diagnosis labels in `dx` as integer category codes.
# pd.Categorical infers its categories in sorted order, so code i refers to
# the i-th label alphabetically.
# Fix: read the labels from `skin_df`. The previous `metadata` name is not
# defined in any visible cell, so this cell raised NameError on a fresh kernel.
skin_df['cell_type_idx'] = pd.Categorical(skin_df['dx']).codes

In [16]:
# Row counts per integer code; these should mirror the per-`dx` counts.
skin_df.loc[:, 'cell_type_idx'].value_counts()

5    6705
4    1113
2    1099
1     514
0     327
6     142
3     115
Name: cell_type_idx, dtype: int64

## Read in Image

We have the data in the form of images, and we want to convert them into a matrix (array) format that works well with PyTorch.

The first step is to match up the image filenames with their corresponding image id

In [11]:
# Root folder holding the image sub-directories.
data_dir = '../SkinLesionClassifier_data'

# Every .jpg that sits exactly one directory level below data_dir.
all_image_path = glob(os.path.join(data_dir, '*', '*.jpg'))

# Lookup table: file name without directory or extension -> full file path.
imageid_path_dict = dict(
    (os.path.splitext(os.path.basename(image_path))[0], image_path)
    for image_path in all_image_path
)

In [18]:
# Attach the file path for each image_id. dict.get returns None for ids that
# have no matching file, so such rows end up with a missing path rather than
# raising KeyError.
skin_df['path'] = skin_df['image_id'].map(
    lambda image_id: imageid_path_dict.get(image_id)
)

Now we check whether each `lesion_id` is unique, and keep only the lesions that have exactly one image (i.e. drop every lesion with duplicated images)

In [22]:
# Count the non-null entries of every column per lesion_id, keep only the
# lesions whose image_id count is exactly 1 (i.e. lesions with a single
# image), and turn lesion_id back into a regular column.
df_undup = (
    skin_df
    .groupby('lesion_id')
    .count()
    .loc[lambda counts: counts['image_id'] == 1]
    .reset_index()
)
df_undup.shape

(5514, 9)

In [51]:
# IDs of the lesions that have exactly one image (taken from df_undup).
unique_id = pd.Series(df_undup['lesion_id'].unique())

# Keep only the rows belonging to those lesions. The explicit .copy() makes
# skin_df an independent frame instead of a filtered selection of the old one,
# so adding columns to it later does not trigger SettingWithCopyWarning.
skin_df = skin_df[skin_df['lesion_id'].isin(unique_id)].copy()
skin_df.shape

(5514, 9)

In [None]:
# Open each image from its path, resize it with size tuple (100, 75), and
# store the result as a NumPy array in the `image` column.
# NOTE(review): `Image` is not imported in any visible cell -- presumably
# `from PIL import Image` (in which case the size tuple is (width, height));
# confirm and add it to the import cell, otherwise this raises NameError.
# NOTE(review): rows whose `path` is missing will fail in Image.open -- verify
# every image_id was matched to a file before running this.
skin_df['image'] = skin_df['path'].map(
    lambda image_path: np.asarray(Image.open(image_path).resize((100, 75)))
)