# Data loader 
This notebook loading images from ISIC Archive

In [1]:
import os
import csv
import json
import numpy as np
import pandas as pd

from concurrent.futures import ThreadPoolExecutor

In [2]:
# check if this notebook called from main one
try: IS_MAIN
except: IS_MAIN = False

In [3]:
# setup necessary parameters
if IS_MAIN:
    print('DataLoader mode: MAIN')
    limit = 600
    offset = 0
    
else:
    print('DataLoader mode: STANDALONE')
    limit = 500
    offset = 0

threads_timeout = 60

# Spesify output folders
meta_dir = 'data/ISIC'
img_dir = meta_dir + '/images'
mask_dir = meta_dir + '/masks'
img_info_dir = meta_dir + '/info_images'
mask_info_dir = meta_dir + '/info_masks'
images_info_fn = 'images_info'
masks_info_fn = 'masks_info'

if not os.path.exists(meta_dir): os.makedirs(meta_dir)
if not os.path.exists(img_dir): os.makedirs(img_dir)
if not os.path.exists(mask_dir): os.makedirs(mask_dir)
if not os.path.exists(img_info_dir): os.makedirs(img_info_dir)
if not os.path.exists(mask_info_dir): os.makedirs(mask_info_dir)

DataLoader mode: STANDALONE


# Make ISIC API requester 

In [4]:
'''
Make ISIC Api request class
'''
import requests

class ISICApi(object):
    def __init__(self, hostname='https://isic-archive.com',
                 username=None, password=None):
        self.baseUrl = f'{hostname}/api/v1'
        self.authToken = None

        if username is not None:
            if password is None:
                password = input(f'Password for user "{username}":')
            self.authToken = self._login(username, password)

    def _makeUrl(self, endpoint):
        return f'{self.baseUrl}/{endpoint}'

    def _login(self, username, password):
        authResponse = requests.get(
            self._makeUrl('user/authentication'),
            auth=(username, password)
        )
        if not authResponse.ok:
            raise Exception(f'Login error: {authResponse.json()["message"]}')

        authToken = authResponse.json()['authToken']['token']
        return authToken

    def get(self, endpoint):
        url = self._makeUrl(endpoint)
        headers = {'Girder-Token': self.authToken} if self.authToken else None
        return requests.get(url, headers=headers)

    def getJson(self, endpoint):
        return self.get(endpoint).json()

    def getJsonList(self, endpoint):
        endpoint += '&' if '?' in endpoint else '?'
        LIMIT = 50
        offset = 0
        while True:
            resp = self.get(
                f'{endpoint}limit={LIMIT:d}&offset={offset:d}'
            ).json()
            if not resp:
                break
            for elem in resp:
                yield elem
            offset += LIMIT

# Get list of images names

In [5]:
#Insert Username and Password Below
api = ISICApi(username="veaxvoid", password="ZXCASD123qweasd")

image_list = api.getJson('image?limit={}&offset={}&sort=name'.format(limit, offset))

print('Requested {} images names.'.format(len(image_list)))

Requested 500 images names.


In [6]:
'''
Load info for each img
'''
def load_img_info(img_id, img_name):
    image_info = api.getJson('image/{}'.format(img_id))
    
#     if image_info['dataset']['name'] != 'SONIC':
    file_path = os.path.join(img_info_dir, img_name+'.json')
    file = open(file_path, "w")
    json.dump(image_info, file)
    file.close()
    
imgs_id = [info['_id'] for info in image_list]
imgs_name = [info['name'] for info in image_list]

with ThreadPoolExecutor() as e: 
    e.map(load_img_info, imgs_id, imgs_name, timeout=threads_timeout)

print('Loaded information about {} images.'.format(len(os.listdir(img_info_dir))))

Loaded information about 497 images.


In [7]:
'''
Collect all image info from json files in to csv table
'''
# read filenames in infoimages dir
infoimgs_filenames = sorted([f for f in os.listdir(img_info_dir)])

image_details = []

for img_name in infoimgs_filenames:
    file_path = os.path.join(img_info_dir, img_name)
    file = open(file_path, 'r')
    info = json.load(file)
    file.close()
    
    image_details += [info]

print('Number of valid images: {}.'.format(len(image_details)))

Number of valid images: 497.


In [8]:
'''
Image info to csv
'''
file_path = os.path.join(meta_dir, images_info_fn+'.csv')

# Determine the union of all image metadata fields
metadata_fields = set(
    field
    for image_detail in image_details
    for field in image_detail['meta']['clinical'].keys()
)

metadata_fields = ['isic_name','isic_id'] + sorted(metadata_fields) + ['dataset_name']

# Write the metadata to a CSV
with open(file_path, 'w') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadata_fields)
    csvWriter.writeheader()
    
    for image_detail in image_details:
        row_dict = image_detail['meta']['clinical'].copy()
        row_dict['isic_name'] = image_detail['name']
        row_dict['isic_id'] = image_detail['_id']
        row_dict['dataset_name'] = image_detail['dataset']['name']
        csvWriter.writerow(row_dict)

print('Writing metadata to CSV: {}'.format(images_info_fn+'.csv'))

Writing metadata to CSV: images_info.csv


In [9]:
file_path = os.path.join(meta_dir, images_info_fn+'.csv')
data = pd.read_csv(file_path)

images_ids = list(data['isic_id'])
images_names = list(data['isic_name'])

if not IS_MAIN:
    print(data.tail().to_string())
    info_sum = {d:(data['diagnosis'] == d).sum() for d in data['diagnosis'].unique()}
    print('Data summary:')
    print(info_sum)

        isic_name                   isic_id  age_approx anatom_site_general benign_malignant diagnosis diagnosis_confirm_type melanocytic  sex dataset_name
492  ISIC_0000495  5436e3f4bae478396759f4b2         NaN     lower extremity           benign     nevus                    NaN        True  NaN        UDA-1
493  ISIC_0000496  5436e3f4bae478396759f4b4         NaN     posterior torso           benign     nevus                    NaN        True  NaN        UDA-1
494  ISIC_0000497  5436e3f5bae478396759f4b6         NaN     posterior torso           benign     nevus                    NaN        True  NaN        UDA-1
495  ISIC_0000498  5436e3f5bae478396759f4b8         NaN      anterior torso           benign     nevus                    NaN        True  NaN        UDA-1
496  ISIC_0000499  5436e3f5bae478396759f4ba         NaN       lateral torso           benign     nevus                    NaN        True  NaN        UDA-1
Data summary:
{'nevus': 358, 'melanoma': 137, nan: 0}


# Load images

In [10]:
def load_imgs(img_id, img_name):
    img_file = api.get('image/{}/download'.format(img_id))
    img_file.raise_for_status()
    file_path = os.path.join(img_dir, '{}.png'.format(img_name))
  
    with open(file_path, 'wb') as out_stream:
        for chunk in img_file:
            out_stream.write(chunk)
    
with ThreadPoolExecutor() as e: 
    e.map(load_imgs, images_ids, images_names, timeout=threads_timeout)

print('Load {} images.'.format(len(os.listdir(img_dir))))

Load 495 images.


In [11]:
'''
if some files don't loaded. Some threads are dying and dont load files
'''
def reload(curr_dir, all_names, prefix, loader, images_names=images_names, images_ids=images_ids):
    
    curr_files = np.array(sorted(os.listdir(curr_dir)))
    all_files = np.array([name+prefix for name in all_names])

    result_names = []
    result_ids = []

    for i in range(len(all_files)): 
        if not np.isin(all_files[i], curr_files, assume_unique=True):
            result_names += [images_names[i]]
            result_ids += [images_ids[i]]

    if len(result_names) != 0:
        with ThreadPoolExecutor() as e: e.map(loader, result_ids, result_names, timeout=threads_timeout)

    print('After reload in folder {}: {} files.'.format(curr_dir,len(os.listdir(curr_dir))))
    
reload(img_dir, images_names, '.png', load_imgs)

After reload in folder data/ISIC/images: 497 files.


# Load segmentation masks 

In [12]:
'''
Load segmentation data
'''
def load_mask_info(img_id, img_name):
    segmentation_data = api.getJson('segmentation?imageId={}'.format(img_id))
    
    file_path = os.path.join(mask_info_dir, img_name+'.json')
    file = open(file_path, "w")
    json.dump(segmentation_data, file)
    file.close()
    
with ThreadPoolExecutor() as e: 
    e.map(load_mask_info, images_ids, images_names, timeout=threads_timeout)

print('Load {} info about masks.'.format(len(os.listdir(mask_info_dir))))

Load 494 info about masks.


In [13]:
reload(mask_info_dir, images_names, '.json', load_mask_info)

After reload in folder data/ISIC/info_masks: 497 files.


In [14]:
'''
Collect all VALID masks info from json files 
'''
segmentation_details = []

for img_name in images_names:
    file_path = os.path.join(mask_info_dir, img_name+'.json')
    file = open(file_path, 'r')
    info = json.load(file)
    file.close()
    
    if info:
        if not info[0]['failed']:
            segmentation_details += [info[0]]
            segmentation_details[-1]['name'] = img_name

print('Number of segmentation info files: {}'.format(len(segmentation_details)))

Number of segmentation info files: 497


In [15]:
'''
Masks info to csv
'''
file_path = os.path.join(meta_dir, masks_info_fn+'.csv')

# Determine the union of all image metadata fields
metadata_fields = set(
    segmentation_details[0].keys()
)

metadata_fields = sorted(metadata_fields)

# Write the metadata to a CSV
with open(file_path, 'w') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadata_fields)
    csvWriter.writeheader()
    
    k=-1
    for info in segmentation_details:
        k+=1
        row_dict = info.copy()
        csvWriter.writerow(row_dict)

print('Writing metadata to CSV: {}'.format(masks_info_fn+'.csv'))

Writing metadata to CSV: masks_info.csv


In [16]:
data = pd.read_csv(file_path)

masks_names = list(data['name'])
masks_id = list(data['_id'])

if not IS_MAIN:
    print(data.tail().to_string())

                          _id                           created  failed          name   skill
492  54cc02c6bae47819d8e4c9a8  2015-01-30T22:16:38.471000+00:00   False  ISIC_0000495  expert
493  544eae26bae478661558fb7e  2014-10-27T20:42:14.795000+00:00   False  ISIC_0000496  expert
494  54cc029fbae47819d8e4c99c  2015-01-30T22:15:59.142000+00:00   False  ISIC_0000497  expert
495  54cc0254bae47819d8e4c996  2015-01-30T22:14:44.642000+00:00   False  ISIC_0000498  expert
496  57c04f259fc3c158f2bd0e3b  2016-08-26T14:16:05.388000+00:00   False  ISIC_0000499  expert


In [17]:
def load_mask(mask_id, img_name):
    img_file = api.get('segmentation/{}/mask'.format(mask_id))
    img_file.raise_for_status()
    file_path = os.path.join(mask_dir, '{}_mask.png'.format(img_name))

    with open(file_path, 'wb') as out_stream:
        for chunk in img_file:
            out_stream.write(chunk)    
            
with ThreadPoolExecutor() as e: 
    e.map(load_mask, masks_id, masks_names, timeout=threads_timeout)

print('Load {} info about masks.'.format(len(os.listdir(mask_dir))))

Load 491 info about masks.


In [18]:
reload(mask_dir, masks_names, '_mask.png', load_mask, masks_names, masks_id)

After reload in folder data/ISIC/masks: 497 files.
