# Data collector
Data loader from ISIC

In [4]:
''' 
Load libs 
'''
import csv
import os
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm 

from concurrent.futures import ThreadPoolExecutor
n_threads = 32

# Make ISIC API requester 

In [5]:
'''
Make ISIC Api request class
'''
import requests

class ISICApi(object):
    def __init__(self, hostname='https://isic-archive.com',
                 username=None, password=None):
        self.baseUrl = f'{hostname}/api/v1'
        self.authToken = None

        if username is not None:
            if password is None:
                password = input(f'Password for user "{username}":')
            self.authToken = self._login(username, password)

    def _makeUrl(self, endpoint):
        return f'{self.baseUrl}/{endpoint}'

    def _login(self, username, password):
        authResponse = requests.get(
            self._makeUrl('user/authentication'),
            auth=(username, password)
        )
        if not authResponse.ok:
            raise Exception(f'Login error: {authResponse.json()["message"]}')

        authToken = authResponse.json()['authToken']['token']
        return authToken

    def get(self, endpoint):
        url = self._makeUrl(endpoint)
        headers = {'Girder-Token': self.authToken} if self.authToken else None
        return requests.get(url, headers=headers)

    def getJson(self, endpoint):
        return self.get(endpoint).json()

    def getJsonList(self, endpoint):
        endpoint += '&' if '?' in endpoint else '?'
        LIMIT = 50
        offset = 0
        while True:
            resp = self.get(
                f'{endpoint}limit={LIMIT:d}&offset={offset:d}'
            ).json()
            if not resp:
                break
            for elem in resp:
                yield elem
            offset += LIMIT


# Get list of images names

In [17]:
#Insert Username and Password Below
api = ISICApi(username="veaxvoid", password="ZXCASD123qweasd")

# SONIC 558 - 9867
limit = 500
offset = 9868

image_list = api.getJson('image?limit={}&offset={}&sort=name'.format(limit, offset))
print('Requested {} images names'.format(len(image_list)))

Requested 500 images names


In [18]:
'''
Spesify output folders
'''
meta_dir = 'ISIC'
img_dir = meta_dir + '/images'
mask_dir = meta_dir + '/masks'
img_info_dir = meta_dir + '/info_images'
mask_info_dir = meta_dir + '/info_masks'
imgs_filename = 'imagedata'
mask_filename = 'masksdata'

if not os.path.exists(meta_dir): os.makedirs(meta_dir)
if not os.path.exists(img_dir): os.makedirs(img_dir)
if not os.path.exists(mask_dir): os.makedirs(mask_dir)
if not os.path.exists(img_info_dir): os.makedirs(img_info_dir)
if not os.path.exists(mask_info_dir): os.makedirs(mask_info_dir)

In [19]:
'''
Load info for each img # 2
'''
# with ThreadPoolExecutor(n_threads) as executor:


'\nLoad info for each img # 2\n'

In [20]:
'''
Load info for each img
'''
def load_img_info(img_id, img_name):
    image_info = api.getJson('image/{}'.format(img_id))
    
    if image_info['dataset']['name'] != 'SONIC':
        file_path = os.path.join(img_info_dir, img_name+'.json')
        file = open(file_path, "w")
        json.dump(image_info, file)
        file.close()
    
imgs_id = [info['_id'] for info in image_list]
imgs_name = [info['name'] for info in image_list]

with ThreadPoolExecutor(n_threads) as e: 
    e.map(load_img_info, imgs_id, imgs_name, timeout=60)
print('done.')

done.


In [21]:
'''
Collect all image info from json files in to csv table
'''
# read filenames in infoimages dir
infoimgs_filenames = sorted([f for f in os.listdir(img_info_dir)])

image_details = []

for img_name in infoimgs_filenames:
    file_path = os.path.join(img_info_dir, img_name)
    file = open(file_path, 'r')
    info = json.load(file)
    file.close()
    
    image_details += [info]

print('Number of valid images: {}'.format(len(image_details)))

Number of valid images: 500


In [22]:
'''
Image info to csv
'''
file_path = os.path.join(meta_dir, imgs_filename+'.csv')

# Determine the union of all image metadata fields
metadata_fields = set(
    field
    for image_detail in image_details
    for field in image_detail['meta']['clinical'].keys()
)
metadata_fields = ['isic_name','isic_id'] + sorted(metadata_fields) + ['dataset_name']

# Write the metadata to a CSV
print('Writing metadata to CSV: {}'.format(imgs_filename+'.csv'))

with open(file_path, 'w') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadata_fields)
    csvWriter.writeheader()
    
    for image_detail in image_details:
        row_dict = image_detail['meta']['clinical'].copy()
        row_dict['isic_name'] = image_detail['name']
        row_dict['isic_id'] = image_detail['_id']
        row_dict['dataset_name'] = image_detail['dataset']['name']
        csvWriter.writerow(row_dict)

Writing metadata to CSV: imagedata.csv


In [23]:
file_path = os.path.join(meta_dir, imgs_filename+'.csv')
data = pd.read_csv(file_path)

images_ids = list(data['isic_id'])
images_names = list(data['isic_name'])

data.tail()

Unnamed: 0,isic_name,isic_id,age_approx,anatom_site_general,benign_malignant,clin_size_long_diam_mm,diagnosis,diagnosis_confirm_type,family_hx_mm,melanocytic,personal_hx_mm,sex,dataset_name
495,ISIC_0010363,558d62ecbae47801cf734a59,55,posterior torso,benign,15.0,nevus,single image expert consensus,False,True,False,male,MSK-2
496,ISIC_0010364,558d62edbae47801cf734a5c,40,posterior torso,malignant,7.0,melanoma,histopathology,False,True,False,male,MSK-2
497,ISIC_0010365,558d62eebae47801cf734a5f,65,posterior torso,benign,5.0,nevus,histopathology,False,True,True,female,MSK-2
498,ISIC_0010366,558d62eebae47801cf734a62,85,lower extremity,malignant,28.0,melanoma,histopathology,False,True,True,female,MSK-2
499,ISIC_0010367,558d62efbae47801cf734a65,60,lower extremity,benign,3.0,nevus,histopathology,False,True,True,female,MSK-2


In [24]:
info_sum = {d:(data['diagnosis'] == d).sum() for d in data['diagnosis'].unique()}
print('Data summary:')
info_sum

Data summary:


{'melanoma': 98,
 'nevus': 332,
 'solar lentigo': 14,
 nan: 0,
 'lentigo NOS': 26,
 'atypical melanocytic proliferation': 4,
 'lentigo simplex': 5}

# Load images

In [25]:
def load_imgs(img_id, img_name):
    img_file = api.get('image/{}/download'.format(img_id))
    img_file.raise_for_status()
    file_path = os.path.join(img_dir, '{}.png'.format(img_name))
  
    with open(file_path, 'wb') as out_stream:
        for chunk in img_file:
            out_stream.write(chunk)
    
with ThreadPoolExecutor(n_threads) as e: 
    e.map(load_imgs, images_ids, images_names, timeout=60)
print('done.')

done.


# Load segmentation masks 

In [26]:
'''
Load segmentation data
'''
def load_mask_info(img_id, img_name):
    segmentation_data = api.getJson('segmentation?imageId={}'.format(img_id))
    
    file_path = os.path.join(mask_info_dir, img_name+'.json')
    file = open(file_path, "w")
    json.dump(segmentation_data, file)
    file.close()
    
with ThreadPoolExecutor(n_threads) as e: 
    e.map(load_mask_info, images_ids, images_names, timeout=60)
print('done.')

done.


In [27]:
'''
Collect all VALID masks info from json files 
'''
segmentation_details = []

for img_name in images_names:
    file_path = os.path.join(mask_info_dir, img_name+'.json')
    file = open(file_path, 'r')
    info = json.load(file)
    file.close()
    
    if info:
        if not info[0]['failed']:
            segmentation_details += [info[0]]
            segmentation_details[-1]['name'] = img_name

print('Number of segmentation info files: {}'.format(len(segmentation_details)))

Number of segmentation info files: 500


In [28]:
'''
Masks info to csv
'''
file_path = os.path.join(meta_dir, mask_filename+'.csv')

# Determine the union of all image metadata fields
metadata_fields = set(
    segmentation_details[0].keys()
)

metadata_fields = sorted(metadata_fields)

# Write the metadata to a CSV
print('Writing metadata to CSV: {}'.format(mask_filename+'.csv'))

with open(file_path, 'w') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadata_fields)
    csvWriter.writeheader()
    
    k=-1
    for info in segmentation_details:
        k+=1
        row_dict = info.copy()
        csvWriter.writerow(row_dict)


Writing metadata to CSV: masksdata.csv


In [30]:
masks_names = list(data['name'])
masks_id = list(data['_id'])

data = pd.read_csv(file_path)
data.tail()

Unnamed: 0,_id,created,failed,name,skill
495,55a911129fc3c156bd715909,2015-07-17T14:28:34.697000+00:00,False,ISIC_0010363,expert
496,55a90e5a9fc3c156bd715903,2015-07-17T14:16:58.709000+00:00,False,ISIC_0010364,expert
497,55a90e4a9fc3c156bd7158fd,2015-07-17T14:16:42.064000+00:00,False,ISIC_0010365,expert
498,55dba8329fc3c10470dbb0d7,2015-08-24T23:26:42.455000+00:00,False,ISIC_0010366,expert
499,55a90e1c9fc3c156bd7158f1,2015-07-17T14:15:56.359000+00:00,False,ISIC_0010367,expert


In [31]:
def load_mask(mask_id, img_name):
    img_file = api.get('segmentation/{}/mask'.format(mask_id))
    img_file.raise_for_status()
    file_path = os.path.join(mask_dir, '{}_mask.png'.format(img_name))

    with open(file_path, 'wb') as out_stream:
        for chunk in img_file:
            out_stream.write(chunk)    
            
with ThreadPoolExecutor(n_threads) as e: 
    e.map(load_mask, masks_id, masks_names, timeout=60)
print('done.')

done.
