# Data collector
Data loader from ISIC

In [10]:
''' 
Load libs 
'''
import csv
import os
import json
import numpy as np
import pandas as pd

from concurrent.futures import ThreadPoolExecutor

# Make ISIC API requester 

In [2]:
'''
Make ISIC Api request class
'''
import requests

class ISICApi(object):
    """Small client for the ISIC Archive REST API rooted at ``<hostname>/api/v1``.

    If a username is given, the constructor logs in and stores the returned
    token; the token is then sent as the ``Girder-Token`` header on every
    request. Without a username all requests are sent without that header.
    """

    def __init__(self, hostname='https://isic-archive.com',
                 username=None, password=None):
        """Set the base URL and, when a username is supplied, authenticate.

        A missing password is requested interactively via ``input``.
        """
        self.baseUrl = f'{hostname}/api/v1'
        self.authToken = None

        # No username: stay unauthenticated.
        if username is None:
            return

        if password is None:
            password = input(f'Password for user "{username}":')
        self.authToken = self._login(username, password)

    def _makeUrl(self, endpoint):
        """Return the absolute URL for ``endpoint`` under the API root."""
        return '{}/{}'.format(self.baseUrl, endpoint)

    def _login(self, username, password):
        """Authenticate with HTTP basic auth and return the session token.

        Raises:
            Exception: if the server answers with a non-OK status; the
                message is taken from the JSON body of the response.
        """
        reply = requests.get(
            self._makeUrl('user/authentication'),
            auth=(username, password),
        )
        if reply.ok:
            return reply.json()['authToken']['token']

        raise Exception(f'Login error: {reply.json()["message"]}')

    def get(self, endpoint):
        """Issue a GET for ``endpoint`` and return the raw response object."""
        headers = None
        if self.authToken:
            headers = {'Girder-Token': self.authToken}
        return requests.get(self._makeUrl(endpoint), headers=headers)

    def getJson(self, endpoint):
        """GET ``endpoint`` and return the decoded JSON body."""
        response = self.get(endpoint)
        return response.json()

    def getJsonList(self, endpoint):
        """Yield every element of a paginated list endpoint.

        Pages of 50 elements are requested with ``limit``/``offset`` query
        parameters until the server returns an empty page.
        """
        # Continue an existing query string with '&', otherwise start one.
        separator = '&' if '?' in endpoint else '?'
        prefix = endpoint + separator

        page_size = 50
        offset = 0
        while True:
            page = self.get(
                f'{prefix}limit={page_size:d}&offset={offset:d}'
            ).json()
            if not page:
                return
            yield from page
            offset += page_size


# Get the list of image names

In [3]:
# Credentials come from the environment so that no secret is stored in the
# notebook. Set ISIC_USERNAME (and optionally ISIC_PASSWORD) before running.
# With no username the client sends unauthenticated requests; with a username
# but no password, ISICApi prompts for the password interactively.
api = ISICApi(
    username=os.environ.get('ISIC_USERNAME'),
    password=os.environ.get('ISIC_PASSWORD'),
)

# Request a single page of image records, sorted by name.
limit = 2000
offset = 0

image_list = api.getJson(f'image?limit={limit}&offset={offset}&sort=name')

In [4]:
# Specify output folders and the base names of the generated CSV files.
meta_dir = 'ISIC'
img_dir = meta_dir + '/images'
mask_dir = meta_dir + '/masks'
img_info_dir = meta_dir + '/info_images'
mask_info_dir = meta_dir + '/info_masks'
imgs_filename = 'imagedata'
mask_filename = 'masksdata'

# exist_ok=True makes directory creation idempotent and avoids the race
# between a separate "exists?" check and the creation itself.
for out_dir in (meta_dir, img_dir, mask_dir, img_info_dir, mask_info_dir):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
'''
Load info for each img
'''
def load_img_info(img_id, img_name):
    """Fetch the detail record of one image and save it as a JSON file.

    Requests ``image/<img_id>`` through the module-level ``api`` client and
    writes the decoded response to ``<img_info_dir>/<img_name>.json``.

    Args:
        img_id: Identifier placed in the request path.
        img_name: Base name (without extension) of the output file.
    """
    image_info = api.getJson('image/{}'.format(img_id))

    file_path = os.path.join(img_info_dir, img_name + '.json')
    # The context manager closes the file even if serialization fails.
    with open(file_path, 'w') as out_file:
        json.dump(image_info, out_file)
    
# Parallel lists: imgs_id[i] and imgs_name[i] describe the same image.
imgs_id = [info['_id'] for info in image_list]
imgs_name = [info['name'] for info in image_list]

# Executor.map only re-raises a worker's exception when its result is
# retrieved, so the results are consumed here; otherwise a failed request
# would pass unnoticed and leave a missing JSON file behind.
with ThreadPoolExecutor(12) as e:
    list(e.map(load_img_info, imgs_id, imgs_name))

In [None]:
# Collect the image info stored in the per-image JSON files into a list
# (one entry per name in imgs_name, in the same order).
image_details = []

for img_name in imgs_name:
    file_path = os.path.join(img_info_dir, img_name + '.json')
    # 'with' guarantees the file is closed even if the JSON is malformed.
    with open(file_path, 'r') as in_file:
        image_details.append(json.load(in_file))

In [None]:
# Image info to csv
file_path = os.path.join(meta_dir, imgs_filename + '.csv')

# Determine the union of all image metadata fields
metadata_fields = {
    field
    for image_detail in image_details
    for field in image_detail['meta']['clinical']
}
metadata_fields = ['isic_id'] + sorted(metadata_fields)

# Write the metadata to a CSV
print('Writing metadata to CSV: {}'.format(imgs_filename+'.csv'))

# Open with 'w' rather than 'a': appending would add a second header row and
# duplicate every record each time this cell is re-run. newline='' is what
# the csv module expects so that it controls line endings itself.
with open(file_path, 'w', newline='') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadata_fields)
    csvWriter.writeheader()

    for image_detail in image_details:
        # Copy so the in-memory record is not modified by adding 'isic_id'.
        row_dict = image_detail['meta']['clinical'].copy()
        row_dict['isic_id'] = image_detail['name']
        csvWriter.writerow(row_dict)

In [None]:
# Load the CSV at file_path into a DataFrame and display its last rows.
data = pd.read_csv(file_path)
data.tail()

# Load images

In [None]:
def load_imgs(img_id, img_name):
    """Download one image and save it as ``<img_dir>/<img_name>.png``.

    The request goes through the module-level ``api`` client;
    ``raise_for_status()`` is called on the response before anything is
    written to disk.
    """
    response = api.get(f'image/{img_id}/download')
    response.raise_for_status()

    target_path = os.path.join(img_dir, f'{img_name}.png')
    with open(target_path, 'wb') as out_stream:
        # Iterating the response yields the body in chunks; write them in order.
        out_stream.writelines(response)
    
# Consume the map results so that an exception raised in any worker (e.g. by
# raise_for_status) is re-raised here instead of being silently discarded.
with ThreadPoolExecutor(12) as e:
    list(e.map(load_imgs, imgs_id, imgs_name))

# Load segmentation masks 

In [None]:
# load segmentation data

def load_mask_info(img_id, img_name):
    """Fetch the segmentation records of one image and save them as JSON.

    Requests ``segmentation?imageId=<img_id>`` through the module-level
    ``api`` client and writes the decoded response to
    ``<mask_info_dir>/<img_name>.json``.

    Args:
        img_id: Identifier passed as the ``imageId`` query parameter.
        img_name: Base name (without extension) of the output file.
    """
    segmentation_data = api.getJson('segmentation?imageId={}'.format(img_id))

    file_path = os.path.join(mask_info_dir, img_name + '.json')
    # The context manager closes the file even if serialization fails.
    with open(file_path, 'w') as out_file:
        json.dump(segmentation_data, out_file)
    
# Consume the map results so that an exception raised in any worker is
# re-raised here instead of being silently discarded.
with ThreadPoolExecutor(12) as e:
    list(e.map(load_mask_info, imgs_id, imgs_name))

In [9]:
# Collect mask info from the per-image JSON files.
# Only the first segmentation record of each image is kept, so that
# segmentation_details[i] stays aligned with imgs_name[i].
# NOTE: an image whose JSON file holds an empty list raises IndexError here.
segmentation_details = []

for img_name in imgs_name:
    file_path = os.path.join(mask_info_dir, img_name + '.json')
    # 'with' guarantees the file is closed even if the JSON is malformed.
    with open(file_path, 'r') as in_file:
        info = json.load(in_file)

    segmentation_details.append(info[0])


In [46]:
# Masks info to csv
file_path = os.path.join(meta_dir, mask_filename + '.csv')

# Determine the union of the fields of all segmentation records. Taking the
# keys of the first record only would make DictWriter raise ValueError as
# soon as another record carries an additional key.
metadata_fields = {
    field
    for info in segmentation_details
    for field in info
}

metadata_fields = ['isic_id'] + sorted(metadata_fields)

# Write the metadata to a CSV
print('Writing metadata to CSV: {}'.format(mask_filename+'.csv'))

# Open with 'w' rather than 'a': appending would add a second header row and
# duplicate every record each time this cell is re-run. newline='' is what
# the csv module expects so that it controls line endings itself.
with open(file_path, 'w', newline='') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadata_fields)
    csvWriter.writeheader()

    # segmentation_details is positionally aligned with imgs_name.
    for img_name, info in zip(imgs_name, segmentation_details):
        row_dict = info.copy()
        row_dict['isic_id'] = img_name
        csvWriter.writerow(row_dict)


Writing metadata to CSV: masksdata.csv


In [54]:
# Load the CSV at file_path and count the rows whose 'skill' column equals
# 'novice'.
data = pd.read_csv(file_path)
(data['skill'] == 'novice').sum()
# 1769 + 231 = 2000

231

In [None]:
def load_mask(mask_id, img_name):
    """Download one segmentation mask to ``<mask_dir>/<img_name>_mask.png``.

    The request goes through the module-level ``api`` client;
    ``raise_for_status()`` is called on the response before anything is
    written to disk.
    """
    response = api.get(f'segmentation/{mask_id}/mask')
    response.raise_for_status()

    target_path = os.path.join(mask_dir, f'{img_name}_mask.png')
    with open(target_path, 'wb') as out_stream:
        # Iterating the response yields the body in chunks; write them in order.
        out_stream.writelines(response)
            
# masks_id[i] is the segmentation chosen for the image named imgs_name[i].
masks_id = [info['_id'] for info in segmentation_details]

# Consume the map results so that an exception raised in any worker (e.g. by
# raise_for_status) is re-raised here instead of being silently discarded.
with ThreadPoolExecutor(12) as e:
    list(e.map(load_mask, masks_id, imgs_name))