# Data collector
Data loader from ISIC

In [2]:
''' 
Load libs 
'''
import csv
import os
import json
import numpy as np
import pandas as pd

from concurrent.futures import ThreadPoolExecutor

# Make ISIC API requester 

In [3]:
'''
Make ISIC Api request class
'''
import requests

class ISICApi(object):
    def __init__(self, hostname='https://isic-archive.com',
                 username=None, password=None):
        self.baseUrl = f'{hostname}/api/v1'
        self.authToken = None

        if username is not None:
            if password is None:
                password = input(f'Password for user "{username}":')
            self.authToken = self._login(username, password)

    def _makeUrl(self, endpoint):
        return f'{self.baseUrl}/{endpoint}'

    def _login(self, username, password):
        authResponse = requests.get(
            self._makeUrl('user/authentication'),
            auth=(username, password)
        )
        if not authResponse.ok:
            raise Exception(f'Login error: {authResponse.json()["message"]}')

        authToken = authResponse.json()['authToken']['token']
        return authToken

    def get(self, endpoint):
        url = self._makeUrl(endpoint)
        headers = {'Girder-Token': self.authToken} if self.authToken else None
        return requests.get(url, headers=headers)

    def getJson(self, endpoint):
        return self.get(endpoint).json()

    def getJsonList(self, endpoint):
        endpoint += '&' if '?' in endpoint else '?'
        LIMIT = 50
        offset = 0
        while True:
            resp = self.get(
                f'{endpoint}limit={LIMIT:d}&offset={offset:d}'
            ).json()
            if not resp:
                break
            for elem in resp:
                yield elem
            offset += LIMIT


# Get list of images names

In [4]:
#Insert Username and Password Below
api = ISICApi(username="veaxvoid", password="ZXCASD123qweasd")

limit = 24000
offset = 0

image_list = api.getJson('image?limit={}&offset={}&sort=name'.format(limit, offset))
print('Requested {} images names'.format(len(image_list)))

Requested 23906 images names


In [6]:
'''
Spesify output folders
'''
meta_dir = 'ISIC'
img_dir = meta_dir + '/images'
mask_dir = meta_dir + '/masks'
img_info_dir = meta_dir + '/info_images'
mask_info_dir = meta_dir + '/info_masks'
imgs_filename = 'imagedata'
mask_filename = 'masksdata'

if not os.path.exists(meta_dir): os.makedirs(meta_dir)
if not os.path.exists(img_dir): os.makedirs(img_dir)
if not os.path.exists(mask_dir): os.makedirs(mask_dir)
if not os.path.exists(img_info_dir): os.makedirs(img_info_dir)
if not os.path.exists(mask_info_dir): os.makedirs(mask_info_dir)

In [5]:
'''
Load info for each img
'''
def load_img_info(img_id, img_name):
    image_info = api.getJson('image/{}'.format(img_id))
    
    if image_info['dataset']['name'] != 'SONIC':
        file_path = os.path.join(img_info_dir, img_name+'.json')
        file = open(file_path, "w")
        json.dump(image_info, file)
        file.close()
    
imgs_id = [info['_id'] for info in image_list]
imgs_name = [info['name'] for info in image_list]

with ThreadPoolExecutor(12) as e: e.map(load_img_info, imgs_id, imgs_name)

In [6]:
'''
Collect all image info from json files in to csv table
'''
# read filenames in infoimages dir
infoimgs_filenames = sorted([f for f in os.listdir(img_info_dir)])

image_details = []

for img_name in infoimgs_filenames:
    file_path = os.path.join(img_info_dir, img_name)
    file = open(file_path, 'r')
    info = json.load(file)
    file.close()
    
    image_details += [info]

print('Number of valid images: {}'.format(len(image_details)))

Number of valid images: 14655


In [7]:
'''
Image info to csv
'''
file_path = os.path.join(meta_dir, imgs_filename+'.csv')

# Determine the union of all image metadata fields
metadata_fields = set(
    field
    for image_detail in image_details
    for field in image_detail['meta']['clinical'].keys()
)
metadata_fields = ['isic_name','isic_id'] + sorted(metadata_fields) + ['dataset_name']

# Write the metadata to a CSV
print('Writing metadata to CSV: {}'.format(imgs_filename+'.csv'))

with open(file_path, 'w') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadata_fields)
    csvWriter.writeheader()
    
    for image_detail in image_details:
        row_dict = image_detail['meta']['clinical'].copy()
        row_dict['isic_name'] = image_detail['name']
        row_dict['isic_id'] = image_detail['_id']
        row_dict['dataset_name'] = image_detail['dataset']['name']
        csvWriter.writerow(row_dict)

Writing metadata to CSV: imagedata.csv


In [7]:
file_path = os.path.join(meta_dir, imgs_filename+'.csv')
data = pd.read_csv(file_path)

images_ids = list(data['isic_id'])
images_names = list(data['isic_name'])

data.tail()

Unnamed: 0,isic_name,isic_id,age_approx,anatom_site_general,benign_malignant,clin_size_long_diam_mm,diagnosis,diagnosis_confirm_type,family_hx_mm,mel_class,mel_thick_mm,melanocytic,nevus_type,personal_hx_mm,sex,dataset_name
14650,ISIC_0034317,5aaf162711659769136471fb,70.0,lower extremity,malignant,,melanoma,histopathology,,,,True,,,female,HAM10000
14651,ISIC_0034318,5aaf16271165976913647206,55.0,,,,pigmented benign keratosis,histopathology,,,,False,,,male,HAM10000
14652,ISIC_0034319,5aaf16271165976913647214,30.0,,benign,,nevus,histopathology,,,,True,,,male,HAM10000
14653,ISIC_0034320,5aaf16271165976913647223,25.0,anterior torso,benign,,nevus,histopathology,,,,True,,,female,HAM10000
14654,ISIC_0036065,5ab2d0441165976819ab1566,80.0,,malignant,,melanoma,histopathology,,melanoma in situ,0.0,True,,,male,2018 JID Editorial Images


In [8]:
info_sum = {d:(data['diagnosis'] == d).sum() for d in data['diagnosis'].unique()}
print('Data summary:')
info_sum

Data summary:


{'nevus': 9315,
 'melanoma': 2169,
 nan: 0,
 'seborrheic keratosis': 419,
 'lichenoid keratosis': 1,
 'dermatofibroma': 122,
 'angioma': 15,
 'basal cell carcinoma': 587,
 'solar lentigo': 57,
 'lentigo NOS': 71,
 'atypical melanocytic proliferation': 13,
 'lentigo simplex': 27,
 'actinic keratosis': 132,
 'other': 10,
 'angiofibroma or fibrous papule': 1,
 'squamous cell carcinoma': 225,
 'scar': 1,
 'pigmented benign keratosis': 1099,
 'vascular lesion': 142}

# Load images

In [9]:
def load_imgs(img_id, img_name):
    img_file = api.get('image/{}/download'.format(img_id))
    img_file.raise_for_status()
    file_path = os.path.join(img_dir, '{}.png'.format(img_name))
  
    with open(file_path, 'wb') as out_stream:
        for chunk in img_file:
            out_stream.write(chunk)
    
with ThreadPoolExecutor(12) as e: e.map(load_imgs, images_ids, images_names)

# Load segmentation masks 

In [None]:
# load segmentation data

def load_mask_info(img_id, img_name):
    segmentation_data = api.getJson('segmentation?imageId={}'.format(img_id))
    
    file_path = os.path.join(mask_info_dir, img_name+'.json')
    file = open(file_path, "w")
    json.dump(segmentation_data, file)
    file.close()
    
with ThreadPoolExecutor(12) as e: e.map(load_mask_info, images_ids, images_names)

In [34]:
'''
Collect all VALID masks info from json files 
'''
segmentation_details = []

for img_name in images_names:
    file_path = os.path.join(mask_info_dir, img_name+'.json')
    file = open(file_path, 'r')
    info = json.load(file)
    file.close()
    
    if info:
        if not info[0]['failed']:
            segmentation_details += [info[0]]
            segmentation_details[-1]['name'] = img_name

print('Number of segmentation info files: {}'.format(len(segmentation_details)))

Number of segmentation info files: 4528


In [47]:
'''
Masks info to csv
'''
file_path = os.path.join(meta_dir, mask_filename+'.csv')

# Determine the union of all image metadata fields
metadata_fields = set(
    segmentation_details[0].keys()
)

metadata_fields = sorted(metadata_fields)

# Write the metadata to a CSV
print('Writing metadata to CSV: {}'.format(mask_filename+'.csv'))

with open(file_path, 'w') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadata_fields)
    csvWriter.writeheader()
    
    k=-1
    for info in segmentation_details:
        k+=1
        row_dict = info.copy()
        csvWriter.writerow(row_dict)


Writing metadata to CSV: masksdata.csv


In [48]:
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,_id,created,failed,name,skill
0,5463934bbae47821f88025ad,2014-11-12T17:05:15.624000+00:00,False,ISIC_0000000,expert
1,545d0fadbae47821f880257b,2014-11-07T18:30:05.422000+00:00,False,ISIC_0000001,expert
2,545d0c6bbae47821f880256c,2014-11-07T18:16:11.251000+00:00,False,ISIC_0000002,expert
3,545d0aeabae47821f880255c,2014-11-07T18:09:46.020000+00:00,False,ISIC_0000003,expert
4,54ea8ac3bae47871b5e00cd2,2015-02-23T02:04:51.455000+00:00,False,ISIC_0000004,expert


In [50]:
def load_mask(mask_id, img_name):
    img_file = api.get('segmentation/{}/mask'.format(mask_id))
    img_file.raise_for_status()
    file_path = os.path.join(mask_dir, '{}_mask.png'.format(img_name))

    with open(file_path, 'wb') as out_stream:
        for chunk in img_file:
            out_stream.write(chunk)    
            
masks_names = list(data['name'])
masks_id = list(data['_id'])

with ThreadPoolExecutor(12) as e: e.map(load_mask, masks_id, masks_names)