In [1]:
import json
import urllib
import multiprocessing
import sys
import pandas as pd
import requests

def read_json(filepath):
    """Load a JSON file and return its decoded contents.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the document (a dict for the
        dataset files used in this notebook, which are indexed by key).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the contents are not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the platform locale and can mis-decode non-ASCII text.
    with open(filepath, encoding='utf-8') as data_file:
        return json.load(data_file)

def export_labels(filepath, outputpath):
    """Dump the ``annotations`` records of a dataset JSON file to CSV.

    Args:
        filepath: Path of the dataset JSON file; it must contain an
            ``'annotations'`` key.
        outputpath: Destination path of the CSV file. The DataFrame index
            is not written.
    """
    labels_frame = pd.DataFrame(read_json(filepath)['annotations'])
    labels_frame.to_csv(outputpath, index=False)

def download_image(image, directory, timeout):
    """Download one image to ``<directory>/pic_<image_id>.png`` (best effort).

    Failures are not raised: the image id is printed, followed by a comma,
    so that a run leaves a comma-separated list of failed ids on stdout.

    Args:
        image: Mapping with an ``'image_id'`` entry and a ``'url'`` entry
            whose first element is the address to fetch.
        directory: Existing directory that receives the file.
        timeout: Timeout forwarded to ``requests.get``.

    Raises:
        KeyError: If ``image`` has no ``'image_id'`` entry. Every other
            ordinary error (bad record, network failure, HTTP error status,
            write failure) is reported on stdout instead of being raised.
    """
    import os  # function-scope import: the notebook's import cell does not load os

    image_id = image['image_id']
    output_path = '{}/pic_{}.png'.format(directory, image_id)
    # Stream into a sibling temporary file so an interrupted transfer never
    # leaves a truncated image under the final name.
    partial_path = output_path + '.part'

    try:
        url = image['url'][0]
        # The context manager returns the connection to the pool even when
        # the body is only partially consumed.
        with requests.get(url, timeout=timeout, stream=True) as response:
            # Without this check, 4xx/5xx error pages would be stored as images.
            response.raise_for_status()
            with open(partial_path, 'wb') as fh:
                for chunk in response.iter_content(1024 * 1024):
                    fh.write(chunk)
        os.replace(partial_path, output_path)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit can still stop the worker.
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or the file is already gone
        print(image_id, end=',')

    sys.stdout.flush()
    
def download_pictures(filepath, directory, pool_size, timeout):
    """Download every image listed in a dataset JSON file using a process pool.

    Args:
        filepath: Path of the dataset JSON file; it must contain an
            ``'images'`` key holding the records passed to ``download_image``.
        directory: Directory that receives the images. It is created
            (including parents) when it does not exist yet.
        pool_size: Number of worker processes for ``multiprocessing.Pool``.
        timeout: Timeout forwarded to ``download_image`` for each request.

    Note:
        The results of ``apply_async`` are never collected, so an exception
        raised inside a worker is not propagated to the caller.
    """
    import os  # function-scope import: the notebook's import cell does not load os

    images = read_json(filepath)['images']

    # Workers write straight into `directory`; if it were missing, every
    # single download would fail on open().
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Skip spawning pool_size processes when there is nothing to fetch.
    if images:
        pool = multiprocessing.Pool(pool_size)
        try:
            for image in images:
                pool.apply_async(download_image, [image, directory, timeout])
        finally:
            # Always release the workers, even if submitting a task raised.
            pool.close()
            pool.join()

    print("Downloaded images for {}".format(filepath))

  (fname, cnt))
  (fname, cnt))


In [2]:
# Load the test-split dataset JSON so its contents can be inspected.
test_size_json = read_json('../data/test.json')

In [4]:
# Number of entries under the 'images' key of the loaded test JSON.
len(test_size_json['images'])

12800

In [5]:
# Export the validation annotations to a CSV file next to the source JSON.
export_labels('../data/validation.json', '../data/validation-labels.csv')

In [3]:
# NOTE(review): despite the name `test_json`, this loads the validation split.
test_json = read_json('../data/validation.json')

In [6]:
# Preview the first rows of the 'annotations' records as a DataFrame.
pd.DataFrame(test_json['annotations']).head()

Unnamed: 0,image_id,label_id
0,1,38
1,2,63
2,3,33
3,4,126
4,5,18


In [3]:
# Write one labels CSV per annotated split (source JSON -> destination CSV).
label_exports = [
    ('../data/train.json', '../data/train-labels.csv'),
    ('../data/validation.json', '../data/validation-labels.csv'),
]
for json_path, csv_path in label_exports:
    export_labels(json_path, csv_path)

In [None]:
%%time
# Fetch every image listed in train.json into ../data/train
# (pool_size=500 worker processes, timeout=1 per request).
download_pictures('../data/train.json', '../data/train', 500, 1)

In [None]:
%%time
# Fetch every image listed in validation.json into ../data/validation
# (pool_size=500 worker processes, timeout=1 per request). Paths use the same
# ../data prefix as every other cell in this notebook.
download_pictures('../data/validation.json', '../data/validation', 500, 1)

In [None]:
%%time
# Fetch every image listed in test.json into ../data/test
# (pool_size=500 worker processes, timeout=1 per request).
download_pictures('../data/test.json', '../data/test', 500, 1)