# Download Data

In [144]:
import json
import urllib
import multiprocessing
import sys
import pandas as pd
import requests
import os

## Definitions of functions used to sync images

In [165]:
def download_image(image_id, url, output_directory, timeout):
    """Download one image to ``<output_directory>/pic_<image_id>.png``.

    The function is best-effort: every failure is printed to stdout and
    swallowed, so one bad URL never aborts a batch. If the file was opened
    but the body could not be streamed to the end, the truncated file is
    removed again so that it cannot be mistaken for a finished download.

    Args:
        image_id: Identifier used to build the output file name.
        url: Address the image is fetched from.
        output_directory: Directory that receives the file.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        None.
    """
    output_path = '{}/pic_{}.png'.format(output_directory, image_id)
    file_opened = False
    completed = False

    try:
        response = requests.get(url, timeout=timeout, stream=True)
        try:
            if response.status_code != requests.codes.OK:
                # Report the actual cause; a timeout surfaces as an exception
                # raised by requests itself, not as a bad status code.
                raise Exception(
                    'Request failed with status code {}.'.format(response.status_code))

            file_opened = True
            with open(output_path, 'wb') as fh:
                for chunk in response.iter_content(1024 * 1024):
                    fh.write(chunk)
            completed = True
        finally:
            # A streamed response keeps its connection until it is closed.
            response.close()

    except Exception as e:
        print('{}: {}'.format(image_id, e))

    except BaseException:
        # Anything that is not an Exception (e.g. KeyboardInterrupt) is
        # reported and swallowed as well, as the original bare except did.
        print('Unexpected error: {}'.format(sys.exc_info()[0]))

    finally:
        if file_opened and not completed:
            # Drop the partial file so a later directory scan does not count it.
            try:
                os.remove(output_path)
            except OSError:
                pass

    sys.stdout.flush()
    
def download_image_list(image_ids, image_id_to_url, output_directory, pool_size, timeout):
    """Download a batch of images using a pool of worker processes.

    One ``download_image`` task is scheduled per id; the call returns once
    every task has finished. Task results are not collected.

    Args:
        image_ids: Sized collection of the ids to download.
        image_id_to_url: Mapping from image id to the URL to fetch.
        output_directory: Directory handed to ``download_image``.
        pool_size: Upper bound on the number of worker processes; ``None``
            keeps the default of ``multiprocessing.Pool``.
        timeout: Timeout in seconds handed to ``download_image``.

    Returns:
        None.
    """
    print("Attempting to download {} images in {}".format(len(image_ids), output_directory))

    if len(image_ids) == 0:
        # Nothing to fetch: do not pay for spawning worker processes.
        return

    # Never start more workers than there are images to download.
    if pool_size is None:
        worker_count = None
    else:
        worker_count = min(pool_size, len(image_ids))

    # The context manager terminates the workers if scheduling raises, so a
    # failure here cannot leave a pool of processes behind.
    with multiprocessing.Pool(worker_count) as pool:
        for image_id in image_ids:
            pool.apply_async(
                download_image,
                [image_id, image_id_to_url[image_id], output_directory, timeout])

        pool.close()
        pool.join()

In [186]:
def read_json(filepath):
    """Parse the JSON document stored at ``filepath`` and return the result."""
    with open(filepath) as handle:
        return json.load(handle)

def image_id_to_url(filepath):
    """Build an ``image_id -> url`` lookup from the JSON file at ``filepath``.

    Each entry under the top-level ``'images'`` key contributes its
    ``'image_id'`` as the key and the first element of its ``'url'`` value
    as the value.
    """
    entries = read_json(filepath)['images']
    return {entry['image_id']: entry['url'][0] for entry in entries}
    
def images_in_directory(directory):
    """Return the set of image ids that have a file in ``directory``.

    The id is the integer between the first ``'_'`` and the following ``'.'``
    of a file name (``pic_123.png`` -> ``123``). Entries whose name does not
    have that shape are skipped instead of raising, so a stray file such as
    ``.DS_Store`` or ``.ipynb_checkpoints`` cannot break the scan.

    Args:
        directory: Directory to list.

    Returns:
        A set of ints, one per file name that could be parsed.
    """
    image_ids = set()
    for filename in os.listdir(directory):
        try:
            image_ids.add(int(filename.split('_')[1].split('.')[0]))
        except (IndexError, ValueError):
            # No '_' in the name, or the part after it is not an integer.
            continue
    return image_ids

In [173]:
def sync_images(id_to_url_filepath, output_directory, pool_size, timeout):
    """Download every listed image that is absent from ``output_directory``.

    Args:
        id_to_url_filepath: JSON file handed to ``image_id_to_url``.
        output_directory: Directory scanned for existing images and used as
            the download target.
        pool_size: Worker count handed to ``download_image_list``.
        timeout: Timeout in seconds handed to ``download_image_list``.

    Returns:
        A dict ``image_id -> url`` holding the images that are still absent
        from ``output_directory`` after the download attempt.
    """
    id_to_url = image_id_to_url(id_to_url_filepath)

    missing_ids = set(id_to_url) - images_in_directory(output_directory)
    download_image_list(missing_ids, id_to_url, output_directory, pool_size, timeout)

    # Scan again after the attempt: only ids without a file are unresolved.
    # Returning the pre-download set would also report successful downloads.
    still_missing = missing_ids - images_in_directory(output_directory)

    return {image_id: id_to_url[image_id] for image_id in still_missing}
    
def iterative_image_download(id_to_url_filepath, output_directory, missing_images_path,
                             pool_size, timeouts=None):
    """Run ``sync_images`` repeatedly and record the result of the last run.

    One round is run per entry of ``timeouts``; the rounds stop early as soon
    as a round returns an empty mapping, because later rounds would have
    nothing to download. The mapping returned by the last executed round is
    written to ``missing_images_path`` as indented JSON with sorted keys.

    Args:
        id_to_url_filepath: JSON file handed to ``sync_images``.
        output_directory: Directory handed to ``sync_images``.
        missing_images_path: File that receives the JSON report.
        pool_size: Worker count handed to ``sync_images``.
        timeouts: Iterable of per-round timeouts in seconds. Defaults to
            eight rounds of 1 second followed by one round of 5 seconds.

    Returns:
        None.
    """
    if timeouts is None:
        timeouts = 8 * [1] + [5]

    unresolved_images = {}
    for timeout in timeouts:
        unresolved_images = sync_images(
            id_to_url_filepath, output_directory, pool_size, timeout)
        if not unresolved_images:
            break

    with open(missing_images_path, 'w') as missing_file:
        json.dump(unresolved_images, missing_file, sort_keys=True, indent=4)

## Syncing images

In [180]:
# Path templates: the '{}' placeholder is filled in with the dataset name
# ('train', 'validation' or 'test') through str.format in the cells below.
id_to_url_filepath = '../data/id-to-url/{}.json'
images_directory = '../data/{}_images/'
# Directory and per-dataset file that receive the JSON report written by
# iterative_image_download.
missing_images_directory = '../data/missing_images_due_to_bad_download/'
missing_images_filepath = os.path.join(missing_images_directory, 'missing_{}.json')

In [181]:
# Create the per-dataset image directories and the directory for the
# missing-image reports. os.makedirs replaces the `!mkdir -p` shell escape so
# the cell does not depend on a POSIX shell or on shell interpolation;
# exist_ok=True keeps the cell safe to re-run.
for dataset in ['train', 'validation', 'test']:
    os.makedirs(images_directory.format(dataset), exist_ok=True)

os.makedirs(missing_images_directory, exist_ok=True)

In [None]:
%%time
dataset = 'train'

iterative_image_download(
    id_to_url_filepath.format(dataset),
    images_directory.format(dataset),
    missing_images_filepath.format(dataset),
    500)

In [None]:
%%time
dataset = 'validation'

iterative_image_download(
    id_to_url_filepath.format(dataset),
    images_directory.format(dataset),
    missing_images_filepath.format(dataset),
    500)

In [None]:
%%time
dataset = 'test'

iterative_image_download(
    id_to_url_filepath.format(dataset),
    images_directory.format(dataset),
    missing_images_filepath.format(dataset),
    500)

## Zip and upload to S3

In [None]:
!sudo apt-get install zip

In [None]:
%%time
# Recursively archive the training image directory into train_images.zip.
!zip -r ../data/train_images.zip ../data/train_images/

In [None]:
%%time
# Recursively archive the validation image directory into validation_images.zip.
!zip -r ../data/validation_images.zip ../data/validation_images/

In [None]:
%%time
!zip -r ../data/train_images.zip ../data/train_images/

In [None]:
!sudo apt-get install aws-cli

In [None]:
# Copy the three image archives to the furniture-kaggle S3 bucket.
!aws s3 cp ../data/train_images.zip s3://furniture-kaggle/
!aws s3 cp ../data/validation_images.zip s3://furniture-kaggle/
!aws s3 cp ../data/test_images.zip s3://furniture-kaggle/

In [None]:
# Copy the per-dataset missing-image JSON reports to the furniture-kaggle
# S3 bucket.
!aws s3 cp ../data/missing_images_due_to_bad_download/missing_train.json s3://furniture-kaggle/
!aws s3 cp ../data/missing_images_due_to_bad_download/missing_validation.json s3://furniture-kaggle/
!aws s3 cp ../data/missing_images_due_to_bad_download/missing_test.json s3://furniture-kaggle/

## Exporting labels

In [187]:
def export_labels(filepath, outputpath):
    """Write the ``'annotations'`` entries of a JSON file to a CSV file.

    Args:
        filepath: JSON file read with ``read_json``; its top-level
            ``'annotations'`` value is turned into a DataFrame.
        outputpath: Destination of the CSV, written without the index column.
    """
    label_frame = pd.DataFrame(read_json(filepath)['annotations'])
    label_frame.to_csv(outputpath, index=False)

In [188]:
# Export the annotations of the train and validation sets as CSV files.
export_labels(
    filepath='../data/train.json',
    outputpath='../data/train-labels.csv',
)
export_labels(
    filepath='../data/validation.json',
    outputpath='../data/validation-labels.csv',
)

In [189]:
# Copy the exported label CSV files to the furniture-kaggle S3 bucket.
!aws s3 cp ../data/train-labels.csv s3://furniture-kaggle/
!aws s3 cp ../data/validation-labels.csv s3://furniture-kaggle/

upload: ../data/train-labels.csv to s3://furniture-kaggle/train-labels.csv
upload: ../data/validation-labels.csv to s3://furniture-kaggle/validation-labels.csv
