In [1]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import numpy as np
import pandas as pd
import requests
import time
import os

# Directory the downloaded images are saved into.
BASE_PATH = 'data/unsplash-raw/'
# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(BASE_PATH, exist_ok=True)
# Number of images requested per batch by the download loop.
BATCH_SIZE = 5000

In [2]:
def download_image(photo_id, photo_image_url, base_path=BASE_PATH, TIMEOUT=10):
    """Download one image and save it as ``<photo_id>.png`` inside ``base_path``.

    Args:
        photo_id: Identifier used as the stem of the output file name.
        photo_image_url: URL fetched with ``requests.get``.
        base_path: Directory the file is written into. Defaults to BASE_PATH.
        TIMEOUT: Value passed to ``requests.get`` as ``timeout``.

    Returns:
        True if the request succeeded and the file was written, False if
        anything went wrong (network error, HTTP error status, I/O error).
        Failures are reported through the return value rather than raised so
        that one bad URL does not abort a whole ``executor.map`` run.
    """
    try:
        response = requests.get(photo_image_url, timeout=TIMEOUT)
        # Without this, a 404/429/5xx error page would be written to disk as
        # if it were an image and the download reported as successful.
        response.raise_for_status()
        # Honour the base_path argument instead of the module-level constant.
        # The bytes are written exactly as received; the .png suffix is kept
        # so file names stay the same as before.
        with open(os.path.join(base_path, f'{photo_id}.png'), 'wb') as binary:
            binary.write(response.content)
    except Exception:
        # Deliberately broad: callers only need a success/failure flag.
        return False
    return True

In [3]:
# Load the tab-separated photo metadata table and preview its first rows.
df = pd.read_table('data/photos.tsv000')
df.head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash
0,wud-eV6Vpwo,https://unsplash.com/photos/wud-eV6Vpwo,https://images.unsplash.com/photo-143924685475...,2015-08-10 22:48:30.841999,t,4273,2392,1.79,,sergio_rola,...,,,7515660,42055,silhouette of structure under red sky,,,,,LJMymdi{1IWo}Gj[w^WVICS#bbS2
1,psIMdj26lgw,https://unsplash.com/photos/psIMdj26lgw,https://images.unsplash.com/photo-144077331099...,2015-08-28 14:49:40.016052,t,3872,2176,1.78,,xcvii,...,,,1814817,5893,selective focus photography of black animal ne...,,,,,"LKKd}R^,bJD%~q4Txu%N%gxuD$xu"
2,2EDjes2hlZo,https://unsplash.com/photos/2EDjes2hlZo,https://images.unsplash.com/photo-144683489809...,2015-11-06 18:36:17.334458,t,2560,1707,1.5,Sunset reflection over river,imthinhvu,...,,,2708347,12420,photo of body body of water during golden hour,,,,,LeI{]g9u9u%1?KV@s8R-EAf#t5aL
3,WN8kSLy8KMQ,https://unsplash.com/photos/WN8kSLy8KMQ,https://images.unsplash.com/photo-144530812443...,2015-10-20 02:29:20.267471,t,2288,1520,1.51,Hiking The Mountains,bettenz,...,,,1616448,9773,green leafed trees between two rock formations,Zion National Park,37.250981,-112.950525,65.07215,LoDv=$sjD$bc.AV@ROWCtSn~s:Rj
4,QAXDmkU60OU,https://unsplash.com/photos/QAXDmkU60OU,https://images.unsplash.com/photo-144196149785...,2015-09-11 08:51:54.202624,t,2048,1371,1.49,,j,...,,,983884,9410,landscape photography of snow covered mountain...,,,,,LUIPMT9F%LoIBax]Rkj]Aet7Rjj[


In [4]:
# Parallel lists: position k in each one refers to the same photo.
photo_ids, photo_image_urls = (
    df[column].tolist() for column in ('photo_id', 'photo_image_url')
)
len(photo_ids)

25000

Use threads to pull the images. This could probably be sped up with asynchronous requests, but threads are quick enough for this example. The time delay between batches is not strictly necessary for this dataset: the Unsplash maintainers have stated that they will not block requests to these images from the Lite and Full datasets (within reason).

In [5]:
# One success flag per photo, collected in the same order as photo_ids
# (executor.map returns results in input order).
results = []
n_photos = len(photo_image_urls)
for i in range(0, n_photos, BATCH_SIZE):
    photo_id_batch = photo_ids[i:i+BATCH_SIZE]
    photo_image_url_batch = photo_image_urls[i:i+BATCH_SIZE]
    with ThreadPoolExecutor(max_workers=32) as executor:
        # executor.map returns a lazy iterator with no len(), so tqdm needs
        # the total passed explicitly to draw a real progress bar with an ETA.
        batch_results = list(tqdm(
            executor.map(download_image, photo_id_batch, photo_image_url_batch),
            total=len(photo_id_batch),
        ))
    results.extend(batch_results)
    if i + BATCH_SIZE < n_photos:
        print(f'Completed batch {i}, sleeping for 10 seconds.')
        time.sleep(10)
    else:
        # Last batch: nothing is left to download, so skip the pause.
        print(f'Completed batch {i}.')

5000it [02:51, 29.13it/s]


Completed batch 0, sleeping for 10 seconds.


5000it [03:39, 22.75it/s]


Completed batch 5000, sleeping for 10 seconds.


5000it [03:38, 22.86it/s]


Completed batch 10000, sleeping for 10 seconds.


5000it [02:58, 28.01it/s]


Completed batch 15000, sleeping for 10 seconds.


5000it [03:16, 25.43it/s]


Completed batch 20000, sleeping for 10 seconds.


To keep things simple, the data pull above has no retry logic. Instead, take a second pass over every image whose download was flagged as failed.

In [6]:
# Attach the per-photo download flags as a column. `results` was collected in
# photo_ids order, and photo_ids was taken from df, so positions line up with
# the rows of df.
df['results'] = results
# Preview the frame with the new column.
df.head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash,results
0,wud-eV6Vpwo,https://unsplash.com/photos/wud-eV6Vpwo,https://images.unsplash.com/photo-143924685475...,2015-08-10 22:48:30.841999,t,4273,2392,1.79,,sergio_rola,...,,7515660,42055,silhouette of structure under red sky,,,,,LJMymdi{1IWo}Gj[w^WVICS#bbS2,True
1,psIMdj26lgw,https://unsplash.com/photos/psIMdj26lgw,https://images.unsplash.com/photo-144077331099...,2015-08-28 14:49:40.016052,t,3872,2176,1.78,,xcvii,...,,1814817,5893,selective focus photography of black animal ne...,,,,,"LKKd}R^,bJD%~q4Txu%N%gxuD$xu",True
2,2EDjes2hlZo,https://unsplash.com/photos/2EDjes2hlZo,https://images.unsplash.com/photo-144683489809...,2015-11-06 18:36:17.334458,t,2560,1707,1.5,Sunset reflection over river,imthinhvu,...,,2708347,12420,photo of body body of water during golden hour,,,,,LeI{]g9u9u%1?KV@s8R-EAf#t5aL,True
3,WN8kSLy8KMQ,https://unsplash.com/photos/WN8kSLy8KMQ,https://images.unsplash.com/photo-144530812443...,2015-10-20 02:29:20.267471,t,2288,1520,1.51,Hiking The Mountains,bettenz,...,,1616448,9773,green leafed trees between two rock formations,Zion National Park,37.250981,-112.950525,65.07215,LoDv=$sjD$bc.AV@ROWCtSn~s:Rj,True
4,QAXDmkU60OU,https://unsplash.com/photos/QAXDmkU60OU,https://images.unsplash.com/photo-144196149785...,2015-09-11 08:51:54.202624,t,2048,1371,1.49,,j,...,,983884,9410,landscape photography of snow covered mountain...,,,,,LUIPMT9F%LoIBax]Rkj]Aet7Rjj[,True


In [7]:
# Boolean mask of the rows whose first download attempt failed.
idx = df['results'].eq(False)
# Pull the id and URL of every failed row for the second pass.
failed_rows = df.loc[idx]
photo_ids_flag = failed_rows['photo_id'].tolist()
photo_image_urls_flag = failed_rows['photo_image_url'].tolist()

In [8]:
# Second pass over the downloads that were flagged as failed.
with ThreadPoolExecutor(max_workers=32) as executor:
    # executor.map has no len(); pass the total so tqdm shows real progress.
    batch_results = list(tqdm(
        executor.map(download_image, photo_ids_flag, photo_image_urls_flag),
        total=len(photo_ids_flag),
    ))
# NOTE(review): batch_results holds the retry flags, but df['results'] is not
# updated in this cell, so it still reflects the first pass only.

128it [00:12,  9.88it/s]
