In [1]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from PIL import Image
import glob
import os

BASE_IN = 'data/unsplash-raw'
BASE_OUT = 'data/unsplash-512'
if not os.path.exists(BASE_OUT):
    os.makedirs(BASE_OUT)
DIM = (512, 512) # We are using the thumbnail method, which sets the max dimension to 512 while maintaining the original aspect ratio
Image.MAX_IMAGE_PIXELS = None # This prevents PIL from crashing when loading a large image

In [2]:
def resize_image(fname, photo_id, base_out=BASE_OUT, dim=DIM):
    try:
        img = Image.open(fname)
        img.thumbnail(dim)
        img.save(f'{base_out}/{photo_id}')
        result = True
    except:
        result = False
    return result

Get a list of all the files to resize and then run through a Thread pool.

In [3]:
fnames = glob.glob(f'{BASE_IN}/*.png')
photo_ids = [fname.split('/')[-1] for fname in fnames]
len(fnames)

25000

In [4]:
with ThreadPoolExecutor(max_workers=32) as executor:
    results = list(tqdm(executor.map(resize_image, fnames, photo_ids)))

25000it [15:32, 26.80it/s]


The earlier notebook had no controls for missing/broken links, and as a result not every file from data/unsplash_raw/ will have a corresponding resized image in data/unsplash-512/

This is a known issue with the dataset and is beyond the control of the end-user, as the license does not allow for sharing of the scraped photos.