In [1]:
from concurrent.futures import ThreadPoolExecutor
import urllib.request
import concurrent
import pandas as pd
import os
from tqdm import tqdm
from PIL import Image

### Get Missing Images

In [2]:
def get_file_path_from_url(url):
    """Flatten the tail of an image URL into a single file name.

    The URL is split on '/', the first five pieces are discarded and the
    remaining pieces are glued together with '-'. A URL with five or fewer
    pieces yields an empty string.
    """
    pieces = url.split('/')
    tail = pieces[5:]
    return '-'.join(tail)

In [3]:
def get_missing_images():
    """Return the distinct image URLs from products.csv that are not on disk yet.

    A URL counts as downloaded when the file name derived from it by
    get_file_path_from_url is present in the 'assets' directory. The result
    keeps the order in which URLs first appear in the 'searchImage' column.
    Two summary counts are printed along the way.
    """
    catalogue_urls = pd.read_csv('products.csv')['searchImage'].tolist()
    on_disk = set(os.listdir('assets'))

    print('Download Images: ', len(on_disk))

    # A dict keeps first-seen order while collapsing repeated URLs.
    pending = {}
    for candidate in catalogue_urls:
        if get_file_path_from_url(candidate) not in on_disk:
            pending.setdefault(candidate, None)

    still_missing = list(pending)
    print('Missing images: ', len(still_missing))
    return still_missing
# Scan products.csv against the assets folder once and keep the resulting URL
# list in a notebook-level variable (the download calls below take it as input).
missing_images = get_missing_images()

Download Images:  307624
Missing images:  7


### Download Images

In [4]:
def download_image(image_url):
    """Download one image into 'assets/' under its flattened file name.

    The data is written to a '<name>.part' file next to the destination and
    only moved into place once the transfer has finished. Without this, an
    interrupted transfer would leave a truncated file under the final name,
    and get_missing_images would treat it as already downloaded.

    Args:
        image_url: URL of the image to fetch.

    Raises:
        Whatever urllib.request.urlretrieve raises (for example
        urllib.error.URLError or urllib.error.ContentTooShortError). The
        partial file is removed before the exception propagates.
    """
    print(image_url)
    file_name = 'assets/' + get_file_path_from_url(image_url)
    partial_name = file_name + '.part'
    try:
        urllib.request.urlretrieve(image_url, partial_name)
        # Same directory, so the move is a rename and never exposes a half-written file.
        os.replace(partial_name, file_name)
    finally:
        # After a successful replace the partial file no longer exists; this
        # only fires when the download or the move failed.
        if os.path.exists(partial_name):
            os.remove(partial_name)

def multithreaded_download(images, num_threads):
    """Download every URL in `images` concurrently on a thread pool.

    A download that raises is reported and skipped, so one bad URL does not
    stop the results of the remaining downloads from being processed. The
    progress bar advances as downloads finish rather than as they are queued.

    Args:
        images: iterable of image URLs.
        num_threads: maximum number of worker threads.
    """
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        url_by_future = {
            executor.submit(download_image, image): image for image in images
        }
        finished = concurrent.futures.as_completed(url_by_future)
        for future in tqdm(finished, total=len(url_by_future)):
            try:
                # as_completed only yields finished futures, so result() returns
                # (or re-raises the worker's exception) immediately; a timeout
                # here could never trigger.
                future.result()
            except Exception as error:
                print('Failed to download', url_by_future[future], '-', error)

def singlethreaded_download(images):
    """Fetch each URL in `images` one after another on the calling thread.

    URLs are processed in iteration order; an exception from download_image
    propagates and stops the loop.
    """
    for image_url in images:
        download_image(image_url)
# multithreaded_download(missing_images, 30)
# singlethreaded_download(missing_images)

### Resize Images

In [5]:
def resize_images(files, size=(224, 224), input_folder='assets', output_folder='assets-224-224'):
    """Resize the named images and save them under the same names in another folder.

    Args:
        files: iterable of file names (not paths) located in `input_folder`.
        size: target size passed to PIL's Image.resize.
        input_folder: directory the originals are read from.
        output_folder: directory the resized copies are written to; it is not
            created here.

    Raises:
        Any error from opening, resizing or saving an image propagates and
        stops processing of the remaining files.
    """
    for file in files:
        # The context manager closes the source file handle as soon as the
        # resized copy exists, instead of leaving it to garbage collection.
        with Image.open(os.path.join(input_folder, file)) as source:
            resized = source.resize(size, Image.BICUBIC)
        resized.save(os.path.join(output_folder, file))

def get_image_files():
    """List the file names in 'assets' that have no counterpart in 'assets-224-224'.

    Names are compared exactly; the returned list follows the order in which
    os.listdir reports the 'assets' entries.
    """
    already_resized = frozenset(os.listdir('assets-224-224'))
    pending = []
    for name in os.listdir('assets'):
        if name in already_resized:
            continue
        pending.append(name)
    return pending

def multithreaded_resize_images(num_threads=10, batch_size=200):
    """Resize all not-yet-processed images in batches on a thread pool.

    The pending file names come from get_image_files and are handed to
    resize_images in slices of `batch_size`. Each batch's outcome is checked,
    so a batch that raised is reported instead of failing silently.

    Args:
        num_threads: maximum number of worker threads.
        batch_size: number of files passed to each resize_images call.
    """
    image_files = get_image_files()
    print('received image files')
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Slicing clamps at the end of the list, so the last batch needs no
        # explicit bound.
        batch_start_by_future = {
            executor.submit(resize_images, image_files[start:start + batch_size]): start
            for start in range(0, len(image_files), batch_size)
        }
        for future in concurrent.futures.as_completed(batch_start_by_future):
            try:
                # result() re-raises whatever the worker raised; without this
                # call the exception would be dropped with the future.
                future.result()
            except Exception as error:
                start = batch_start_by_future[future]
                print('Resize batch starting at index', start, 'failed:', error)