In [41]:
from concurrent.futures import ThreadPoolExecutor
import urllib.request
import concurrent
import pandas as pd
import os
from tqdm import tqdm
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, InterpolationMode
import numpy as np

### Get Missing Images

In [2]:
def get_file_path_from_url(url):
    """Derive the flat local file name used for an image URL.

    The URL is split on '/', the first five pieces are discarded and the
    remaining pieces are glued back together with '-'. A URL with five or
    fewer pieces yields an empty string.
    """
    segments = url.split('/')
    trailing_segments = segments[5:]
    return '-'.join(trailing_segments)

In [42]:
def get_missing_images():
    """Return the image URLs from products.csv that are not yet in 'assets'.

    Reads the 'searchImage' column of 'products.csv', maps every URL to its
    local file name with get_file_path_from_url and keeps the URLs whose file
    is absent from the 'assets' directory. Each URL appears at most once and
    the order of the CSV is preserved.

    Returns:
        list: URLs that still have to be downloaded.
    """
    # Empty CSV cells are read as NaN (a float), which has no .split();
    # drop them before deriving file names.
    all_images = pd.read_csv('products.csv')['searchImage'].dropna().tolist()
    downloaded_images = set(os.listdir('assets'))

    print('Download Images: ', len(downloaded_images))

    # dict.fromkeys removes duplicate URLs while keeping first-seen order.
    missing_images = list(dict.fromkeys(
        url for url in all_images
        if get_file_path_from_url(url) not in downloaded_images
    ))

    print('Missing images: ', len(missing_images))
    return missing_images
# Runs when the cell executes: collect the URLs that still need downloading.
missing_images = get_missing_images()

### Download Images

In [4]:
def download_image(image_url):
    """Download one image into the 'assets' directory.

    The target file name is derived from the URL with get_file_path_from_url.
    The data is first written to a temporary '.part' file and only renamed to
    the final name once the transfer has finished, so an interrupted download
    never leaves a truncated image under the final name (a file under the
    final name is treated as already downloaded by get_missing_images).

    Args:
        image_url: URL of the image to fetch.

    Raises:
        Any exception raised by urllib.request.urlretrieve or os.replace;
        the temporary file is removed before the exception propagates.
    """
    print(image_url)
    file_name = 'assets/' + get_file_path_from_url(image_url)
    partial_name = file_name + '.part'
    try:
        urllib.request.urlretrieve(image_url, partial_name)
        # Rename only after a complete transfer.
        os.replace(partial_name, file_name)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a half-written file behind.
        if os.path.exists(partial_name):
            os.remove(partial_name)
        raise

def multithreaded_download(images, num_threads):
    """Download all images concurrently with a pool of worker threads.

    Every URL is handed to download_image on a ThreadPoolExecutor. A failing
    download is reported and counted, and the remaining downloads continue.
    The progress bar advances as downloads complete.

    Args:
        images: iterable of image URLs.
        num_threads: maximum number of worker threads.
    """
    failed = 0
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Remember which URL each future belongs to so failures can be named.
        future_to_url = {
            executor.submit(download_image, image): image for image in images
        }
        completed = concurrent.futures.as_completed(future_to_url)
        for future in tqdm(completed, total=len(future_to_url)):
            try:
                # Futures yielded by as_completed are already finished, so
                # result() returns (or raises) immediately; no timeout needed.
                future.result()
            except Exception as e:
                failed += 1
                print(f'Failed to download {future_to_url[future]}: {e}')
    if failed:
        print('Failed downloads: ', failed)

def singlethreaded_download(images):
    """Download the given image URLs one after another in the calling thread.

    Each URL is passed to download_image in iteration order; an exception
    raised by download_image stops the loop and propagates to the caller.
    """
    for image_url in images:
        download_image(image_url)
# Uncomment exactly one of the calls below to fetch the missing images.
# multithreaded_download(missing_images, 30)
# singlethreaded_download(missing_images)

In [None]:
def find_bad_images(directory='assets-224'):
    """Return the names of files in `directory` that cannot be decoded as images.

    Every file is opened with PIL and converted to a NumPy array; a file for
    which this raises is reported and collected.

    Args:
        directory: folder to scan. Defaults to 'assets-224'.

    Returns:
        list: file names (not paths) that failed to load.
    """
    bad_images = []
    for image in tqdm(os.listdir(directory)):
        try:
            # The context manager closes the file even when decoding fails.
            with Image.open(os.path.join(directory, image)) as img:
                # Converting to an array forces the pixel data to be read.
                np.array(img)
        except Exception as e:
            print(f'{image} : {e}')
            bad_images.append(image)
    print('Number of Bad Images: ', len(bad_images))
    return bad_images
# Runs when the cell executes; get_image_files() reads this module-level list.
bad_images = find_bad_images()

### Resize Images

In [45]:
def resize_images(files, input_folder='assets', output_folder='assets-224'):
    """Resize and centre-crop a batch of images to 224x224.

    Each file is read from `input_folder`, passed through
    Resize(224, bicubic) followed by CenterCrop((224, 224)), and saved under
    the same name in `output_folder`. A file that fails at any step is
    reported and skipped; the rest of the batch is still processed.

    Args:
        files: iterable of file names (not paths) inside `input_folder`.
        input_folder: folder holding the source images.
        output_folder: folder receiving the resized images.
    """
    transform = Compose([
        Resize(224, interpolation=InterpolationMode.BICUBIC),
        CenterCrop(size=(224, 224))
    ])
    for file in files:
        try:
            # The context manager closes the source file even when the
            # transform or the save raises (e.g. on a truncated image).
            with Image.open(os.path.join(input_folder, file)) as source_img:
                resized_img = transform(source_img)
                resized_img.save(os.path.join(output_folder, file))
        except Exception as e:
            print(e, '===', file)

def get_image_files():
    """List the files in 'assets' that still need to be resized.

    A file is skipped when a file of the same name already exists in
    'assets-224' or when its name appears in the module-level `bad_images`.

    Returns:
        list: file names (not paths), in os.listdir('assets') order.
    """
    # Merge both exclusion sources into one set so every membership test is
    # constant time; looking names up in the `bad_images` list directly would
    # be a linear scan per file.
    skipped = set(os.listdir('assets-224')).union(bad_images)
    return [name for name in os.listdir('assets') if name not in skipped]

def multithreaded_resize_images(num_threads=10, batch_size=200):
    """Resize all pending images on a thread pool, one batch per task.

    The pending file names come from get_image_files(); they are cut into
    consecutive slices of at most `batch_size` names and each slice is
    submitted to resize_images. Leaving the `with` block waits for all
    submitted batches to finish.

    Args:
        num_threads: maximum number of worker threads.
        batch_size: number of file names handed to each resize_images call.
    """
    pending_files = get_image_files()
    print('received image files')
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
        batch_starts = range(0, len(pending_files), batch_size)
        for start in batch_starts:
            batch = pending_files[start:start + batch_size]
            pool.submit(resize_images, batch)

# Resize every pending image with the defaults (10 threads, batches of 200 files).
multithreaded_resize_images()

received image files
image file is truncated (2 bytes not processed) === 16772718-2022-10-18-1190a19b-76dd-44e5-8f70-9ff68bf773bd1666082719063-Selvia-Women-Mustard-Embroidered-Sweatshirt-931666082718423-1.jpg
image file is truncated (3 bytes not processed) === 23397776-2023-5-29-858880b4-7bd8-4d08-a22b-ad07135671a01685365188380FBAROrangeCottonTop1.jpg
image file is truncated (3 bytes not processed) === 23183786-2023-5-18-f45260ee-4587-4675-b562-aa0685362a5b1684358765329IndiWeavesPackOf3PrintedPureCottonT-shirts1.jpg
image file is truncated (2 bytes not processed) === 21640448-2023-1-20-7ebf47ef-3805-44c7-89c6-0cbc436e26dd1674219094245KASSUALLYBlackSheathDress1.jpg
image file is truncated (2 bytes not processed) === 22764022-2023-4-24-d071ccc6-cb06-4a05-a85b-d70c78a2a5d21682319616423-Nejo-Print-Puff-Sleeves-Maternity-A-Line-Midi-Dress-With-Tie-1.jpg
image file is truncated (4 bytes not processed) === 23528980-2023-6-5-4e84b0b8-1842-436a-9478-a9e4fbccafb81685969028628STATUSMANTRAWomenRed