# Import

In [1]:
from functools import partial
from pathlib import Path

import torchvision.transforms.functional as F
from joblib import Parallel, delayed
from PIL import Image
from tqdm.notebook import tqdm

# Config

In [2]:
# Root directory for the data used by this notebook (relative to the working directory)
dir_data = Path('data')
# Content images live in the 'content' sub-directory of the data root
dir_content = dir_data.joinpath('content')

# Utility

In [3]:
def parallel(f, it, n_jobs=6):
    """Call `f` on every item of `it` through joblib, with a notebook progress bar.

    Args:
        f: Callable that takes a single item.
        it: Iterable of items; it is wrapped in `tqdm` to display progress.
        n_jobs: Value passed to `joblib.Parallel` as `n_jobs` (default 6).

    Returns:
        None. Whatever `f` returns for each item is discarded.
    """
    runner = Parallel(n_jobs=n_jobs)
    tasks = (delayed(f)(item) for item in tqdm(it))
    runner(tasks)

# Clean

Delete image files that PIL cannot read

In [4]:
# Raw (unprocessed) content images are read from <dir_content>/raw
dir_raw = dir_content/'raw'

In [5]:
def _validate_image(file_img):
    """Delete `file_img` from disk if PIL cannot open and verify it as an image.

    Args:
        file_img: `pathlib.Path` of the file to check.

    Returns:
        None. A message is printed for every file that gets removed.
    """
    try:
        # Image.open is lazy and only parses the header; verify() additionally
        # runs the format's integrity check without decoding the pixel data.
        # The context manager guarantees the file handle is released before
        # any attempt to unlink the file.
        with Image.open(file_img) as img:
            img.verify()
    except Exception:
        # Deliberately not a bare `except:`: a KeyboardInterrupt / SystemExit
        # raised while checking must never cause a valid image to be deleted.
        print(f'{file_img} is corrupt; removing...')
        file_img.unlink()

In [6]:
# Run the check on every regular file found recursively under dir_raw;
# files that fail _validate_image's check are deleted from disk.
parallel(_validate_image, [f for f in dir_raw.rglob('*') if f.is_file()])

HBox(children=(FloatProgress(value=0.0, max=123403.0), HTML(value='')))




# Format

Resize each image while maintaining its aspect ratio, then center-crop it to the target size

In [15]:
def _img_pipeline(file_img, sz):
    """Resize and center-crop one image, saving the result under a new directory.

    The output keeps the original file name and is written to
    `<file_img.parent.parent>/formatted_<sz>/`, which is created if missing.

    Args:
        file_img: `pathlib.Path` of the source image.
        sz: Target size, passed unchanged to torchvision's `resize` and
            `center_crop`.

    Returns:
        None.
    """
    # Open inside a context manager so the source file handle is released even
    # if resizing, cropping or saving raises (this runs for many files per worker).
    with Image.open(file_img) as img_src:
        # Get image
        img = F.resize(img_src, sz)
        img = F.center_crop(img, sz)
        # Save image
        dir_img_new = file_img.parent.parent/('formatted_' + str(sz))
        dir_img_new.mkdir(parents=True, exist_ok=True)
        file_img_new = dir_img_new/file_img.name
        img.save(file_img_new)

In [16]:
def _parallel_img_pipeline(d, sz):
    """Run `_img_pipeline` with size `sz` on every file found recursively under `d`.

    Args:
        d: `pathlib.Path` of the directory to scan.
        sz: Target size forwarded to `_img_pipeline`.
    """
    worker = partial(_img_pipeline, sz=sz)
    files = [path for path in d.rglob('*') if path.is_file()]
    parallel(worker, files)

Do content images first

In [17]:
# Same path as assigned in the Clean section; presumably repeated so the
# Format section can be run on its own (TODO confirm).
dir_raw = dir_content/'raw'

Do 96 first

In [18]:
# Format every file under dir_raw with sz=96; outputs go to a 'formatted_96'
# directory two levels above each source file (see _img_pipeline).
_parallel_img_pipeline(dir_raw, 96)

HBox(children=(FloatProgress(value=0.0, max=123403.0), HTML(value='')))




Next do 256

In [19]:
# Format every file under dir_raw with sz=256; outputs go to a 'formatted_256'
# directory two levels above each source file (see _img_pipeline).
_parallel_img_pipeline(dir_raw, 256)

HBox(children=(FloatProgress(value=0.0, max=123403.0), HTML(value='')))


