# Import

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from joblib import Parallel, delayed
from tqdm import tqdm
from PIL import Image
import pdb

# Config

In [3]:
# Root directory (relative to the working directory) that holds the dataset
# folder and receives the exported image folders.
path_data = Path('data')

In [4]:
!ls {path_data}

cifar-10-batches-py


# Load pickle

In [5]:
# Folder containing the pickled `data_batch_*` files read below.
path_cifar = path_data/'cifar-10-batches-py'

In [6]:
def unpickle(file):
    """Load and return the single object stored in a pickle file.

    Parameters
    ----------
    file : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        The unpickled object. Loading uses ``encoding='bytes'``, so 8-bit
        strings written by Python 2 come back as ``bytes`` rather than
        ``str`` (hence keys such as ``b'data'`` when indexing the result).

    Raises
    ------
    OSError
        If the file cannot be opened.
    pickle.UnpicklingError
        If the file content is not a valid pickle.
    """
    import pickle

    # SECURITY: unpickling can execute arbitrary code. Only call this on
    # files obtained from a trusted source.
    with open(file, 'rb') as fo:
        # Return directly instead of binding to a local called `dict`,
        # which would shadow the builtin inside this function.
        return pickle.load(fo, encoding='bytes')

In [7]:
# Load the batch files data_batch_1 .. data_batch_4 into a list of dicts.
# NOTE(review): range(1, 5) stops at batch 4; if a data_batch_5 file exists in
# this folder it is not loaded -- confirm that this is intentional.
data_batches = [unpickle(path_cifar / f'data_batch_{i}') for i in range(1, 5)]

Extract the labels, pixel data, and filenames from the loaded batches. Note that the stored filenames use the `.png` extension by default.

In [8]:
# Labels of all loaded batches joined into one 1-D array (one entry per image).
labels = np.concatenate([batch[b'labels'] for batch in data_batches])
labels.shape

(40000,)

In [9]:
# Pixel rows of all loaded batches stacked along axis 0: one flat
# 3072-value row per image.
data = np.concatenate([batch[b'data'] for batch in data_batches])
data.shape

(40000, 3072)

In [10]:
# Filenames (stored as bytes) of all loaded batches, in the same order as
# `labels` and `data`.
filenames = np.concatenate([batch[b'filenames'] for batch in data_batches])
filenames.shape

(40000,)

# Utility functions

In [11]:
def format_img(img):
    """Convert a flat image row into a height x width x channel array.

    Parameters
    ----------
    img : numpy.ndarray
        Array with 3 * 32 * 32 = 3072 elements, laid out as three
        consecutive 32x32 planes.

    Returns
    -------
    numpy.ndarray
        Array of shape (32, 32, 3) in which the plane axis has been moved
        to the last position.
    """
    planes = img.reshape(3, 32, 32)
    # (plane, row, col) -> (row, col, plane)
    return planes.transpose(1, 2, 0)

In [12]:
def save_image(idx, path_root, ext, sz=None):
    """Write image number `idx` to ``path_root/<label>/<name>.<ext>``.

    The image, its label and its original filename are read from the
    notebook-level arrays `data`, `labels` and `filenames`.

    Parameters
    ----------
    idx : int
        Position of the image in `data`, `labels` and `filenames`.
    path_root : pathlib.Path
        Output root; one sub-directory per label is created beneath it.
    ext : str
        Target extension without the leading dot. ``'raw'`` dumps the pixel
        array straight to disk; any other value is saved through PIL.
    sz : tuple, optional
        When given, the image is resized with ``Image.resize(sz)`` before it
        is written. ``None`` keeps the original size.
    """
    pixels = format_img(data[idx])
    if sz is not None:
        # Round-trip through PIL to resize, then back to an array.
        pixels = np.array(Image.fromarray(pixels).resize(sz))

    # Keep the original file stem, swap in the requested extension.
    out_name = Path(filenames[idx].decode('UTF-8')).with_suffix('.' + ext)

    out_dir = path_root / str(labels[idx])
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / out_name

    if ext == 'raw':
        # Headerless dump of the array contents, cast to int8 first.
        pixels.astype('int8').tofile(target)
        return

    # compress_level=0 is passed for every PIL-written format.
    Image.fromarray(pixels).save(target, compress_level=0)

# Save small size images

In [13]:
# Export every image at its original size (no `sz` passed), once per output
# format, using 8 parallel workers.
for ext in ('png', 'tif', 'jpg', 'raw'):
    root_path = path_data / 'small_size' / ext
    Parallel(n_jobs=8)(
        delayed(save_image)(i, root_path, ext)
        for i in tqdm(range(len(filenames)))
    );

100%|██████████| 40000/40000 [00:02<00:00, 19082.33it/s]
100%|██████████| 40000/40000 [00:02<00:00, 16768.72it/s]
100%|██████████| 40000/40000 [00:01<00:00, 24485.31it/s]
100%|██████████| 40000/40000 [00:01<00:00, 27985.31it/s]


# Save large size images

In [14]:
# Export every image resized to 512x512, once per output format, using
# 8 parallel workers.
for ext in ('png', 'tif', 'jpg', 'raw'):
    root_path = path_data / 'large_size' / ext
    Parallel(n_jobs=8)(
        delayed(save_image)(i, root_path, ext, (512, 512))
        for i in tqdm(range(len(filenames)))
    );

100%|██████████| 40000/40000 [00:46<00:00, 869.26it/s] 
100%|██████████| 40000/40000 [00:49<00:00, 808.90it/s]
100%|██████████| 40000/40000 [00:38<00:00, 1048.31it/s]
100%|██████████| 40000/40000 [00:42<00:00, 938.81it/s] 
