In [None]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import h5py
from tqdm import tqdm
%matplotlib inline

# Set up paths

In [None]:
# TODO: edit these two values to match your machine.
# `dataset_root` is the dataset directory ("~" is allowed);
# `images_folder` is the sub-folder of it that holds the images.
dataset_root, images_folder = '~/data/datasets/dogs-vs-cats/', 'train'

# DON'T TOUCH THIS: expand "~" to the home directory, then build the
# full path of the images folder underneath the expanded root.
dataset_root = os.path.expanduser(dataset_root)
images_folder = os.path.join(dataset_root, images_folder)

# Inspect

In [None]:
# Number of images of each class that go into the training split; everything
# after that goes into the validation split.
n_train_per_class = 11000

# Full path of every entry in the images folder. os.listdir() returns entries
# in arbitrary order, so sort them to make the positional split below
# reproducible from one run to the next.
images_list = sorted(os.path.join(images_folder, p) for p in os.listdir(images_folder))

# Assign a class from the file name. (This previously iterated over an
# undefined `train_list`, which raised NameError on a fresh kernel.)
cat_list = [i for i in images_list if 'cat' in os.path.basename(i)]
dog_list = [i for i in images_list if 'dog' in os.path.basename(i)]

train_cats = cat_list[:n_train_per_class]
train_dogs = dog_list[:n_train_per_class]
val_cats = cat_list[n_train_per_class:]
val_dogs = dog_list[n_train_per_class:]

print('Found %d cats (%d for training and %d for validation)' % (len(cat_list), len(train_cats), len(val_cats)))
print('Found %d dogs (%d for training and %d for validation)' % (len(dog_list), len(train_dogs), len(val_dogs)))

In [None]:
# TODO: change this if you want
rows, cols = 10, 30
inch_per_img = 0.7
tile = 32  # side length, in pixels, of every thumbnail in the collage

# Shuffle all image indices and keep at most rows * cols of them, so a folder
# with fewer images than grid cells no longer raises IndexError.
indices = np.arange(len(images_list))
np.random.shuffle(indices)
chosen = indices[:rows * cols]

# Pre-allocate the whole canvas instead of growing it with repeated
# np.concatenate calls; grid cells without an image stay black.
collage = np.zeros((rows * tile, cols * tile, 3), dtype=np.uint8)
for n, idx in enumerate(chosen):
    img = cv2.imread(images_list[idx])
    if img is None:
        # cv2.imread signals an unreadable file by returning None; fail with
        # the offending path rather than a cryptic cvtColor error.
        raise IOError('Could not read image: %s' % images_list[idx])
    img = cv2.resize(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), (tile, tile))
    i, j = divmod(n, cols)  # grid position, filled row by row
    collage[i * tile:(i + 1) * tile, j * tile:(j + 1) * tile] = img

# Clamp to at least one inch so a tiny grid cannot produce a zero-sized figure.
plt.figure(figsize=(max(1, int(inch_per_img * cols)), max(1, int(inch_per_img * rows))))
plt.axis('off')
# No colormap argument: the collage is RGB, for which imshow ignores cmap.
plt.imshow(collage)

# Saving

In [None]:
def _load_thumbnail(fname, size=(32, 32)):
    """Read the image at `fname`, convert it from BGR to RGB and resize it to `size`.

    Raises:
        IOError: if OpenCV cannot read the file (cv2.imread returned None).
    """
    img = cv2.imread(fname)
    if img is None:
        raise IOError('Could not read image: %s' % fname)
    return cv2.resize(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), size)


def _write_split(h5_path, split_name, class_files):
    """Write one split to a new HDF5 file holding datasets 'X' and 'Y'.

    Args:
        h5_path: destination file; it is opened with mode 'w'.
        split_name: prefix for the progress-bar description (e.g. 'train').
        class_files: list of (plural class name, list of file paths); the
            position of a pair in the list is the label stored for its images.

    Returns:
        The number of samples written.
    """
    total = sum(len(fnames) for _, fnames in class_files)
    # The context manager closes the file even if an image fails to load.
    with h5py.File(h5_path, 'w') as h5_file:
        x_data = h5_file.create_dataset('X', (total, 32, 32, 3), dtype=np.uint8)
        y_data = h5_file.create_dataset('Y', (total, 2), dtype=np.uint8)
        counter = 0
        for label, (class_name, fnames) in enumerate(class_files):
            for fname in tqdm(fnames, desc='%s %s' % (split_name, class_name), ascii=True):
                x_data[counter] = _load_thumbnail(fname)
                # NOTE(review): Y has two columns but receives a scalar, so both
                # columns of the row hold the class index (cat -> [0, 0],
                # dog -> [1, 1]). Kept as in the original; confirm whether
                # one-hot encoding was intended before changing it.
                y_data[counter] = label
                counter += 1
    return total


# Each split goes to its own file, sized by its own sample count. Previously
# the validation datasets were sized with the training total and x_val / y_val
# pointed at the training file, so validation samples overwrote the start of
# the training data while val.h5 stayed empty.
train_total = _write_split(os.path.join(dataset_root, 'train.h5'), 'train',
                           [('cats', train_cats), ('dogs', train_dogs)])
val_total = _write_split(os.path.join(dataset_root, 'val.h5'), 'val',
                         [('cats', val_cats), ('dogs', val_dogs)])

# Class names, one per line, in label order.
with open(os.path.join(dataset_root, 'class_names.txt'), 'w') as f:
    f.write('cat\ndog\n')