## Data Processing of Microsoft Cats vs. Dogs Dataset

#### Importing the libraries.
Make sure your Kaggle API credentials are set up so that the dataset can be downloaded.

In [2]:
import os
import cv2
import numpy as np
import kagglehub as kh

#### Downloading the dataset from Kaggle.
`dataset_download` returns the local path of the dataset once it has been downloaded. \
Datasets are usually saved under `~/.cache/kagglehub/datasets`.

In [3]:
# Kaggle identifier of the dataset to fetch.
dataset_handle = "shaunthesheep/microsoft-catsvsdogs-dataset"

# Download the dataset and report the local path returned by kagglehub.
path = kh.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /Users/jimmy/.cache/kagglehub/datasets/shaunthesheep/microsoft-catsvsdogs-dataset/versions/1


#### Creating a function that loads and processes the images: it pads each image with a border to make it square, converts it to grayscale, resizes it, and flattens it.

In [4]:
def process_images(path: str, size: int) -> np.ndarray:
    """
    Loads every image in a folder, pads it to a square, converts it to
    grayscale, resizes it and flattens it into one row of a `NumPy` array.

    Parameters
    ----------
    path : str
        Path to the folder containing the input images.
    size : int
        Target side length (in pixels) of the resized square images.

    Returns
    -------
    np.ndarray
        Array of dtype `uint8` and shape (n_images, size * size), holding one
        flattened grayscale image per row. The shape is (0, size * size) when
        the folder contains no loadable image.

    Notes
    -----
    - Only files ending in .png, .jpg or .jpeg (case-insensitive) are
      considered; files for which `cv2.imread` returns `None` are skipped.
    - File names are processed in sorted order so that the row order is
      reproducible (`os.listdir` returns entries in arbitrary order).
    - Images are padded with a white border to a square before resizing, so
      the content is not stretched.
    """
    white = [255, 255, 255]
    images = []

    for name in sorted(os.listdir(path)):
        if not name.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        img = cv2.imread(os.path.join(path, name))
        if img is None:
            continue

        # Pad the shorter dimension so the image becomes exactly square.
        # When the difference is odd, the extra pixel goes on the second side;
        # padding `diff // 2` on both sides would leave the image 1 px short.
        h, w = img.shape[:2]
        diff = abs(h - w)
        pad_before = diff // 2
        pad_after = diff - pad_before
        if h > w:
            # Taller than wide: add columns on the left and right.
            img = cv2.copyMakeBorder(img, 0, 0, pad_before, pad_after,
                                     cv2.BORDER_CONSTANT, value=white)
        else:
            # Wider than tall (or already square): add rows on top and bottom.
            img = cv2.copyMakeBorder(img, pad_before, pad_after, 0, 0,
                                     cv2.BORDER_CONSTANT, value=white)

        # Convert the image to greyscale
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Resize the image
        img = cv2.resize(img, (size, size))

        # Flatten the image and store it
        images.append(img.flatten())

    if not images:
        # Keep a consistent 2-D uint8 result even when nothing was loaded.
        return np.empty((0, size * size), dtype=np.uint8)

    return np.array(images, dtype=np.uint8)

#### Processing the images.

In [5]:
# Both classes live in sub-folders of the same directory in the downloaded dataset.
pet_images_root = f'{path}/PetImages'

cat_images = process_images(f'{pet_images_root}/Cat', size=128)
dog_images = process_images(f'{pet_images_root}/Dog', size=128)

Corrupt JPEG data: 214 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1153 extraneous bytes before marker 0xd9
Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 226 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9
Corrupt JPEG data: 2230 extraneous bytes before marker 0xd9
Corrupt JPEG data: 254 extraneous bytes before marker 0xd9
Corrupt JPEG data: 399 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9


#### Dividing the dataset into `train_cats`, `train_dogs`, `test_cats`, and `test_dogs` using the same 80/20 split for each class, then concatenating and shuffling the training images and the testing images, respectively.

In [6]:
# Fix the seed so the permutations below are reproducible.
np.random.seed(0)
indices_cats = np.random.permutation(cat_images.shape[0])
indices_dogs = np.random.permutation(dog_images.shape[0])

# The first 80% of each shuffled class is used for training, the rest for testing.
n_train_cats = int(0.8 * cat_images.shape[0])
n_train_dogs = int(0.8 * dog_images.shape[0])

train_idx_cats, test_idx_cats = indices_cats[:n_train_cats], indices_cats[n_train_cats:]
train_idx_dogs, test_idx_dogs = indices_dogs[:n_train_dogs], indices_dogs[n_train_dogs:]

train_cats, test_cats = cat_images[train_idx_cats], cat_images[test_idx_cats]
train_dogs, test_dogs = dog_images[train_idx_dogs], dog_images[test_idx_dogs]

# Cats come first and are labelled 0; dogs follow and are labelled 1.
training_images = np.concatenate([train_cats, train_dogs])
training_labels = np.concatenate([np.zeros(len(train_cats)), np.ones(len(train_dogs))])

testing_images = np.concatenate([test_cats, test_dogs])
testing_labels = np.concatenate([np.zeros(len(test_cats)), np.ones(len(test_dogs))])

# Shuffle images and labels with the same permutation so they stay aligned.
shuffle_train = np.random.permutation(len(training_images))
shuffle_test = np.random.permutation(len(testing_images))

train_images, train_labels = training_images[shuffle_train], training_labels[shuffle_train]
test_images, test_labels = testing_images[shuffle_test], testing_labels[shuffle_test]

#### Saving the cleaned datasets

In [7]:
# Create the output folder if it is missing; `exist_ok=True` replaces the
# separate existence check and avoids the check-then-create race.
os.makedirs('dataset', exist_ok=True)

# Store each split as a compressed archive with `images` and `labels` arrays.
np.savez_compressed('dataset/train.npz', images=train_images, labels=train_labels)
np.savez_compressed('dataset/test.npz', images=test_images, labels=test_labels)

#### Note that the images were not normalized, so that they can be stored as `uint8`, which keeps the saved files small.