# 1. Imports

In [1]:
import os
import zipfile
import random
import shutil
import requests
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile
import matplotlib.pyplot as plt

# 2. Download Data

In [3]:
# Shell out to wget to save the Cats-vs-Dogs archive as /tmp/cats-and-dogs.zip.
# NOTE(review): --no-check-certificate disables TLS certificate validation for
# this download; consider dropping the flag if the host's certificate verifies
# in your environment.
!wget --no-check-certificate \
    "https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip" \
    -O "/tmp/cats-and-dogs.zip"

local_zip = '/tmp/cats-and-dogs.zip'

# Unpack the archive into ./data. The context manager guarantees the archive
# handle is closed even when extraction fails part-way (e.g. truncated
# download or full disk), which the manual open/close pair did not.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('./data')

--2023-11-11 02:40:05--  https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip
Resolving download.microsoft.com (download.microsoft.com)... 23.42.152.252
Connecting to download.microsoft.com (download.microsoft.com)|23.42.152.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 824887076 (787M) [application/octet-stream]
Saving to: ‘/tmp/cats-and-dogs.zip’


2023-11-11 02:40:19 (56.2 MB/s) - ‘/tmp/cats-and-dogs.zip’ saved [824887076/824887076]



# 3. Remove .db files

In [4]:
# The archive is extracted into ./data (relative to the notebook), so the
# image folders live under ./data/PetImages, not under the filesystem root.
source_path = './data/PetImages'

# One sub-folder per class inside the extracted dataset.
source_path_dogs = os.path.join(source_path, 'Dog')
source_path_cats = os.path.join(source_path, 'Cat')

# Deletes every regular file under ./data/PetImages/ whose name does not match
# "*.jpg" (the dataset bundles two .db files alongside the images).
!find ./data/PetImages/ -type f ! -name "*.jpg" -exec rm {} +

# 4. Format Data into Training and Validation Data

In [5]:
root_dir = './data/cats-v-dogs'

# Wipe any tree left over from a previous run so this cell can be re-executed
# and the split always starts from empty directories.
if os.path.exists(root_dir):
    shutil.rmtree(root_dir)

# Creates directories for the training and validation sets
def create_train_val_dirs(root_path):
    """Create the per-class training and validation folders under ``root_path``.

    The resulting layout is::

        root_path/training/cats
        root_path/training/dogs
        root_path/validation/cats
        root_path/validation/dogs

    Args:
        root_path: Directory that will hold the tree. It is created (together
            with any missing parents) if it does not exist yet.

    Returns:
        None. The function only creates directories.

    Raises:
        OSError: If a path component exists but is not a directory, or the
            directories cannot be created (e.g. missing permissions).
    """
    for split in ('training', 'validation'):
        for label in ('cats', 'dogs'):
            # makedirs also creates root_path and the split folder on the way;
            # exist_ok=True makes repeated calls a no-op instead of raising
            # FileExistsError.
            os.makedirs(os.path.join(root_path, split, label), exist_ok=True)

# Build the training/validation directory tree under root_dir
create_train_val_dirs(root_dir)

In [6]:
def split_data(source_dir, training_dir, validation_dir, split_size):
    """Randomly copy the files of ``source_dir`` into a training and a validation folder.

    Zero-length files are reported on stdout and skipped. Entries that are not
    regular files (e.g. sub-directories) are skipped silently, because they
    cannot be copied with ``copyfile``. The source files are copied, never
    moved or modified.

    Args:
        source_dir: Directory containing the files to split.
        training_dir: Existing directory that receives the training share.
        validation_dir: Existing directory that receives the remaining files.
        split_size: Fraction of the usable files that goes to ``training_dir``.
            The training count is ``int(n * split_size)``, i.e. rounded down.

    Returns:
        None. The result is the set of files copied into the two target folders.

    Raises:
        OSError: If ``source_dir`` cannot be listed or a file cannot be copied
            (for instance when a target directory does not exist).
    """
    non_zero_files = []

    # Single pass over the listing: one size lookup per file.
    for file in os.listdir(source_dir):
        path = os.path.join(source_dir, file)

        if not os.path.isfile(path):
            continue

        if os.path.getsize(path) > 0:
            non_zero_files.append(file)
        else:
            print(f'{file} is zero length, so ignoring')

    # random.sample over the full population yields a shuffled copy and leaves
    # the original list untouched.
    shuffled = random.sample(non_zero_files, len(non_zero_files))
    num_training = int(len(shuffled) * split_size)

    for i, file in enumerate(shuffled):
        # The first num_training shuffled files are the training share.
        target_dir = training_dir if i < num_training else validation_dir
        copyfile(os.path.join(source_dir, file), os.path.join(target_dir, file))

In [7]:
# Split each class 90/10 between its training and validation folder.
for class_source, class_training, class_validation in (
    ("./data/PetImages/Cat/", "./data/cats-v-dogs/training/cats/", "./data/cats-v-dogs/validation/cats/"),
    ("./data/PetImages/Dog/", "./data/cats-v-dogs/training/dogs/", "./data/cats-v-dogs/validation/dogs/"),
):
    split_data(class_source, class_training, class_validation, .9)

# Count what ended up in each folder as a sanity check on the split.
num_training_cats = len(os.listdir('./data/cats-v-dogs/training/cats/'))
num_training_dogs = len(os.listdir('./data/cats-v-dogs/training/dogs/'))
num_validation_cats = len(os.listdir('./data/cats-v-dogs/validation/cats/'))
num_validation_dogs = len(os.listdir('./data/cats-v-dogs/validation/dogs/'))

print(f"There are {num_training_cats} images of cats for training")
print(f"There are {num_training_dogs} images of dogs for training")
print(f"There are {num_validation_cats} images of cats for validation")
print(f"There are {num_validation_dogs} images of dogs for validation")

666.jpg is zero length, so ignoring
11702.jpg is zero length, so ignoring
There are 11249 images of cats for training
There are 11249 images of dogs for training
There are 1250 images of cats for validation
There are 1250 images of dogs for validation


# 5. Pre-processing

In [2]:
def train_val_generators(training_dir, validation_dir,
                         train_batch_size=128,
                         validation_batch_size=32,
                         target_size=(150, 150)):
    """Build the Keras directory iterators for training and validation.

    Training images are rescaled by 1/255 and randomly augmented (rotation,
    shifts, shear, zoom, horizontal flip). Validation images are only rescaled,
    so validation metrics are computed on unaugmented images.

    Args:
        training_dir: Directory passed to ``flow_from_directory`` for training.
        validation_dir: Directory passed to ``flow_from_directory`` for validation.
        train_batch_size: Batch size of the training iterator (default 128).
        validation_batch_size: Batch size of the validation iterator (default 32).
        target_size: ``(height, width)`` every image is resized to; the same
            value is used for both iterators (default ``(150, 150)``).

    Returns:
        A ``(train_generator, validation_generator)`` tuple.
    """
    # Augmentation is applied to the training stream only.
    train_datagen = ImageDataGenerator(rescale=1./255.,
                                       rotation_range=40,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=0.2,
                                       zoom_range=0.2,
                                       horizontal_flip=True,
                                       fill_mode='nearest')

    train_generator = train_datagen.flow_from_directory(directory=training_dir,
                                                        batch_size=train_batch_size,
                                                        class_mode='binary',
                                                        target_size=target_size)

    # Validation data gets the same rescaling but no random transforms.
    validation_datagen = ImageDataGenerator(rescale=1./255.)

    validation_generator = validation_datagen.flow_from_directory(directory=validation_dir,
                                                                  batch_size=validation_batch_size,
                                                                  class_mode='binary',
                                                                  target_size=target_size)

    return train_generator, validation_generator

In [3]:
# Build both generators from the training/validation folders under ./data/cats-v-dogs
train_generator, validation_generator = train_val_generators('./data/cats-v-dogs/training/', './data/cats-v-dogs/validation/')

Found 22498 images belonging to 2 classes.
Found 2500 images belonging to 2 classes.


# 6. Create the Model

In [None]:
def create_model():
    model = tf.keras.