In [1]:
import os
import random
import shutil
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

# Root folder of the dataset. The rest of the notebook builds paths by string
# concatenation (src+'Train/', src_folder+'Cat/'), so keep the trailing slash.
src = 'dataset/PetImages/'

# Check if the dataset has been downloaded. If not, direct user to download the dataset first
if not os.path.isdir(src):
    print("""
          Dataset not found in your computer.
          Please follow the instructions in the link below to download the dataset:
          https://raw.githubusercontent.com/PacktPublishing/Neural-Network-Projects-with-Python/master/chapter4/how_to_download_the_dataset.txt
          """)
    # Raise SystemExit explicitly instead of calling quit(): in an IPython
    # kernel quit() only requests a shutdown and does not stop the cell at
    # this line, while in a plain script it exits with status 0 even though
    # the run failed. Raising halts execution here in both environments and
    # reports a non-zero exit status.
    raise SystemExit(1)

from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense, Flatten
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [6]:

def _split_class(src_folder, class_name, excluded_files, train_size):
    """Copy the images of one class into its Train/ and Test/ sub-folders.

    Args:
        src_folder: dataset root, ending with a path separator (paths are
            built by string concatenation).
        class_name: name of the class sub-folder, e.g. 'Cat'.
        excluded_files: file names in that folder that must not be copied.
        train_size: fraction of the remaining images copied to Train/; the
            rest are copied to Test/.
    """
    class_folder = src_folder + class_name + '/'
    _, _, file_names = next(os.walk(class_folder))

    # Filter instead of list.remove(): remove() raises ValueError when an
    # excluded file (e.g. 'Thumbs.db') happens not to be present.
    excluded = set(excluded_files)
    images = [name for name in file_names if name not in excluded]

    num_train = int(train_size * len(images))
    # A set makes the train/test membership check O(1) per image instead of a
    # linear scan of the sampled list for every file.
    train_images = set(random.sample(images, num_train))

    for img in images:
        subset = 'Train/' if img in train_images else 'Test/'
        shutil.copy(src=class_folder + img, dst=src_folder + subset + class_name + '/')


def train_test_split(src_folder, train_size = 0.8):
    """Randomly split the Cat/ and Dog/ images into Train/ and Test/ folders.

    Any existing Train/<class>/ and Test/<class>/ folders under src_folder are
    deleted and recreated, then every image of each class (except a fixed list
    of excluded files) is copied into exactly one of the two subsets. Finally
    remove_exif_data() is called on both new folders.

    Args:
        src_folder: dataset root containing 'Cat/' and 'Dog/'. Must end with a
            path separator because paths are built by string concatenation.
        train_size: fraction of each class's images assigned to the training
            set (default 0.8).
    """
    # Files in each class folder that are skipped when building the split.
    files_to_be_removed = {
        'Cat': ['Thumbs.db', '666.jpg', '835.jpg'],
        'Dog': ['Thumbs.db', '11702.jpg'],
    }

    # Make sure we remove any existing folders and start from a clean slate,
    # then create new empty train and test folders
    for subset in ('Train/', 'Test/'):
        for class_name in ('Cat', 'Dog'):
            target = src_folder + subset + class_name + '/'
            shutil.rmtree(target, ignore_errors=True)
            os.makedirs(target)

    # Randomly assign images to train and test (cats first, then dogs)
    for class_name in ('Cat', 'Dog'):
        _split_class(src_folder, class_name, files_to_be_removed[class_name], train_size)

    # check the copied images for corrupted exif data
    remove_exif_data(src_folder+'Train/')
    remove_exif_data(src_folder+'Test/')

# helper function to detect corrupt exif data in Microsoft's dataset
def remove_exif_data(src_folder):
    """Try to read the EXIF data of every image in src_folder's Cat/ and Dog/.

    Despite its name, this helper does not modify or delete any file: it opens
    each image, asks PIL for its EXIF data and prints the error message when
    that raises ValueError. Other exception types are not caught.

    Args:
        src_folder: folder containing 'Cat/' and 'Dog/' sub-folders. Must end
            with a path separator because paths are built by concatenation.
    """
    for class_name in ('Cat', 'Dog'):
        class_folder = src_folder + class_name + '/'
        _, _, images = next(os.walk(class_folder))
        for img in images:
            try:
                # The context manager closes the underlying file as soon as
                # the check is done instead of leaving it to garbage collection.
                with Image.open(class_folder + img) as imag:
                    imag._getexif()
            except ValueError as err:
                print(err)

In [7]:
# Create the train/test folders if they do not exist already.
# The names must match the capitalised 'Train/' and 'Test/' folders that
# train_test_split() creates: checking for 'train/' never matches on a
# case-sensitive filesystem, which would wipe and re-split the data on every run.
if not (os.path.isdir(src+'Train/') and os.path.isdir(src+'Test/')):
    train_test_split(src)

In [8]:
# Define hyperparameters
INPUT_SIZE = 48 # Images are resized to INPUT_SIZE x INPUT_SIZE; 48 is the reduced setting for when the code takes too long to run
BATCH_SIZE = 16  # batch size passed to both directory generators
STEPS_PER_EPOCH = 200  # number of training batches drawn per epoch
EPOCHS = 3  # number of training epochs

# Build the VGG16 base with ImageNet weights and without its top layers,
# for 3-channel inputs of INPUT_SIZE x INPUT_SIZE.
vgg16 = VGG16(include_top=False, weights='imagenet', input_shape=(INPUT_SIZE,INPUT_SIZE,3))

# Freeze the pre-trained layers
for layer in vgg16.layers:
    layer.trainable = False

Instructions for updating:
Colocations handled automatically by placer.
Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [9]:

# Add a fully connected layer with 1 node at the end
input_ = vgg16.input
output_ = vgg16(input_)
last_layer = Flatten(name='flatten')(output_)
last_layer = Dense(1, activation='sigmoid')(last_layer)
# Use the documented `inputs=` / `outputs=` keyword names; the singular
# `input=` / `output=` spellings are legacy aliases.
model = Model(inputs=input_, outputs=last_layer)

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Both generators only rescale pixel values by 1/255.
training_data_generator = ImageDataGenerator(rescale = 1./255)
testing_data_generator = ImageDataGenerator(rescale = 1./255)

training_set = training_data_generator.flow_from_directory(src+'Train/',
                                                target_size = (INPUT_SIZE, INPUT_SIZE),
                                                batch_size = BATCH_SIZE,
                                                class_mode = 'binary')

test_set = testing_data_generator.flow_from_directory(src+'Test/',
                                             target_size = (INPUT_SIZE, INPUT_SIZE),
                                             batch_size = BATCH_SIZE,
                                             class_mode = 'binary')

# Train only the new output layer (the VGG16 layers were frozen above).
model.fit_generator(training_set, steps_per_epoch = STEPS_PER_EPOCH, epochs = EPOCHS, verbose=1)

# Evaluate on 100 batches drawn from the test generator.
score = model.evaluate_generator(test_set, steps=100)

# Print each metric name next to its value.
for idx, metric in enumerate(model.metrics_names):
    print("{}: {}".format(metric, score[idx]))



Found 19997 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
Instructions for updating:
Use tf.cast instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3
loss: 0.5532941031455993
acc: 0.70125
