**Using InceptionV3 and data augmentation. Script for data splitting is included.**

In [1]:
import os
import zipfile
import random
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile

In [2]:
print(tf.__version__)
from tensorflow.keras.applications.inception_v3 import InceptionV3
local_weights_file = r"D:\ML\Models\Inception\inception_v3.h5"
pretrained_model = InceptionV3(input_shape=(150, 150, 3),
                              include_top=False,
                              weights=None)
pretrained_model.load_weights(local_weights_file)

W0901 21:52:01.380081 23536 deprecation.py:506] From C:\Users\G3NZ\Anaconda3\envs\OpenCV-master-py3\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


1.14.0


In [3]:
for layer in pretrained_model.layers:
    layer.trainable = False
pretrained_model.summary()

Model: "inception_v3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 150, 150, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 74, 74, 32)   864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 74, 74, 32)   96          conv2d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 74, 74, 32)   0           batch_normalization[0][0]        
_______________________________________________________________________________________

In [4]:
last_layer = pretrained_model.get_layer('mixed7')
print('last layer output shape: ', last_layer.output_shape)
last_output = last_layer.output

# Flatten the output layer to 1 dimension
x = layers.Flatten()(last_output)
# Add a fully connected layer with 1,024 hidden units and ReLU activation
x = layers.Dense(1024, activation='relu')(x)
# Add a dropout rate of 0.2
x = layers.Dropout(0.2)(x)                  
# Add a final sigmoid layer for classification
x = layers.Dense  (1, activation='sigmoid')(x)   
model = Model( pretrained_model.input, x) 

last layer output shape:  (None, 7, 7, 768)


In [7]:
base_dir = r"D:\ML\Datasets\cats_and_dogs\splitted"
dataset_splits = ["train", "validation"]
dataset_classes = ["cats", "dogs"]

# getting dataset split paths
dataset_split_paths = {}
for word in dataset_splits:
    dataset_split_paths[word] = os.path.join(base_dir, word)         
print("\ndataset_split_paths Dictionary:")
print("----------------------------------")
print(dataset_split_paths)


splits_classpath_dic = {}
# getting paths of classes within splits
for word in dataset_splits:
    splits_classpath_dic[word] = list()
    for dataset_class in dataset_classes:
        splits_classpath_dic[word].append(os.path.join(dataset_split_paths[word], dataset_class))

print("\nsplits_classpath_dic Dictionary:")
print("----------------------------------")
print(splits_classes_dic)

print("\nNumber of files in each split class:")
print("----------------------------------")
for split, class_paths in splits_classpath_dic.items():
    for i, class_path in enumerate(class_paths):
        print("'" + dataset_classes[i] + "'' class in '" + split + "' split has " + str(len(os.listdir(class_path))) + " files")
        print("Path to class:")
        print(class_path)
        print()



dataset_split_paths Dictionary:
----------------------------------
{'train': 'D:\\ML\\Datasets\\cats_and_dogs\\splitted\\train', 'validation': 'D:\\ML\\Datasets\\cats_and_dogs\\splitted\\validation'}

splits_classpath_dic Dictionary:
----------------------------------


NameError: name 'splits_classes_dic' is not defined

In [6]:
def split_dataset(source: str, destination: str, split_ratios: dict = {"train":.8, "test":.1, "validation":.1}, show_messages: bool =True):
    """
    This function splits a dataset into train, test, validation directories within 
    class names according to given ratios. It shuffles the data before splitting. 
    ...

    Parameters
    ----------
    source : str
        path of directory containing dataset classes
    destination : str
        path of directory to store dataset classes which contain splitted data
    split_ratios : dict
        Dictionary with keys: train, test, validation
        Value of keys are ratios for splitting data
    show_messages : bool
        if True then it print details about dataset whilst splitting 

    """ 
    
    assert (split_ratios["train"] + split_ratios["test"] + split_ratios["validation"]) == 1, \
    "Incorrect split ratios passed. Make sure sum of ratios == 1."
    
    if show_messages:
            print("Splitting according to these ratios:")
            print("----------------------------------")
            for split, ratio in split_ratios.items():
                print("\t" + split + ": " + str(ratio))
            print()
            
    if not os.path.exists(destination):
        os.makedirs(destination)
    
    # getting classes in dataset
    dataset_classes = os.listdir(source)
    
    # iterating through classes and getting files into dictionary
    dataset = {}
    for dataset_class in dataset_classes:
        # getting source class path
        class_dir = os.path.join(source, dataset_class)
        # getting paths of working files in a list
        class_dataset = []
        for filename in os.listdir(class_dir):
            file_path = os.path.join(class_dir, filename)
            # ignore corrupted files
            if os.path.getsize(file_path) > 0:
                class_dataset.append(filename)
            else:
                print(filename + " has zero size, so ignoring.")
        # shuffling before saving
        shuffled_set = random.sample(class_dataset, len(class_dataset))
        # saving files for each class in dictionary
        dataset[dataset_class] = shuffled_set

    # making folders for splitted data
    splits = ["Train", "Test", "Validation"]
    for split in splits:
        split_dir = os.path.join(destination, split)
        os.mkdir(split_dir)
    
    # for each class save files to train, test, validation folder
    for dataset_class in dataset_classes:
        
        # splitting data of each class
        train_set_len = int(len(dataset[dataset_class]) * split_ratios["train"])
        test_set_len = int(len(dataset[dataset_class]) * split_ratios["test"])
        validation_set_len = int(len(dataset[dataset_class]) * split_ratios["validation"])
        
        if show_messages:
                print("'" + dataset_class + "' Class Details:")
                print("----------------------------------")
                print("\tNumber of files in Training set: " + str(train_set_len))
                print("\tNumber of files in Testing set: " + str(test_set_len))
                print("\tNumber of files in Validation set: " + str(validation_set_len))
                print()
        
        # saving files 
        for split in splits:
            class_dir = os.path.join(source, dataset_class)
            split_dir = os.path.join(destination, split)
            class_destination = os.path.join(split_dir, dataset_class)
            try:
                # making class directory within train/test/validation folder if it doesn't exist
                os.mkdir(class_destination)
            except:
                pass
            
            # getting files ready to copy according to the split
            if split == "Train":
                splitset = dataset[dataset_class][0: train_set_len]
            elif split == "Test":
                splitset = dataset[dataset_class][train_set_len: train_set_len+test_set_len]
            elif split == "Validation":
                splitset = dataset[dataset_class][train_set_len+test_set_len: train_set_len+test_set_len+validation_set_len]

            # copying files to folder
            for file in splitset:
                src_file = os.path.join(class_dir, file)
                dest_file = os.path.join(class_destination, file)
                copyfile(src_file, dest_file)

    if show_messages:
        print("Successfuly copied files to destination folder.")

In [8]:
# splitting dataset
dataset_src = r"D:\ML\Datasets\cats and dogs\PetImages"
dataset_dest = r"D:\ML\Datasets\cats and dogs\PetImages\splitted"
split_dataset(dataset_src, dataset_dest)

Splitting according to these ratios:
----------------------------------
	train: 0.8
	test: 0.1
	validation: 0.1

666.jpg has zero size, so ignoring.
11702.jpg has zero size, so ignoring.
Test has zero size, so ignoring.
Train has zero size, so ignoring.
Validation has zero size, so ignoring.


FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'D:\\ML\\Datasets\\cats and dogs\\PetImages\\splitted\\Train'

In [9]:
# defining model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [10]:
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['acc'])


W0901 21:53:46.175980 23536 deprecation.py:323] From C:\Users\G3NZ\Anaconda3\envs\OpenCV-master-py3\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
dataset = r"D:\ML\Datasets\cats_and_dogs_filtered"
TRAINING_DIR = os.path.join(dataset, "train")
train_datagen = ImageDataGenerator(rescale=1.0/255.,
                                   rotation_range=40,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True,
                                   fill_mode="nearest")
                                   
train_generator = train_datagen.flow_from_directory(TRAINING_DIR,
                                                    batch_size=100,
                                                    class_mode='binary',
                                                    target_size=(150, 150))

VALIDATION_DIR = os.path.join(dataset, "validation")
validation_datagen = ImageDataGenerator(rescale=1.0/255.)
validation_generator = validation_datagen.flow_from_directory(VALIDATION_DIR,
                                                              batch_size=100,
                                                              class_mode='binary',
                                                              target_size=(150, 150))

# Expected Output:
# Found 22498 images belonging to 2 classes.
# Found 2500 images belonging to 2 classes.

Found 2000 images belonging to 2 classes.
Found 1000 images belonging to 2 classes.


In [13]:
# Note that this may take some time.
history = model.fit_generator(train_generator,
                              epochs=10,
                              verbose=1,
                              validation_data=validation_generator)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
%matplotlib inline

import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Retrieve a list of list results on training and test data
# sets for each training epoch
#-----------------------------------------------------------
acc=history.history['acc']
val_acc=history.history['val_acc']
loss=history.history['loss']
val_loss=history.history['val_loss']

epochs=range(len(acc)) # Get number of epochs

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plt.plot(epochs, acc, 'r', "Training Accuracy")
plt.plot(epochs, val_acc, 'b', "Validation Accuracy")
plt.title('Training and validation accuracy')
plt.figure()

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r', "Training Loss")
plt.plot(epochs, val_loss, 'b', "Validation Loss")
plt.figure()


# Desired output. Charts with training and validation metrics. No crash :)