## Classification Simpsons

In [1]:
import os
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping
import matplotlib.pyplot as plt
import pandas as pd

### 1. UPLOADING THE DATABASE


In [2]:
# Root folder that contains the train / val / test sub-folders
base_dir = "lab_01_split"

# One sub-folder per split
train_dir, validation_dir, test_dir = (
    os.path.join(base_dir, split_name) for split_name in ('train', 'val', 'test')
)

# Sub-folder name of each character
Homer = 'homer_simpson'
Bart = 'bart_simpson'
Burns = 'charles_montgomery_burns'
Krusty = 'krusty_the_clown'
Lisa = 'lisa_simpson'
Milhouse = 'milhouse_van_houten'
Marge = 'marge_simpson'
Moe = 'moe_szyslak'
Ned = 'ned_flanders'
Principal = 'principal_skinner'

# Fixed ordering used to build the per-character paths below
_character_folders = (Homer, Bart, Burns, Krusty, Lisa,
                      Milhouse, Marge, Moe, Ned, Principal)

# Per-character folders with the training images
(train_homer, train_bart, train_burns, train_krusty, train_lisa,
 train_milhouse, train_marge, train_more, train_ned, train_principal) = (
    os.path.join(train_dir, folder) for folder in _character_folders
)

# Per-character folders with the validation images
(val_homer, val_bart, val_burns, val_krusty, val_lisa,
 val_milhouse, val_marge, val_more, val_ned, val_principal) = (
    os.path.join(validation_dir, folder) for folder in _character_folders
)

# Per-character folders with the test images
(test_homer, test_bart, test_burns, test_krusty, test_lisa,
 test_milhouse, test_marge, test_more, test_ned, test_principal) = (
    os.path.join(test_dir, folder) for folder in _character_folders
)


Let's check how many pictures we have in each split (train/validation/test):

In [3]:
# Report how many files the Homer folder of each split contains
for split_label, homer_dir in (('training', train_homer),
                               ('validation', val_homer),
                               ('test', test_homer)):
    print('total ' + split_label + ' Homer images:', len(os.listdir(homer_dir)))


total training Homer images: 3144
total validation Homer images: 674
total test Homer images: 674


### 2. BUILDING THE NETWORK


In [20]:
# Convnet for the 10 Simpsons characters: four Conv2D + MaxPooling2D stages
# followed by a dense classifier. Inputs are 256x256 RGB images.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
                        input_shape=(256, 256, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
# One output unit per character. softmax turns the 10 outputs into a
# probability distribution over mutually exclusive classes, which is what a
# categorical cross-entropy loss expects; sigmoid would score each class
# independently and is meant for binary / multi-label outputs.
model.add(layers.Dense(10, activation='softmax'))

In [21]:
# Print the layer-by-layer architecture (layer type, output shape, parameter count)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 254, 254, 32)      896       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 127, 127, 32)      0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 125, 125, 64)      18496     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 62, 62, 64)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 60, 60, 128)       73856     
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 30, 30, 128)       0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 28, 28, 128)      

**COMPILING THE MODEL**


In [32]:
# Training configuration: RMSprop with a 1e-4 learning rate, categorical
# cross-entropy loss, and accuracy reported under the name 'acc'
rmsprop = optimizers.RMSprop(learning_rate=1e-4)
model.compile(optimizer=rmsprop,
              loss='categorical_crossentropy',
              metrics=['acc'])


### 3. DATA PREPROCESSING

In [22]:
# Both generators apply the same single transformation: every pixel value
# is multiplied by 1/255 (no augmentation at this stage)
rescale_factor = 1./255
train_datagen = ImageDataGenerator(rescale=rescale_factor)
validation_datagen = ImageDataGenerator(rescale=rescale_factor)

In [23]:
# Stream batches of training images from the class sub-folders of train_dir.
# target_size and batch_size are spelled out (they equal the Keras defaults)
# so the link with the convnet's input_shape=(256, 256, 3) is explicit.
train_generator = train_datagen.flow_from_directory(
        train_dir,                 # target directory, one sub-folder per character
        target_size=(256, 256),    # images are resized to the convnet's input size
        batch_size=32,
        class_mode='categorical')  # one-hot labels, as needed by categorical_crossentropy

Found 19330 images belonging to 10 classes.


In [24]:
# Validation batches are read from the class sub-folders of validation_dir,
# with one-hot encoded labels
validation_generator = validation_datagen.flow_from_directory(
    directory=validation_dir,
    class_mode='categorical',
)

Found 4144 images belonging to 10 classes.


In [26]:
# Disabled diagnostic (kept commented out): walks train_homer recursively for
# "*.jpg" files, tries to open each one with PIL, and prints the path of every
# file that raises UnidentifiedImageError.
# NOTE(review): it only covers train_homer and the .jpg extension. Presumably it
# was written to track down the file behind the UnidentifiedImageError raised by
# the generator cells -- confirm, and widen the scan to train_dir / validation_dir
# and other extensions if so.
# import PIL
# from pathlib import Path
# from PIL import UnidentifiedImageError


# path = Path(train_homer).rglob("*.jpg")
# for img_p in path:
#     try:
#         img = PIL.Image.open(img_p)
#     except PIL.UnidentifiedImageError:
#             print(img_p)

In [18]:
# Draw a single batch from the training generator and report its array shapes
data_batch, labels_batch = next(iter(train_generator))
print('data batch shape:', data_batch.shape)
print('labels batch shape:', labels_batch.shape)

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7f61e87e94f0>

****************************************************
### CALLBACKS

In Keras, `callbacks` is a set of functions to be applied at given stages of the training procedure.

We are going to use two callbacks functions:

1. **EarlyStopping**, which stops the training when a monitored quantity has stopped improving. It takes the following arguments:

    * `monitor`: The quantity to be monitored. In our case, we are going to monitor `val_acc`, the accuracy computed on the validation set (data that the model has never seen before). Alternatively, you could also monitor `val_loss`, the value of the loss function on the validation data.
    * `min_delta`: minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.
    * `patience`: number of epochs that produced the monitored quantity with no improvement after which training will be stopped.
    * `verbose`: verbosity mode 
    * `mode`: one of auto, min, max. In *min* mode, training will stop when the quantity monitored has stopped decreasing; in *max* mode it will stop when the quantity monitored has stopped increasing; in *auto* mode, the direction is automatically inferred from the name of the monitored quantity


2. **CSVLogger**, which saves the results of each epoch to a csv file. This is fundamental to later recover the training data with no need to retrain the model.  

The full list of callbacks can be found at: https://keras.io/callbacks/

In [28]:
# Stop training once `val_acc` has gone 10 epochs without improving by at
# least 0.001; the direction of improvement is inferred from the metric name
earlystop_options = dict(
    monitor='val_acc',
    mode='auto',
    min_delta=0.001,
    patience=10,
    verbose=1,
)
earlystop = EarlyStopping(**earlystop_options)

In [30]:
# Log the metrics of every epoch as comma-separated rows in training.log;
# append=False starts a fresh file instead of extending an existing one
csv_logger = CSVLogger(filename='training.log', append=False, separator=',')

### 4. FITTING THE MODEL WITH A BATCH GENERATOR

At this point, we can fit our model to the data using the generator with the `fit()` method.

The method expects:

* **A Python generator** that yields batches of inputs and targets indefinitely, like our `train_generator` does. 


* `steps_per_epoch` argument: Because the data is being generated endlessly, Keras needs to know how many batches to draw from the generator before declaring an epoch over. This is the role of the `steps_per_epoch` argument: after having drawn `steps_per_epoch` batches from the generator, i.e. after having run for `steps_per_epoch` gradient descent steps, the fitting process will go to the next epoch. For example, with 20-sample batches it takes 100 batches to see 2000 samples.


* Number of `epochs` argument


* A `validation_data` argument. This argument is allowed to be a data generator itself (like our `validation_generator`), but it could be a tuple of Numpy arrays as well.


* If you pass a `generator` as validation_data, then this generator is expected to yield batches of validation data endlessly, and thus you should also specify the `validation_steps` argument, which tells the process how many batches to draw from the validation generator for evaluation. For example, with 1000 validation images and 20-sample batches, we would need (1000/20)=50 `validation_steps`.

* `callbacks`:  We can pass a list of callbacks that we previously defined.


In [33]:
# Train on the generators, with early stopping and CSV logging.
# One epoch is one full pass over the data: len(generator) is the number of
# batches needed to cover every image in its directory. Fixed step counts
# would only visit part of the images per epoch and would compute val_acc
# (the quantity EarlyStopping monitors) on a different subset each time.
history = model.fit(
      train_generator,
      steps_per_epoch=len(train_generator),
      epochs=30,
      validation_data=validation_generator,
      validation_steps=len(validation_generator),
      callbacks=[earlystop, csv_logger]
)


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7f61e854b270>


#### SAVE THE MODEL


In [None]:
# Save the trained model to disk (the .h5 extension selects the HDF5 format).
# NOTE(review): the file name looks like a leftover from a cats-vs-dogs
# notebook -- consider renaming it for the Simpsons model.
model.save('cats_and_dogs_small_1.h5')


#### PLOT ACCURACY AND LOSS OF THE MODEL

In [None]:
#IF YOU ARE PATIENT ENOUGH TO WAIT FOR THE MODEL TO TRAIN

# Per-epoch metrics recorded by model.fit
recorded = history.history
acc, val_acc = recorded['acc'], recorded['val_acc']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs = range(len(acc))

# One figure per quantity: dots for the training curve, a line for validation
curves = (
    ('accuracy', (acc, 'Training acc'), (val_acc, 'Validation acc')),
    ('loss', (loss, 'Training loss'), (val_loss, 'Validation loss')),
)
for position, (quantity, (train_values, train_label),
               (val_values, val_label)) in enumerate(curves):
    if position > 0:
        # The first curve pair goes on the current figure; later ones get a new one
        plt.figure()
    plt.plot(epochs, train_values, 'bo', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title('Training and validation ' + quantity)
    plt.legend()

plt.show()

In [None]:
#IF YOU UPLOAD EXISTING MODEL
import pandas as pd
log_data = pd.read_csv('/Computer Vision/training.log', sep=',', engine='python')

log_data.head()

In [None]:
# Pull the four logged metric columns out of the training log
acc, val_acc, loss, val_loss = (
    log_data[column] for column in ('acc', 'val_acc', 'loss', 'val_loss')
)

In [None]:
# Quick overlay of the training and validation accuracy on the current axes
ax = plt.gca()

for column, line_options in (('acc', {}), ('val_acc', {'color': 'red'})):
    log_data.plot(kind='line', x='epoch', y=column, ax=ax, **line_options)

plt.show()

These curves indicate *overfitting*. The training accuracy increases linearly over time, until it reaches nearly 100%, while our validation accuracy stalls at 70-72%. 

The validation loss reaches its minimum after only five epochs then stalls, while the training loss keeps decreasing linearly until it reaches nearly 0.

Because we only have relatively few training samples (2000), overfitting is a serious concern. 

You should know how to reduce overfitting with dropout and weight decay (L2 regularization). Let's now learn a technique, specific to computer vision and deep learning models: **data augmentation**.


### 5. DATA AUGMENTATION

Since we have overfitting problems, we need to augment our data. We can do this in Keras by configuring a number of random transformations to be performed on the images read by our `ImageDataGenerator` instance. For instance:


In [None]:
# Random transformations applied to every image drawn from this generator
augmentation_settings = {
    'rotation_range': 40,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}
datagen = ImageDataGenerator(**augmentation_settings)

Here, we are performing the following operations:

* `rotation_range` randomly rotates images. It is a value in degrees (0-180) that indicates the range within which to randomly rotate pictures.

* `width_shift_range` and `height_shift_range` indicate the ranges within which to randomly translate pictures horizontally or vertically. They are given as a fraction of total width or height.

* `shear_range` randomly applies shearing transformations.

* `zoom_range` randomly zooms inside pictures.

* `horizontal_flip` randomly flips half of the images horizontally -- relevant when there are no assumptions of horizontal asymmetry (e.g. real-world pictures).

* `fill_mode` is the strategy used for filling in newly created pixels, which can appear after a rotation or a width/height shift.

These are just some of the options available. For more options, see the Keras documentation.

Let's take a look at our augmented images:

In [None]:
# Preview the augmentation on one training image.
# The Homer training folder is used as the example class. The listing is
# sorted so that the same image is picked on every run.
fnames = sorted(os.path.join(train_homer, fname) for fname in os.listdir(train_homer))

# We select one image to "augment"
img_path = fnames[3]

# Read the image and resize it
img = image.load_img(img_path, target_size=(150, 150))

# Convert it to a Numpy array with shape (150, 150, 3)
x = image.img_to_array(img)

# Reshape it to (1, 150, 150, 3): .flow() expects a batch dimension
x = x.reshape((1,) + x.shape)

# The .flow() command below generates batches of randomly transformed images.
# It will loop indefinitely, so we `break` after the requested number of previews.
n_previews = 4
for i, batch in enumerate(datagen.flow(x, batch_size=1)):
    plt.figure(i)
    imgplot = plt.imshow(image.array_to_img(batch[0]))
    if i + 1 == n_previews:
        break

plt.show()

By using this data augmentation configuration, our network will not see the same image twice. 

However, the input images that the network sees are highly correlated since they have the same original image in common. Thus, we will reduce overfit but just to a certain extent. 

To further reduce overfit, we now add a `Dropout` layer to our model, right before the densely-connected classifier:

In [None]:
# Same convnet as before, for 150x150 RGB inputs, with a Dropout layer right
# before the densely-connected classifier.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
                        input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(rate=0.5))
model.add(layers.Dense(512, activation='relu'))
# The dataset has 10 character classes, so the classifier needs one softmax
# unit per class; a single sigmoid unit can only separate two classes.
model.add(layers.Dense(10, activation='softmax'))

# sparse_categorical_crossentropy is the multi-class loss that takes labels
# as a 1D array of integer class indices (rather than one-hot vectors).
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizers.RMSprop(learning_rate=1e-4),
              metrics=['acc'])


Let's train our network using data augmentation and dropout. We start without `EarlyStopping` callback. 

In [None]:
#NO EARLYSTOPPING CALLBACK
# Training images are rescaled and randomly augmented on the fly
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,)

# Note that the validation data should not be augmented!
validation_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        # This is the target directory
        train_dir,
        # All images will be resized to 150x150
        target_size=(150, 150),
        batch_size=32,
        # 'sparse' yields a 1D array of integer class indices, one per image;
        # 'binary' is only meant for two-class data
        class_mode='sparse')

validation_generator = validation_datagen.flow_from_directory(
        validation_dir,
        target_size=(150, 150),
        batch_size=32,
        class_mode='sparse')


csv_logger = CSVLogger('training_augmented.log',
                       separator=',', 
                       append=False)


# Model.fit accepts generators directly; fit_generator is deprecated.
# One epoch is one full pass over the data: len(generator) is the number of
# batches needed to cover every image in its directory.
history = model.fit(
      train_generator,
      steps_per_epoch=len(train_generator),
      epochs=100,
      validation_data=validation_generator,
      validation_steps=len(validation_generator),
      callbacks=[csv_logger])

In [None]:
# Save the model trained with augmentation and dropout (HDF5 file, per the .h5 extension).
# NOTE(review): the file name looks like a leftover from a cats-vs-dogs
# notebook -- consider renaming it for the Simpsons model.
model.save('cats_and_dogs_small_2.h5')

Let's see if our results improved

In [None]:
# Per-epoch metrics recorded during the augmented training run
recorded = history.history
acc, val_acc = recorded['acc'], recorded['val_acc']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs = range(len(acc))

# (quantity, colour, training values, validation values, short label suffix)
panels = (
    ('accuracy', 'b', acc, val_acc, 'acc'),
    ('loss', 'g', loss, val_loss, 'loss'),
)
for panel_index, (quantity, colour, train_values, val_values, suffix) in enumerate(panels):
    if panel_index:
        # Accuracy is drawn on the current figure; loss gets a figure of its own
        plt.figure()
    plt.plot(epochs, train_values, colour + 'o', label='Training ' + suffix)
    plt.plot(epochs, val_values, colour, label='Validation ' + suffix)
    plt.title('Training and validation ' + quantity)
    plt.legend()

plt.show()

After using data augmentation and dropout, we have avoided overfitting: the training curves are rather closely tracking the validation curves. We now reach an accuracy of 82%, a 15% relative improvement over the non-regularized model.

We could slightly improve our accuracy by fine-tuning the network's parameters (e.g., the number of layers or the number of filters in the convolution layers). However, given the little data we have, it would be hard to reach a higher accuracy. 

This is why we are then going to learn how to take advantage of pre-trained models. 

**************************************************************************************************************

**MODEL FITTING WITH EarlyStopping CALLBACK**

Let's see whether our results would change by using the `EarlyStopping` callback. We expect no change in the results but also we expect to run fewer epochs. Indeed, the model fit stops after 68 epochs.


In [None]:
#WITH EARLYSTOPPING CALLBACK

# Training images are rescaled and randomly augmented on the fly
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,)

# Note that the validation data should not be augmented!
validation_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        # This is the target directory
        train_dir,
        # All images will be resized to 150x150
        target_size=(150, 150),
        batch_size=32,
        # 'sparse' yields a 1D array of integer class indices, one per image;
        # 'binary' is only meant for two-class data
        class_mode='sparse')

validation_generator = validation_datagen.flow_from_directory(
        validation_dir,
        target_size=(150, 150),
        batch_size=32,
        class_mode='sparse')

# Stop once val_acc has gone 10 epochs without improving by at least 0.001
earlystop = EarlyStopping(
    monitor='val_acc',
    min_delta=0.001,
    patience=10,
    verbose=1,
    mode='auto'
)

csv_logger = CSVLogger('training_augmented_earlystopping.log',
                       separator=',', 
                       append=False)


# Model.fit accepts generators directly; fit_generator is deprecated.
# One epoch is one full pass over the data: len(generator) is the number of
# batches needed to cover every image in its directory, so the monitored
# val_acc is computed on the whole validation set at every epoch.
history = model.fit(
      train_generator,
      steps_per_epoch=len(train_generator),
      epochs=100,
      validation_data=validation_generator,
      validation_steps=len(validation_generator),
      callbacks=[earlystop, csv_logger])

In [None]:
# Save the model trained with the EarlyStopping callback (HDF5 file, per the .h5 extension).
# NOTE(review): the file name looks like a leftover from a cats-vs-dogs
# notebook -- consider renaming it for the Simpsons model.
model.save('cats_and_dogs_small_2_earlystopping.h5')

In [None]:
# Per-epoch metrics recorded during the early-stopped training run
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)

epochs = range(len(acc))

# Accuracy curves (blue) on the current figure
for values, fmt, label in ((acc, 'bo', 'Training acc'),
                           (val_acc, 'b', 'Validation acc')):
    plt.plot(epochs, values, fmt, label=label)
plt.title('Training and validation accuracy')
plt.legend()

# Loss curves (green) on a figure of their own
plt.figure()
for values, fmt, label in ((loss, 'go', 'Training loss'),
                           (val_loss, 'g', 'Validation loss')):
    plt.plot(epochs, values, fmt, label=label)
plt.title('Training and validation loss')
plt.legend()

plt.show()