# Introduction

This program classifies images of cats and dogs, with and without applying PCA. It is built from the following prerequisite functions:

1. Data preparing (with and without PCA)
2. Data Augmentation and Data Preprocessing
3. Build Model
4. Compile Model
5. Plot Tensorboard
6. Train Model and Predict
7. Create confusion matrix

After that, we will run each function in order, once with PCA and once without. At the end of the project, the training speed and model performance of the two approaches will be compared.

NOTE: Simply run all to achieve all the objectives in this project.

# 1. Data preparing (with and without PCA)

In [1]:
import zipfile
import shutil
import os
from PIL import Image
import numpy as np
from sklearn.decomposition import PCA
from numpy import asarray

def data_preparing():
  # Refresh data everytime this function is started
  if os.path.exists('/tmp/cats_and_dogs_filtered'):
    shutil.rmtree('/tmp/cats_and_dogs_filtered')
  # Load data
  !wget --no-check-certificate \
    https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip \
    -O /tmp/cats_and_dogs_filtered.zip
  local_zip = '/tmp/cats_and_dogs_filtered.zip'
  zip_ref = zipfile.ZipFile(local_zip, 'r')
  zip_ref.extractall('/tmp')
  zip_ref.close()
  base_dir = '/tmp/cats_and_dogs_filtered'
  # Create the directory path
  train_dir = os.path.join(base_dir, 'train')
  validation_dir = os.path.join(base_dir, 'validation')
  test_dir = os.path.join(base_dir, 'test')
  # Directory with our training cat pictures
  train_cats_dir = os.path.join(train_dir, 'cats')
  # Directory with our training dog pictures
  train_dogs_dir = os.path.join(train_dir, 'dogs')
  # Directory with our validation cat pictures
  validation_cats_dir = os.path.join(validation_dir, 'cats')
  # Directory with our validation dog pictures
  validation_dogs_dir = os.path.join(validation_dir, 'dogs')
  # Directory with our test cat pictures
  test_cats_dir = os.path.join(test_dir, 'cats')
  # Directory with our test dogs pictures
  test_dogs_dir = os.path.join(test_dir, 'dogs')
  # Create the test directories if they don't exist
  os.makedirs(test_cats_dir, exist_ok=True)
  os.makedirs(test_dogs_dir, exist_ok=True)
  # Move 200 cat images to the test_cats_dir
  cat_filenames = os.listdir(train_cats_dir)[:200]
  for cat_filename in cat_filenames:
    src = os.path.join(train_cats_dir, cat_filename)
    dst = os.path.join(test_cats_dir, cat_filename)
    shutil.move(src, dst)
  # Move 200 dog images to the test_dogs_dir
  dog_filenames = os.listdir(train_dogs_dir)[:200]
  for dog_filename in dog_filenames:
    src = os.path.join(train_dogs_dir, dog_filename)
    dst = os.path.join(test_dogs_dir, dog_filename)
    shutil.move(src, dst)
  return train_dir, validation_dir, test_dir

def data_preparing_with_pca(n_components):
  # Refresh data everytime this function is started
  if os.path.exists('/tmp/cats_and_dogs_filtered'):
    shutil.rmtree('/tmp/cats_and_dogs_filtered')
  # Load data
  !wget --no-check-certificate \
    https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip \
    -O /tmp/cats_and_dogs_filtered.zip
  local_zip = '/tmp/cats_and_dogs_filtered.zip'
  zip_ref = zipfile.ZipFile(local_zip, 'r')
  zip_ref.extractall('/tmp')
  zip_ref.close()
  base_dir = '/tmp/cats_and_dogs_filtered'
  # Create separate PCA instances for each channel
  pca_red = PCA(n_components)
  pca_green = PCA(n_components)
  pca_blue = PCA(n_components)
  # Create the directory path
  train_dir = os.path.join(base_dir, 'train')
  validation_dir = os.path.join(base_dir, 'validation')
  test_dir = os.path.join(base_dir, 'test')
  # Directory with our training cat pictures
  train_cats_dir = os.path.join(train_dir, 'cats')
  # Directory with our training dog pictures
  train_dogs_dir = os.path.join(train_dir, 'dogs')
  # Directory with our validation cat pictures
  validation_cats_dir = os.path.join(validation_dir, 'cats')
  # Directory with our validation dog pictures
  validation_dogs_dir = os.path.join(validation_dir, 'dogs')
  # Directory with our test cat pictures
  test_cats_dir = os.path.join(test_dir, 'cats')
  # Directory with our test dogs pictures
  test_dogs_dir = os.path.join(test_dir, 'dogs')
  # Load the cat/cog training/validation images, apply PCA, and save
  image_file_path_train_cats = [os.path.join(train_cats_dir, filename) for filename in os.listdir(train_cats_dir)]
  image_file_path_validation_cats = [os.path.join(validation_cats_dir, filename) for filename in os.listdir(validation_cats_dir)]
  image_file_path_train_dogs = [os.path.join(train_dogs_dir, filename) for filename in os.listdir(train_dogs_dir)]
  image_file_path_validation_dogs = [os.path.join(validation_dogs_dir, filename) for filename in os.listdir(validation_dogs_dir)]
  for (path,dir) in zip([image_file_path_train_cats,image_file_path_validation_cats,image_file_path_train_dogs,
                    image_file_path_validation_dogs],
                     [train_cats_dir,validation_cats_dir,train_dogs_dir,validation_dogs_dir]):
    for i, image_path in enumerate(path):
      # Load the original image
      original_image = Image.open(image_path)
      # Separate the image into its RGB channels
      red_channel, green_channel, blue_channel = original_image.split()
      # Convert each channel to NumPy arrays
      red_channel = np.array(red_channel)
      green_channel = np.array(green_channel)
      blue_channel = np.array(blue_channel)
      # Apply PCA to the red channel
      pca_red = pca_red.fit(red_channel)
      red_channel = pca_red.transform(red_channel)
      red_channel = pca_red.inverse_transform(red_channel)
      # Apply PCA to the green channel
      pca_green = pca_green.fit(green_channel)
      green_channel = pca_green.transform(green_channel)
      green_channel = pca_green.inverse_transform(green_channel)
      # Apply PCA to the blue channel
      pca_blue = pca_blue.fit(blue_channel)
      blue_channel = pca_blue.transform(blue_channel)
      blue_channel = pca_blue.inverse_transform(blue_channel)
      # Create an image from the reconstructed channels
      reconstructed_image = Image.merge("RGB", [
          Image.fromarray(red_channel.astype(np.uint8)),
          Image.fromarray(green_channel.astype(np.uint8)),
          Image.fromarray(blue_channel.astype(np.uint8))
      ])
      # Define the new file path for the PCA-reversed image
      new_image_path = os.path.join(dir, f'reconstructed_{i}.jpg')
      # Save the PCA-reversed image as JPEG
      reconstructed_image.save(new_image_path)
    # Delete original images
    for image_path in path:
      os.remove(image_path)
  # Create the test directories if they don't exist
  os.makedirs(test_cats_dir, exist_ok=True)
  os.makedirs(test_dogs_dir, exist_ok=True)
  # Move 200 cat images to the test_cats_dir
  cat_filenames = os.listdir(train_cats_dir)[:200]
  for cat_filename in cat_filenames:
    src = os.path.join(train_cats_dir, cat_filename)
    dst = os.path.join(test_cats_dir, cat_filename)
    shutil.move(src, dst)
  # Move 200 dog images to the test_dogs_dir
  dog_filenames = os.listdir(train_dogs_dir)[:200]
  for dog_filename in dog_filenames:
    src = os.path.join(train_dogs_dir, dog_filename)
    dst = os.path.join(test_dogs_dir, dog_filename)
    shutil.move(src, dst)
  return train_dir, validation_dir, test_dir

# 2. Data Augmentation and Data Preprocessing

In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import Sequential
from tensorflow.keras import layers

# Random image-augmentation pipeline, meant to be attached to the training
# data generator only (validation and test images are left unaugmented).
data_augmentation = Sequential(
    name="img_augmentation",
    layers=[
        layers.RandomFlip(mode="horizontal"),
        layers.RandomRotation(factor=0.1),
        layers.RandomZoom(height_factor=0.1),
        layers.RandomContrast(0.1),
        layers.RandomTranslation(0.1, 0.1),
    ],
)

def data_preprocessing(train_dir,validation_dir,test_dir):
  """Build the train, validation and test image generators.

  All three generators rescale pixel values by 1/255, resize images to
  384x384 and yield batches of 20. Only the training generator additionally
  passes images through the module-level `data_augmentation` pipeline.

  Args:
    train_dir: Directory with one sub-directory per class of training images.
    validation_dir: Same layout, validation images.
    test_dir: Same layout, test images.

  Returns:
    A (train_generator, validation_generator, test_generator) tuple. The
    train and validation generators yield binary labels; the test generator
    yields images only, unshuffled.
  """
  pixel_scale = 1./255
  # Settings shared by every flow: image size and batch size
  shared_flow_args = dict(target_size=(384, 384), batch_size=20)

  # Training data: rescale + random augmentation. Binary labels are required
  # because the model is trained with binary_crossentropy.
  augmenting_datagen = ImageDataGenerator(
      rescale=pixel_scale, preprocessing_function=data_augmentation)
  train_generator = augmenting_datagen.flow_from_directory(
      train_dir, class_mode='binary', **shared_flow_args)

  # Validation data: rescale only
  plain_val_datagen = ImageDataGenerator(rescale=pixel_scale)
  validation_generator = plain_val_datagen.flow_from_directory(
      validation_dir, class_mode='binary', **shared_flow_args)

  # Test data: rescale only, no labels in the batches, and a fixed order so
  # that predictions line up with the generator's file order.
  plain_test_datagen = ImageDataGenerator(rescale=pixel_scale)
  test_generator = plain_test_datagen.flow_from_directory(
      test_dir, class_mode=None, shuffle=False, **shared_flow_args)

  return train_generator, validation_generator, test_generator

# 3. Build Model

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dropout, Dense, Reshape
import tensorflow_hub as hub

def build_model():
  """Assemble the classifier: a frozen EfficientNetV2 hub layer plus a small dense head.

  The model is built for batches of 384x384 RGB images and ends in a single
  sigmoid unit for binary classification. It is returned uncompiled.
  """
  # Pre-trained backbone loaded from the hub handle; its weights stay frozen.
  # NOTE(review): the handle names the "classification" variant, so the head
  # below consumes that layer's output rather than a feature vector - confirm
  # this is intended.
  backbone = hub.KerasLayer("https://www.kaggle.com/models/google/efficientnet-v2/frameworks/TensorFlow2/variations/imagenet1k-s-classification/versions/2", trainable=False)
  hidden = tf.keras.layers.Dense(128, activation='relu')
  regulariser = tf.keras.layers.Dropout(0.2)
  # One sigmoid output for binary classification
  output = tf.keras.layers.Dense(1, activation='sigmoid')

  model = tf.keras.Sequential([backbone, hidden, regulariser, output])
  model.build([None,384,384,3])
  return model

# 4. Compile Model

In [4]:
from tensorflow.keras.optimizers import Adam

def compile_model(model):
  """Compile `model` for binary classification and return the same object.

  Uses binary cross-entropy loss, an Adam optimizer with learning rate 0.001,
  and reports the 'acc' metric.
  """
  optimizer = Adam(learning_rate=0.001)
  model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])
  return model

# 5. Plot Tensorboard

In [5]:
from tensorflow.keras.callbacks import TensorBoard
from datetime import datetime

def plot_tensorboard(base_log_dir):
    """Create a TensorBoard callback that logs into a timestamped sub-directory.

    Args:
        base_log_dir: Parent directory for the logs; a "YYYYmmdd-HHMMSS"
            folder named after the current local time is appended to it.

    Returns:
        A TensorBoard callback with histogram logging every epoch.
    """
    run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    return TensorBoard(log_dir=f"{base_log_dir}/{run_stamp}", histogram_freq=1)

# 6. Train Model and Predict

In [6]:
from tensorflow.keras.callbacks import EarlyStopping

# EarlyStopping callback: watch the validation loss, stop once it has gone
# 3 epochs without improving, and roll the model back to the weights of the
# best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

def train_model_predict():
  """Train the global `model` and return its predictions on the test set.

  Reads the notebook-level globals `model`, `train_generator`,
  `validation_generator`, `test_generator`, `tensorboard_callback` and
  `early_stopping`, so those must be defined before this is called.

  Returns:
    The array produced by `model.predict` over the whole test generator.
  """
  model.fit(
    train_generator,
    # One full pass over each generator per epoch. len() is the number of
    # batches, i.e. 80 for 1600 training images and 50 for 1000 validation
    # images at batch size 20, and stays right if the data size changes.
    steps_per_epoch=len(train_generator),
    epochs=15,
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    verbose=1,
    callbacks=[tensorboard_callback,early_stopping])
  # Model.predict accepts generators directly; predict_generator is deprecated
  # and is not available in newer Keras releases.
  predictions = model.predict(test_generator, steps=len(test_generator), verbose=1)
  return predictions

# 7. Create Confusion Matrix

In [7]:
from sklearn.metrics import confusion_matrix, classification_report

def create_confusionmatrix():
  """Print the confusion matrix and classification report for the test set.

  Reads the notebook-level globals `test_generator` (for the true class
  indices) and `predictions` (predicted probabilities). Returns nothing.
  """
  # Ground-truth class indices as recorded by the test generator
  y_true = test_generator.classes
  # Threshold the predicted probabilities at 0.5 to get 0/1 labels
  y_pred = (predictions > 0.5).astype(int)

  print("Confusion Matrix:", confusion_matrix(y_true, y_pred), sep="\n")
  # Per-class precision / recall / f1 and the overall accuracy
  print("Classification Report:", classification_report(y_true, y_pred), sep="\n")

# Executing program without PCA
This will execute all the prerequisite functions without applying PCA on the data

In [8]:
# This cell executes all the functions that will become the scoring criteria without applying PCA
# The results are stored in notebook-level globals because train_model_predict()
# and create_confusionmatrix() read them by name instead of taking arguments.

# Download the data and create the train/validation/test directories (no PCA)
train_dir, validation_dir, test_dir=data_preparing()
# Wrap the three directories in Keras image generators
train_generator, validation_generator, test_generator=data_preprocessing(train_dir,validation_dir,test_dir)
# Build and compile a fresh model for this run
model=build_model()
model=compile_model(model)
# Log this run under a timestamped sub-directory of logs/
tensorboard_callback=plot_tensorboard("logs/")
# Train, then predict on the test generator
predictions=train_model_predict()
# Print the confusion matrix and classification report
create_confusionmatrix()
# Load the TensorBoard notebook extension and display the logs/ directory
%load_ext tensorboard
%tensorboard --logdir logs/

--2023-11-04 19:57:31--  https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.10.207, 142.251.12.207, 172.217.194.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.10.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 68606236 (65M) [application/zip]
Saving to: ‘/tmp/cats_and_dogs_filtered.zip’


2023-11-04 19:57:35 (16.8 MB/s) - ‘/tmp/cats_and_dogs_filtered.zip’ saved [68606236/68606236]

Found 1600 images belonging to 2 classes.
Found 1000 images belonging to 2 classes.
Found 400 images belonging to 2 classes.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15


  predictions = model.predict_generator(test_generator, steps=len(test_generator), verbose=1)


Confusion Matrix:
[[200   0]
 [  3 197]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       200
           1       1.00      0.98      0.99       200

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400



<IPython.core.display.Javascript object>

# Executing program with PCA
This will execute all the prerequisite functions by applying PCA on the data

In [10]:
# This cell executes all the functions that will become the scoring criteria by applying PCA
# The results overwrite the notebook-level globals of the previous run because
# train_model_predict() and create_confusionmatrix() read them by name.

# Re-download the data and replace every train/validation image by its PCA reconstruction
train_dir, validation_dir, test_dir=data_preparing_with_pca(0.95) # a float in (0, 1) makes PCA keep enough components to explain 95% of the variance
# Wrap the three directories in Keras image generators
train_generator, validation_generator, test_generator=data_preprocessing(train_dir,validation_dir,test_dir)
# Build and compile a fresh model so nothing carries over from the non-PCA run
model=build_model()
model=compile_model(model)
# Log this run under its own timestamped sub-directory of logs/
tensorboard_callback=plot_tensorboard("logs/")
# Train, then predict on the test generator
predictions=train_model_predict()
# Print the confusion matrix and classification report
create_confusionmatrix()
# Load the TensorBoard notebook extension and display the logs/ directory
%load_ext tensorboard
%tensorboard --logdir logs/

--2023-11-04 20:20:13--  https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.170.207, 142.251.175.207, 74.125.24.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.170.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 68606236 (65M) [application/zip]
Saving to: ‘/tmp/cats_and_dogs_filtered.zip’


2023-11-04 20:20:18 (16.4 MB/s) - ‘/tmp/cats_and_dogs_filtered.zip’ saved [68606236/68606236]

Found 1600 images belonging to 2 classes.
Found 1000 images belonging to 2 classes.
Found 400 images belonging to 2 classes.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15


  predictions = model.predict_generator(test_generator, steps=len(test_generator), verbose=1)


Confusion Matrix:
[[193   7]
 [  7 193]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       200
           1       0.96      0.96      0.96       200

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


<IPython.core.display.Javascript object>

# Conclusion

Classifying cat and dog images with and without PCA (retaining 95% of the explained variance) showed no noticeable difference in training speed. Model performance, however, declined slightly with PCA: without PCA the model easily reaches 98%–100% across the classification report, with only 3 images misclassified (all false negatives), whereas with PCA it scores roughly 96% across the classification report, with 7 false positives and 7 false negatives.

One possible explanation is that the image dataset is already clean, so applying PCA only discards principal components unnecessarily and contributes nothing to cleaning the images.