In [None]:
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from keras import utils as np_utils
from glob import glob
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import cv2
import matplotlib.pyplot as plt       
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image         
from tqdm import tqdm
from keras.applications.resnet50 import preprocess_input, decode_predictions
from PIL import ImageFile
from collections import namedtuple
import random

# Seed every random number generator the notebook relies on, so that data
# shuffling (random / NumPy) and model weight initialisation (TensorFlow)
# are repeatable after Restart Kernel -> Run All.
SEED = 8675309
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Let PIL load image files whose data is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Project Flow

* 0: Import Datasets
* 1: Detect Humans
* 2: Detect Dogs
* 3: Create a CNN to Classify Dog Breeds (from Scratch)
* 4: Use a CNN to Classify Dog Breeds (using Transfer Learning)
* 5: Create a CNN to Classify Dog Breeds (using Transfer Learning)
* 6: Write your Algorithm
* 7: Test Your Algorithm
* 8: Build/deploy webapp

In [None]:
def load_human_files(data_dir='./lfw'):
    """Collect the human image paths under ``data_dir`` in shuffled order.

    Args:
        data_dir (str, optional): Root directory; every entry matching
            ``<data_dir>/*/*`` is collected. Defaults to './lfw'.

    Returns:
        numpy.ndarray: 1-D array of the matching paths, shuffled with the
        global ``random`` generator (seed it beforehand for a repeatable
        order).
    """
    # glob() yields entries in an arbitrary, filesystem-dependent order, so
    # sort first: otherwise even a seeded shuffle differs between machines.
    paths = sorted(glob(f'{data_dir}/*/*'))
    random.shuffle(paths)
    human_files = np.array(paths)

    # print statistics about the dataset
    print('There are %d total human images.' % len(human_files))
    return human_files

def load_dog_data(
        data_dir='./dogImages',
        num_classes=None,
        ):
    """Load the dog image paths, one-hot breed targets and breed names.

    The dataset root must contain ``train``, ``valid`` and ``test``
    sub-directories, each holding one folder per breed.

    Args:
        data_dir (str, optional): Dataset root directory. Defaults to './dogImages'.
        num_classes (int, optional): Width of the one-hot target vectors.
            Defaults to the number of breed folders found in the train split.

    Returns:
        tuple: ``(train_files, train_targets, valid_files, valid_targets,
        test_files, test_targets, dog_names)`` where each ``*_files`` is an
        array of image paths, each ``*_targets`` is the matching one-hot
        array, and ``dog_names`` is the list of breed names taken from the
        sorted train folders.
    """
    import os  # local import: `os` is not imported at the top of the notebook

    # Normalise the root so the default './dogImages' still produces the
    # 'dogImages/...' paths this function has always returned.
    root = os.path.normpath(data_dir)
    train_dir = os.path.join(root, 'train')
    valid_dir = os.path.join(root, 'valid')
    test_dir = os.path.join(root, 'test')

    def breed_name(class_dir):
        """Turn '<root>/train/001.Affenpinscher/' into 'Affenpinscher'."""
        folder = os.path.basename(os.path.normpath(class_dir))
        prefix, sep, name = folder.partition('.')
        # Only strip a leading numeric index; keep other folder names intact.
        return name if sep and prefix.isdigit() else folder

    # load list of dog names (trailing separator makes glob match folders only)
    class_dirs = sorted(glob(os.path.join(train_dir, '*', '')))
    dog_names = [breed_name(class_dir) for class_dir in class_dirs]

    if num_classes is None:
        num_classes = len(dog_names)

    def load_dataset(path):
        # Only paths and labels are needed; load_content=False stops
        # load_files from reading every image's bytes into memory.
        data = load_files(path, load_content=False)
        dog_files = np.array(data['filenames'])
        dog_targets = np_utils.to_categorical(np.array(data['target']), num_classes)
        return dog_files, dog_targets

    # load train, test, and validation datasets
    train_files, train_targets = load_dataset(train_dir)
    valid_files, valid_targets = load_dataset(valid_dir)
    test_files, test_targets = load_dataset(test_dir)

    return (
        train_files, train_targets,
        valid_files, valid_targets,
        test_files, test_targets,
        dog_names
    )

def load_open_cv_face_data(
        human_files,
        plot=True,
        sample_index=3,
        cascade_path='haarcascades/haarcascade_frontalface_alt.xml',
        ):
    """Build the Haar-cascade face detector and try it on one sample image.

    Args:
        human_files (sequence of str): Paths of human images.
        plot (bool, optional): When True, show the sample image with a
            bounding box around every detected face. Defaults to True.
        sample_index (int, optional): Index into ``human_files`` of the image
            used for the demonstration. Defaults to 3.
        cascade_path (str, optional): Path of the pre-trained cascade XML file.

    Returns:
        tuple: ``(face_cascade, faces)`` - the loaded ``cv2.CascadeClassifier``
        and the detections returned by ``detectMultiScale`` for the sample.

    Raises:
        OSError: If the cascade file or the sample image cannot be loaded.
        IndexError: If ``sample_index`` is out of range for ``human_files``.
    """
    # extract pre-trained face detector
    face_cascade = cv2.CascadeClassifier(cascade_path)
    # A missing or invalid XML file yields an empty classifier rather than an
    # exception; fail now instead of inside detectMultiScale.
    if face_cascade.empty():
        raise OSError(f'Could not load Haar cascade from: {cascade_path}')

    # load color (BGR) image
    sample_path = human_files[sample_index]
    img = cv2.imread(sample_path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise OSError(f'Could not read image: {sample_path}')

    # convert BGR image to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # find faces in image
    faces = face_cascade.detectMultiScale(gray)

    # print number of faces detected in the image
    print('Number of faces detected:', len(faces))

    if plot:
        # add a bounding box to the color image for each detected face
        for (x, y, w, h) in faces:
            cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)

        # convert BGR image to RGB for plotting
        cv_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # display the image, along with bounding box
        plt.imshow(cv_rgb)
        plt.show()

    return face_cascade, faces

# returns "True" if face is detected in image stored at img_path
def face_detector(img_path, face_cascade):
    """Report whether the cascade finds at least one face in an image.

    Args:
        img_path (str): Path of the image file to inspect.
        face_cascade: Detector exposing ``detectMultiScale(gray)``, e.g. a
            loaded ``cv2.CascadeClassifier``.

    Returns:
        bool: True when at least one detection is returned, else False.

    Raises:
        OSError: If the image at ``img_path`` cannot be read.
    """
    img = cv2.imread(img_path)
    # cv2.imread signals failure by returning None rather than raising; fail
    # here with the offending path instead of a cryptic cvtColor error.
    if img is None:
        raise OSError(f'Could not read image: {img_path}')
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray)
    return len(faces) > 0

In [None]:
# Load the dog dataset: image paths and one-hot targets for each split,
# plus the list of breed names.
(train_files, train_targets,
 valid_files, valid_targets,
 test_files, test_targets,
 dog_names) = load_dog_data()

# Load the human image paths, then build the Haar-cascade face detector and
# display its detections on one sample human image.
human_files = load_human_files()
face_cascade, faces = load_open_cv_face_data(human_files, plot=True)

# __Question 1:__
Use the code cell below to test the performance of the `face_detector` function.  
- What percentage of the first 100 images in `human_files` have a detected human face?  
- What percentage of the first 100 images in `dog_files` have a detected human face? 

Ideally, we would like 100% of human images with a detected face and 0% of dog images with a detected face.  You will see that our algorithm falls short of this goal, but still gives acceptable performance.  The cell below takes the file paths of the first 100 images from each dataset (`human_files[:100]` for humans and `train_files[:100]` for dogs) and runs the face detector on them.

__Answer:__ 

The face detector found a human face in 100% of the first 100 human images, but it also reported a human face in 12% of the first 100 dog images.

In [None]:

# Share of the first 100 images of each dataset in which the face detector
# reports at least one human face.
for label, files in {
    'humans as humans': human_files[:100],
    'dogs as humans': train_files[:100],
    }.items():
    # Mean of the boolean detections == detected count / number of files.
    detected_pct = np.mean([face_detector(i, face_cascade) for i in files])
    print(f'Detected {detected_pct:.1%} of {len(files)} {label}')


# Question 2

In [None]:
# thank you:
# https://www.analyticsvidhya.com/blog/2019/01/build-image-classification-model-10-minutes/?utm_source=blog&utm_source=learn-image-classification-cnn-convolutional-neural-networks-5-datasets
# https://www.analyticsvidhya.com/blog/2018/12/guide-convolutional-neural-network-cnn/

class JustinDogCNN(BaseEstimator, TransformerMixin):
    """Small convolutional image classifier wrapped as a scikit-learn step.

    ``fit`` builds and trains a Keras ``Sequential`` network (three Conv2D
    layers with max-pooling, then two Dense layers); ``transform`` returns
    the trained network's predictions.

    Args:
        loss_function (str, optional): Loss passed to ``model.compile``.
        optimizer (str, optional): Optimizer passed to ``model.compile``.
        eval_metrics (list, optional): Metrics passed to ``model.compile``.
            ``None`` (the default) means ``['accuracy']``.
        epochs (int, optional): Number of training epochs. Defaults to 10.
        batch_size (int, optional): Training batch size. Defaults to 32.
    """

    def __init__(
            self,
            loss_function='sparse_categorical_crossentropy',
            optimizer='adam',
            eval_metrics=None,
            epochs=10,
            batch_size=32,
            ):
        self.model = None  # set by fit()
        self.loss_function = loss_function
        self.optimizer = optimizer
        # None instead of a list literal: a mutable default would be shared
        # by every instance created without this argument.
        self.eval_metrics = eval_metrics
        self.epochs = epochs
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Build, compile and train the network.

        Args:
            X: Array of images; ``X.shape[1:]`` is used as the input shape.
            y: Integer class labels, or a 2-D one-hot target array.

        Returns:
            JustinDogCNN: ``self``, with the trained network in ``self.model``.

        Raises:
            ValueError: If ``y`` is not provided.
        """
        if y is None:
            raise ValueError('JustinDogCNN.fit requires target labels `y`.')

        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] > 1:
            # One-hot targets: one column per class.
            n_classes = y.shape[1]
            if self.loss_function == 'sparse_categorical_crossentropy':
                # The sparse loss expects integer labels, not one-hot rows.
                y = y.argmax(axis=1)
        else:
            # Integer labels 0..K-1.
            n_classes = int(y.max()) + 1

        if self.eval_metrics is None:
            metrics = ['accuracy']
        else:
            metrics = list(self.eval_metrics)

        self.model = models.Sequential([
            layers.Input(shape=X.shape[1:]),
            layers.Conv2D(32, (3, 3), activation='relu'),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.Flatten(),
            layers.Dense(64, activation='relu'),
            # One output unit per class, sized from the targets.
            layers.Dense(n_classes, activation='softmax'),
        ])

        self.model.compile(
            optimizer=self.optimizer,
            loss=self.loss_function,
            metrics=metrics,
            )

        self.model.fit(
            X, y,
            epochs=self.epochs,
            batch_size=self.batch_size,
            verbose=1,
            )
        return self

    def transform(self, X, y=None):
        """Return the trained network's predictions for ``X``.

        Args:
            X: Array of images shaped like the training data.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            The output of ``self.model.predict(X)``.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
        """
        if self.model is None:
            # Local import keeps this block self-contained. NotFittedError
            # subclasses AttributeError, which is what was raised here before.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                'This JustinDogCNN instance is not fitted yet; call fit() first.'
            )
        return self.model.predict(X)
    
# Single-step scikit-learn pipeline whose only stage, named 'cnn', is the
# CNN estimator defined above.
pipeline = Pipeline(steps=[('cnn', JustinDogCNN())])

# TODO: X_train / y_train / X_test / y_test are not defined in this notebook
# yet; enable the lines below once the image tensors have been prepared.
#
# Fit the pipeline:
# pipeline.fit(X_train, y_train)
#
# Evaluate the model:
# test_loss, test_acc = pipeline.named_steps['cnn'].model.evaluate(X_test, y_test, verbose=2)
# print(f"Test accuracy: {test_acc}")