# MSS 462 Final Project

Have you ever wanted to track what your cat is doing around your house during the day?

Me too! 

This computer vision project takes an image and puts a bounding box around cats that are in the image. 
It is built with ResNet-50 as the basis, then additional fine-tuning is done using the "feline-felid" dataset from ImageNet. 
Since this is a transfer learning situation, I freeze the ResNet layers and only train the output layers.

The model is then exported as a tflite model, so that I can run it on a Raspberry Pi.

In [None]:
import numpy as np
from PIL import Image as PIL_Image
from PIL import ImageDraw as PIL_Draw

from matplotlib import pyplot as plt

# all of the tensorflow stuff
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.utils import Sequence
from sklearn.model_selection import train_test_split

# for bringing in the dataset
from tkinter import Tk
from tkinter import filedialog as fd
from os import listdir, path
import pandas as pd
import xml.etree.ElementTree as ET

import pickle

from sklearn import metrics

### Define the model



In [None]:
# binary classifier: does the image contain a cat at all?
def cat_tagger():
    """Build the cat / not-cat classifier on top of a frozen ResNet50.

    Returns:
        A Keras ``Model`` that takes a (224, 224, 3) image and produces a
        single sigmoid output.
    """
    layers = tf.keras.layers
    image_shape = (224, 224, 3)

    inputs = tf.keras.Input(shape=image_shape)

    # ImageNet-pretrained backbone without its classification head
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=image_shape)

    # training=False runs the backbone in inference mode
    features = backbone(inputs, training=False)

    # new head: pool, flatten, one hidden layer, one sigmoid unit
    head = layers.MaxPool2D()(features)
    head = layers.Flatten()(head)
    head = layers.Dense(1024, activation='relu')(head)
    head = layers.Dense(1, activation='sigmoid')(head)  # is it a cat?

    # freeze every backbone layer so only the new head gets trained
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = False

    return Model(inputs=inputs, outputs=head)

# mask regressor: where in the image is the cat?
def cat_boxer():
    """Build the bounding-box mask model on top of a frozen ResNet50.

    Returns:
        A Keras ``Model`` that takes a (224, 224, 3) image and produces
        224*224 sigmoid outputs, one per pixel of a flattened mask.
    """
    layers = tf.keras.layers
    image_shape = (224, 224, 3)

    inputs = tf.keras.Input(shape=image_shape)

    # ImageNet-pretrained backbone without its classification head
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=image_shape)

    # freeze every backbone layer so only the new head gets trained
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = False

    # training=False runs the backbone in inference mode
    head = backbone(inputs, training=False)
    head = layers.MaxPool2D()(head)
    head = layers.Flatten()(head)
    head = layers.Dense(1024, activation='relu')(head)
    head = layers.Dense(224*224, activation='sigmoid')(head)  # flattened box mask

    return Model(inputs=inputs, outputs=head)

# small from-scratch alternative to the ResNet-based tagger
def cat_tagger_basic():
    """Build a small convolutional cat / not-cat classifier without ResNet.

    Returns:
        A Keras ``Model`` that takes a (224, 224, 3) image and produces a
        single sigmoid output.
    """
    layers = tf.keras.layers

    inputs = tf.keras.Input((224, 224, 3))

    # two small convolutions with an average pool in between
    net = layers.Conv2D(3, 4, activation='relu')(inputs)
    net = layers.AvgPool2D()(net)
    net = layers.Conv2D(1, 4, activation='relu')(net)

    # dense head ending in a single sigmoid unit
    net = layers.Flatten()(net)
    net = layers.Dense(1024, activation='relu')(net)
    net = layers.Dense(1024, activation='relu')(net)
    net = layers.Dense(1, activation='sigmoid')(net)

    return Model(inputs=inputs, outputs=net)



### Bring in the training data

This is the feline dataset from ImageNet

The bounding boxes are defined in PASCAL VOC XML files, so unfortunately we have to manually traverse the XML tree rather than just use pandas' `read_xml`

In [None]:
def parse_bbox(filepath):
    """Turn a PASCAL VOC annotation file into a binary bounding-box mask.

    Box coordinates are rescaled from the annotated image size to 224x224 so
    the mask lines up with images loaded at that target size.

    Args:
        filepath: path of the XML annotation file.

    Returns:
        A (224, 224) float array indexed as [row (y), column (x)] that is 1
        inside every annotated box and 0 elsewhere. An annotation without any
        ``object`` element gives an all-zero mask. Returns None when
        ``filepath`` does not exist.
    """
    # if we don't have a bounding box for that file...
    if not path.exists(filepath):
        return None

    imagesize = (224, 224)  # (height, width) of the resized image
    target_h, target_w = imagesize

    root = ET.parse(filepath).getroot()  # root of the tree

    scalex = target_w / float(root.find('size/width').text)
    scaley = target_h / float(root.find('size/height').text)

    bnd_im = np.zeros(imagesize)

    # mark every annotated object, not just the last one in the file
    for obj in root.iter('object'):
        xmin = int(float(obj.find('bndbox/xmin').text) * scalex)
        xmax = int(float(obj.find('bndbox/xmax').text) * scalex)
        ymin = int(float(obj.find('bndbox/ymin').text) * scaley)
        ymax = int(float(obj.find('bndbox/ymax').text) * scaley)

        # numpy images are [row, column], i.e. [y, x]
        bnd_im[ymin:ymax, xmin:xmax] = 1

    return bnd_im

We'll be loading in a few datasets here. 

The first is all of the "cat" pictures from the imagenet dataset. After running it through the Keras pre-processing module, that will give us a Nx224x224x3 array. N is the number of images, and each image is a 224 pixel square in RGB format

We also have an array of "not cat" pictures, that will be Mx224x224x3; M is the number of "not-cat" images

In [None]:
# HACK: a throwaway Tk root is created only so the dialog has a parent window;
# it is destroyed as soon as the dialog returns

# basedir -- cat_pics, cat_bbox, and notcat_pics should all be inside
root = Tk()
base_dir = fd.askdirectory(master=root)
root.destroy()

cat_dir = path.join(base_dir, 'cat_pics')
notcat_dir = path.join(base_dir, 'notcat_pics')

# actually stop here: failing later inside listdir gives a far less helpful error
for required_dir in (cat_dir, notcat_dir):
    if not path.isdir(required_dir):
        raise FileNotFoundError('Not finding the right subdirectories! Missing: ' + required_dir)


def _load_image_stack(image_dir):
    """Load every JPEG in image_dir, resized to 224x224, into one array.

    Returns:
        (stack, names): an (N, 224, 224, 3) float32 array and the list of the
        N file names it was built from, in the same order.
    """
    names = [file for file in listdir(image_dir) if 'jpeg' in file.lower()]
    # float32 is what img_to_array produces by default; float64 would only double the RAM
    stack = np.empty((len(names), 224, 224, 3), dtype=np.float32)
    for idx, name in enumerate(names):
        img = image.load_img(path.join(image_dir, name), target_size=(224, 224))
        stack[idx] = image.img_to_array(img)
    return stack, names


cats, cat_list = _load_image_stack(cat_dir)

# random generator used later on to pick sample images
notcat_rng = np.random.default_rng()
not_cat, notcat_list = _load_image_stack(notcat_dir)


Since the images seem to take up an inordinate amount of RAM (likely because of the size of the resnet layers) we're going to create a data generator

In [None]:
class DataGenerator(Sequence):
    """Hand an in-memory dataset to Keras one batch at a time."""

    def __init__(self, x_set, y_set, batch_size = 32) -> None:
        """Keep references to the inputs, the targets and the batch size."""
        super().__init__()
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches; the last one may be a partial batch."""
        n_batches = np.ceil(len(self.x) / float(self.batch_size))
        return int(n_batches)

    def __getitem__(self, idx):
        """Return the (inputs, targets) slices that make up batch ``idx``."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[window], self.y[window]

Let's pickle the entire dataset so that we can just reload it from a single file

The original data creation normally takes about 7 minutes, so that will help speed things up

In [None]:
# file names the two image arrays are stored under, inside base_dir
save_fn = 'cat.pkl'
save_notcat_fn = 'not_cat.pkl'

# one pickle per array
for pkl_name, pkl_array in ((save_fn, cats), (save_notcat_fn, not_cat)):
    with open(path.join(base_dir, pkl_name), 'wb') as fid:
        pickle.dump(pkl_array, fid)



If we save it, we need to be able to open it too.

In [None]:
def _load_pickle_via_dialog(title):
    """Ask the user for a .pkl file and return the object stored in it.

    Args:
        title: text shown in the dialog's title bar, so the user knows which
            of the two files is being requested.

    Raises:
        FileNotFoundError: if the dialog is cancelled without a selection.
    """
    dialog_root = Tk()
    try:
        chosen_fn = fd.askopenfilename(
            master=dialog_root,
            title=title,
            defaultextension='.pkl',
            filetypes=[('Pickle files', '*.pkl'), ('All files', '*.*')],
        )
    finally:
        # never leave the helper window behind, even if the dialog fails
        dialog_root.destroy()

    if not chosen_fn:
        raise FileNotFoundError('No pickle file was selected')

    # NOTE: unpickling can execute arbitrary code -- only open files you created yourself
    with open(chosen_fn, 'rb') as fid:
        return pickle.load(fid)


# cats
cats = _load_pickle_via_dialog('Select the cat pickle file')

# not cats
not_cat = _load_pickle_via_dialog('Select the not-cat pickle file')

### instantiate the model and train

First, we start by telling images with cats apart from images without cats

In [None]:
# every cat image followed by every not-cat image
images = np.concatenate([cats[..., :3], not_cat])

# matching labels: 1 for each cat image, 0 for each not-cat image
cat_notcat = np.concatenate([np.ones(len(cats)), np.zeros(len(not_cat))])

# hold out 20% of the images for testing
train_im, test_im, train_lab, test_lab = train_test_split(
    images, cat_notcat, train_size=0.8
)

train_gen = DataGenerator(train_im, train_lab, 32)
test_gen = DataGenerator(test_im, test_lab, 32)

# cat_tagger_basic() can be swapped in here to try the small from-scratch network
mdl = cat_tagger()
mdl.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

Now that the model has been initialized and compiled, we need to fit it. I am using 10 epochs, though that can obviously get changed

In [None]:
# train on the batches from train_gen; the returned history keeps the per-epoch metrics
tagger_history = mdl.fit(
    train_gen,
    epochs=10,
    verbose=True,
)

To run the model on the raspberry pi, we need to convert the model into tflite

In [None]:
mdl_fn = 'cat_tagger_model'
saved_model_dir = path.join(base_dir, mdl_fn)

# save the trained model inside base_dir
mdl.save(saved_model_dir)

# convert to tflite
tflite_converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
cat_tagger_tflite = tflite_converter.convert()

# the .tflite path is relative, so it lands in the current working directory
with open(mdl_fn + '.tflite', 'wb') as fid:
    fid.write(cat_tagger_tflite)


And we need to be able to load the model

In [None]:
# ask for the directory that holds the saved model
root = Tk()
try:
    mdl_fn = fd.askdirectory(master=root)
finally:
    # never leave the helper window behind, even if the dialog fails
    root.destroy()

# a cancelled dialog returns an empty string; fail with a clear message
if not mdl_fn:
    raise FileNotFoundError('No model directory was selected')

mdl = tf.keras.models.load_model(mdl_fn)

Let's predict a random subset of the testing data, and see how it looks

In [None]:
# use a local generator so this cell also works when the data came from the pickles
sample_rng = np.random.default_rng()
image_numbers = sample_rng.choice(test_im.shape[0], 5)

switch = ['Not a cat!','Cat!']

# one predict call for all samples; column 0 is the sigmoid "cat" probability
sample_probs = mdl.predict(test_im[image_numbers, :, :, :3], verbose=0)[:, 0]

for im_num, cat_prob in zip(image_numbers, sample_probs):
    # threshold at 0.5 -- int() on the raw probability would truncate to 0
    print(switch[int(cat_prob > 0.5)])
    display(image.array_to_img(test_im[im_num,:,:,:3]))

In [None]:
# accuracy_score needs discrete labels, so threshold the sigmoid output of
# every test image at 0.5 before comparing it with the true labels
test_pred = (mdl.predict(test_im, verbose=0).ravel() > 0.5).astype(int)

# NOTE: despite the name, this is the accuracy on the held-out test split
tagger_trainAcc = metrics.accuracy_score(test_lab.astype(int), test_pred)

Now, to summarize the testing and training results:

First, let's take a look at the training accuracy over the course of training the model

### Training the bounding boxes

This is a separate model setup. Instead of training on a single output (is the image a cat) we are going to train on an output "mask" that is a 1 inside of the bounding box and a 0 outside of it.

This is a much smaller dataset than the previous one, since only a subset of the images have been given bounding boxes.

In [None]:
# basedir -- cat_pics, cat_bbox, and notcat_pics should all be inside
root = Tk()
base_dir = fd.askdirectory(master=root)
root.destroy()

cat_dir, bbox_dir, notcat_dir = (
    path.join(base_dir, sub_dir) for sub_dir in ('cat_pics', 'cat_bbox', 'notcat_pics')
)

# the annotation files drive the loop: one image and one mask per XML file
filelist = [file for file in listdir(bbox_dir) if '.xml' in file]

n_files = len(filelist)
images = np.empty((n_files, 224, 224, 3))
masks = np.empty((n_files, 224*224))  # one flattened mask per row

for row, xml_name in enumerate(filelist):
    # the picture shares the annotation's base name, with a .JPEG extension
    stem = path.splitext(xml_name)[0]
    temp_img = image.load_img(path.join(cat_dir, stem) + '.JPEG', target_size=(224, 224))
    images[row] = image.img_to_array(temp_img)

    temp_mask = parse_bbox(path.join(bbox_dir, xml_name))
    masks[row] = temp_mask.ravel()


In [None]:
# now for the image segmentation portions: hold out 20% of the images and masks
train_im, test_im, train_masks, test_masks = train_test_split(
    images, masks, train_size=0.8
)
train_gen = DataGenerator(train_im, train_masks, 64)

mdl = cat_boxer()
mdl.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

# fit the model
boxer_history = mdl.fit(
    train_gen,
    epochs=30,
    verbose=True,
)

In [None]:
im_rng = np.random.default_rng()

image_numbers = im_rng.choice(len(test_im), 5)

for im_num in image_numbers:
    temp_img = image.array_to_img(test_im[im_num,:,:,:])
    temp_draw = PIL_Draw.Draw(temp_img)

    # recover the labelled box from this image's own mask, so the box always
    # belongs to the (shuffled) test image being shown
    temp_mask = test_masks[im_num].reshape(224, 224)
    box_rows = np.flatnonzero(temp_mask.any(axis=1))  # y extent
    box_cols = np.flatnonzero(temp_mask.any(axis=0))  # x extent

    # an all-zero mask has no box to draw
    if box_rows.size and box_cols.size:
        top_left = (int(box_cols[0]), int(box_rows[0]))
        bottom_right = (int(box_cols[-1]), int(box_rows[-1]))
        temp_draw.rectangle([top_left, bottom_right], outline='black')

    # show the image that was drawn on, not the drawing helper
    display(temp_img)

Save the model as a tflite model

In [None]:
save_fn = 'cat_boxer_model'
boxer_model_dir = path.join(base_dir, save_fn)

mdl.save(boxer_model_dir)

# convert to tflite -- from the directory that was just saved, so the boxer
# (and not a previously saved model) is what gets exported
cat_boxer_tflite = tf.lite.TFLiteConverter.from_saved_model(boxer_model_dir).convert()

# the .tflite path is relative, so it lands in the current working directory
with open(save_fn + '.tflite', 'wb') as fid:
    fid.write(cat_boxer_tflite)

### Summarizing the training and the results

Starting with the cat tagger model, we'll look at the training metrics over epochs, and the testing accuracy for the testing dataset

In [None]:
fig, ax = plt.subplots()

# take the epoch axis from the history itself so the plot still works when
# the number of training epochs is changed
tagger_acc = tagger_history.history['binary_accuracy']
ax.plot(range(1, len(tagger_acc) + 1), tagger_acc)

ax.set_title('Training Accuracy')
ax.set_xlabel('Training Epoch')
ax.set_ylabel('Accuracy Percentage')

# hide the frame around the plot
for spine in ax.spines.values():
    spine.set_visible(False)

ax.set_ylim([0.85,1.01])


## Scratch space

To delete tensors that are taking up too much memory etc

In [None]:
# drop the generators first, then the split arrays they were built from
del train_gen, test_gen
del test_im, test_lab, train_im, train_lab