In [9]:

"""
Important Stuff
"""

# Index of the webcam to open (0 is the right choice on most laptops)
DEVICE = 0

# Weights file for the classification neural network
WEIGHTS = 'cellphone.h5'

"""
User Interface Stuff (You can ignore this)
"""
import IPython
import ipywidgets as widgets
from IPython.display import display

# Handle on the running IPython shell; the capture loop uses its kernel to
# process widget events while it is spinning.
ipython = IPython.get_ipython()

# Shared widget style: let each description take the width it needs
style = dict(description_width='initial')

# Live preview of the (optionally salience-overlaid) camera frame, sent as PNG bytes
w_image = widgets.Image(
    format='png',
    width=224,
    height=224,
    layout=widgets.Layout(width='100%'),
)

# Toggle for drawing the salience overlay on top of the preview
w_salience = widgets.Checkbox(
    description='Overlay salience map on image',
    value=True,
    style=style,
)

# Slider in [0, 1] that the capture loop updates with the predicted probability
w_conf = widgets.FloatSlider(
    value=0.99,
    min=0,
    max=1,
    step=0.01,
    description='Cell Phone Probability',
    style=style,
    layout=widgets.Layout(width='100%'),
)

# Selector for the resolution at which the salience map is computed,
# from the full 224x224 input down to 7x7; starts on '28x28'.
w_drop = widgets.Dropdown(
    options=['{0}x{0}'.format(side) for side in (224, 112, 56, 28, 14, 7)],
    value='28x28',
    style=style,
)

# Stack everything vertically and render it under this cell
vbox = widgets.VBox([w_image, w_conf, w_salience, w_drop])
display(vbox)

VBox(children=(Image(value=b'', height='224', layout="Layout(width='100%')", width='224'), FloatSlider(value=0…

In [2]:
# Load the model
from keras.models import Model
from keras.layers import *
from keras.applications import MobileNet
import keras.backend as K
import tensorflow as tf
import numpy as np

# This is the image "feature extractor" which learns features to decide what an image contains
# More about MobileNet here: https://arxiv.org/pdf/1704.04861.pdf
# weights=None builds the architecture only; the trained weights are loaded from WEIGHTS below.
cnn = MobileNet(include_top=False,
                input_shape=(224, 224, 3),
                weights=None,
                alpha=0.25)

# Take the feature map output by the network (7 x 7 image with 256 channels)
# and reduce it to 1 x 1 x 256 with averaging
gap = GlobalAveragePooling2D(name='gap')(cnn.output)

# This is our prediction layer, which outputs a probability between 0 and 1
# of whether an object is a cell phone or not.
cls = Dense(1, activation='sigmoid', name='cls')(gap)

# Second name for the output tensor; this is the name the inference loop fetches.
cls_out = cls

# Create the model
model = Model(inputs=cnn.input, outputs=cls)

# Initialize every variable first, then load the trained weights on top.
# tf.global_variables_initializer() replaces the deprecated
# tf.initialize_all_variables().
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# by_name=True matches weights to layers by layer name
model.load_weights(WEIGHTS, by_name=True)
model.summary()

# We want to calculate the rate of change of the class variable with respect
# to the input image / intermediate layers. The scalar being differentiated is
# the same for every resolution, so build it once and reuse it.
salience_loss = -K.mean(cls)
salience_224 = K.gradients(salience_loss, model.input)[0]
salience_112 = K.gradients(salience_loss, model.get_layer('conv_pw_1_relu').output)[0]
salience_56 = K.gradients(salience_loss, model.get_layer('conv_pw_3_relu').output)[0]
salience_28 = K.gradients(salience_loss, model.get_layer('conv_pw_5_relu').output)[0]
salience_14 = K.gradients(salience_loss, model.get_layer('conv_pw_11_relu').output)[0]
salience_7 = K.gradients(salience_loss, model.get_layer('conv_pw_13_bn').output)[0]

Using TensorFlow backend.
W0913 10:53:20.220420 4498757056 deprecation_wrapper.py:119] From /Users/carroll/anaconda3/envs/keras/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0913 10:53:20.244633 4498757056 deprecation_wrapper.py:119] From /Users/carroll/anaconda3/envs/keras/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0913 10:53:20.249761 4498757056 deprecation_wrapper.py:119] From /Users/carroll/anaconda3/envs/keras/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0913 10:53:20.267866 4498757056 deprecation_wrapper.py:119] From /Users/carroll/anaconda3/envs/keras/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_ses

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 225, 225, 3)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 112, 112, 8)       216       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 112, 112, 8)       32        
_________________________________________________________________
conv1_relu (ReLU)            (None, 112, 112, 8)       0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 112, 112, 8)       72        
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 112, 112, 8)       32        
__________

In [17]:
import cv2  
cap = cv2.VideoCapture(DEVICE)

# Dropdown label -> gradient tensor built in the model cell. A single lookup
# replaces the if/elif chain and keeps the labels and tensors side by side.
salience_by_resolution = {
    '224x224': salience_224,
    '112x112': salience_112,
    '56x56': salience_56,
    '28x28': salience_28,
    '14x14': salience_14,
    '7x7': salience_7,
}

try:
    # Fail with a readable message instead of crashing later inside cv2.resize
    if not cap.isOpened():
        raise RuntimeError('Could not open webcam (DEVICE=%r)' % (DEVICE,))

    while True:
        # Have to call this to get update values from sliders / dropdowns
        # NOTE(review): newer ipykernel releases may expose this as a coroutine;
        # confirm against the installed version.
        ipython.kernel.do_one_iteration()

        # Read the frame from the camera
        ret, frame = cap.read()
        if not ret or frame is None:
            # No frame was delivered this time; skip it rather than passing
            # None to cv2.resize. Interrupt the kernel to stop if this persists.
            continue

        img_original = frame
        img_resize = cv2.resize(img_original, (224, 224))
        img_draw = img_resize.copy()

        # The network expects the image to be scaled between -1 and 1,
        # but most images are scaled between 0 and 255 normally.
        # We divide by 127.5 to scale between 0 and 2, and subtract one to
        # be between -1 and 1.
        # NOTE(review): OpenCV delivers frames in BGR channel order; confirm
        # the weights were trained on the same ordering.
        img_input = (img_resize / 127.5) - 1

        # The neural network expects a "batch" of images as an input.
        # This converts our single image with a shape of (224, 224, 3) to
        # (1, 224, 224, 3). The 1 at the beginning is called the batch dimension.
        batch = np.expand_dims(img_input, axis=0)

        # Do the actual inference. The result goes into its own name so the
        # global `cls` output tensor from the model cell is not overwritten.
        prob = sess.run(cls_out, feed_dict={model.input: batch})

        # This is the probability that the image contains a cell phone
        w_conf.value = float(prob[0][0])

        # Should we visualize the important parts of the image for predicting the output class?
        if w_salience.value:
            salience = sess.run(salience_by_resolution[w_drop.value],
                                feed_dict={model.input: batch})

            # Collapse the channel axis to one magnitude per spatial location,
            # then upsample to the display size.
            sal = np.linalg.norm(salience[0], axis=-1)
            sal = np.expand_dims(cv2.resize(sal, (224, 224)), axis=-1)

            # Normalize to [0, 1]. An all-zero (or non-finite) gradient would
            # otherwise divide by zero and feed NaNs into the uint8 cast.
            peak = np.max(sal)
            if np.isfinite(peak) and peak > 0:
                sal = sal / peak
            else:
                sal = np.zeros_like(sal)
            sal = (255 * sal).astype(np.uint8)
            # Per-pixel blend weight: strong salience shows more of the colormap
            sal_alpha = sal / 255

            sal = cv2.applyColorMap(sal, cv2.COLORMAP_VIRIDIS)

            cls_add = img_draw.astype(np.float32) + sal * sal_alpha.astype(np.float32)
            # Rescale back into 0..255; leave an all-black result untouched
            # instead of dividing by zero.
            brightest = np.max(cls_add)
            if brightest > 0:
                cls_add = 255 * (cls_add / brightest)
            img_draw = cls_add.astype(np.uint8)

        # Push the frame to the image widget as PNG bytes
        result, img_png = cv2.imencode('.png', img_draw)
        w_image.value = img_png.tobytes()

except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the loop
    pass
finally:
    # Always give the camera back, even on an error
    cap.release()


