Formula for computing convolution layer output is:

* $W_2 = \frac{W_1 - F + 2P}{S} + 1$
* $H_2 = \frac{H_1 - F + 2P}{S} + 1$
* $D_2=K$

For:
* input of size $W_1 \times H_1 \times D_1$
* $K$ number of filters
* Stride of $S$
* Filter size of $F$
* $P$ amount of zero padding

In [1]:
import tensorflow as tf

In [2]:
print(tf.__version__)

2.0.0


**C3**

$C3: W_1 = 69, H_1 = 69, D_1 = 512$

Output of C3 after $1x1$ lateral conv ($F = 1, K = 256, S=1, P=0$): ($W_2 = 69, H_2 = 69, D_1 = 256$)

* $W_2 = \frac{69-1}{1} + 1 = 69$
* $H_2 = \frac{69-1}{1} + 1 = 69$
* $D_2 = 256$

Output of C3 after bilinear upsampling by a factor of 2: ($W_2 = 138, H_2 = 138, D_1 = 256$)

**C4**

$C4: W_1 = 35, H_1 = 35, D_1 = 1024$



In [30]:
class FeaturePyramidTopDownPath(tf.keras.layers.Layer):
    """
    FPN paper reference: https://arxiv.org/pdf/1612.03144.pdf
    Suggested upsampling by a factor of 2. Did not mention details
    about cropping the upsampled feature maps to enable elementwise
    adding. Perhaps in original FPN, upsampled feature maps are already
    of correct size.
    
    """
    def __init__(self, num_fpn_output_filters=256):
        super(FeaturePyramidTopDownPath, self).__init__()
        
        # use 1x1 conv to reduce num of channels of c5
        # need padding=same?
        self.c5_1by1_conv_layer = tf.keras.layers.Conv2D(filters=num_fpn_output_filters, kernel_size=(1,1))
        # use 1x1 conv to reduce num of channels of c4
        self.c4_1by1_conv_layer = tf.keras.layers.Conv2D(filters=num_fpn_output_filters, kernel_size=(1,1))
        # apply 1x1 conv to reduce num of channels of c3
        self.c3_1by1_conv_layer = tf.keras.layers.Conv2D(filters=num_fpn_output_filters, kernel_size=(1,1))
        
        # upsample by a facor of 2
        # no parameters to learn, so one layer sufficient for both c4 and c5
        self.upsample_layer =  tf.keras.layers.UpSampling2D(size=(2, 2), interpolation='bilinear')
        
        # apply 3×3 convolution on each merged map togenerate the final feature map
        self.p5_3by3_conv_layer = tf.keras.layers.Conv2D(filters=num_fpn_output_filters, kernel_size=(3,3))
        self.p4_3by3_conv_layer = tf.keras.layers.Conv2D(filters=num_fpn_output_filters, kernel_size=(3,3))
        self.p3_3by3_conv_layer = tf.keras.layers.Conv2D(filters=num_fpn_output_filters, kernel_size=(3,3))
        
        # paper mention using 3x3 conv and stride of 2 to downsample p5 to get p6
        # likewise for p7
        self.downsample_p5_layer = tf.keras.layers.Conv2D(filters=num_fpn_output_filters, kernel_size=(3,3), strides=(2, 2))
        self.downsample_p6_layer = tf.keras.layers.Conv2D(filters=num_fpn_output_filters, kernel_size=(3,3), strides=(2, 2))
        
    def call(self, c3, c4, c5):
        c5_reduced_dim = self.c5_1by1_conv_layer(c5)
        #print(c5_reduced_dim.shape)
        upsampled_c5_reduced_dim = self.upsample_layer(c5_reduced_dim)
        
        c4_reduced_dim = self.c4_1by1_conv_layer(c4)
        upsampled_c4_reduced_dim = self.upsample_layer(c4_reduced_dim)
        
        c3_reduced_dim = self.c3_1by1_conv_layer(c3)
        
        print(c3_reduced_dim.shape)
        assert c3_reduced_dim.shape == (None, 69, 69, 256)
        
        p5 = self.p5_3by3_conv_layer(c5_reduced_dim)
        
        c4_add_c5 = self.crop_and_add(upsampled_c5_reduced_dim, c4_reduced_dim)
        p4 = self.p4_3by3_conv_layer(c4_add_c5)
        
        c3_add_c4 = self.crop_and_add(upsampled_c4_reduced_dim, c3_reduced_dim)
        p3 = self.p3_3by3_conv_layer(c3_add_c4)
        
        p6 = self.downsample_p5_layer(p5)
        p7 = self.downsample_p6_layer(p6)
        
        return [p3, p4, p5, p6, p7]
        
    def crop_and_add(self, x1, x2):
        """
        Follow TF implementation, where for example p5 is upsampled by twice and then a middle portion of it 
        that matches p4 is extracted and added to p4.
        
        This is how UNET does it
        https://tf-unet.readthedocs.io/en/latest/_modules/tf_unet/layers.html
        
        Will it be better to just resize p5 to the same size as p4? Less loss of information? But have to handle
        upsampling by float numbers?
        """
    
        x1_shape = x1.shape
        x2_shape = x2.shape
        #x1_shape = tf.shape(x1)
        #x2_shape = tf.shape(x2)
        
        # offsets for the top left corner of the crop
        # find the indices where the crop of x1 should be taken
        # with starting indices from the top left corner
        offsets = [0, (x1_shape[1] - x2_shape[1]) // 2, (x1_shape[2] - x2_shape[2]) // 2, 0]
        # the final size that x1 should be cropped into
        x1_cropped_size = [-1, x2_shape[1], x2_shape[2], -1]
        x1_crop = tf.slice(x1, offsets, x1_cropped_size)
        return tf.add(x1_crop, x2)

In [31]:
class Yolact(tf.keras.Model):

    def __init__(self, num_fpn_output_filters, input_shape=(550, 550, 3)):
        super(Yolact, self).__init__()

        resnet_50_backbone = tf.keras.applications.ResNet50(
                                input_shape=input_shape,
                                include_top=False,
                                layers=tf.keras.layers,
                                weights='imagenet'
                            )

        c3 = resnet_50_backbone.layers[80].output
        c4 = resnet_50_backbone.layers[142].output
        c5 = resnet_50_backbone.layers[174].output
        
        print(f"c3 shape: {c3.shape}")
        print(f"c4 shape: {c4.shape}")
        print(f"c5 shape: {c5.shape}")

        self.yolact_resnet_50_backbone = tf.keras.Model(
                                        inputs=resnet_50_backbone.input,
                                        outputs=[c3, c4, c5]
                                        )
        self.fpn_top_down = FeaturePyramidTopDownPath(num_fpn_output_filters)
        

    def call(self, inputs):
        c3, c4, c5 = self.yolact_resnet_50_backbone(inputs)
        print()
        fpn_output = self.fpn_top_down(c3, c4, c5)
        
        return fpn_output

In [32]:
yolact = Yolact(input_shape=(550, 550, 3), num_fpn_output_filters=256)

c3 shape: (None, 69, 69, 512)
c4 shape: (None, 35, 35, 1024)
c5 shape: (None, 18, 18, 2048)


In [33]:
yolact.build(input_shape=(None, 550, 550, 3))
yolact.summary()


(None, 69, 69, 256)


AssertionError: in converted code:

    <ipython-input-30-fc832d4c066c>:46 call  *
        assert c3_reduced_dim.shape == (None, 69, 69, 256)

    AssertionError: 


## Deprecated Code

In [3]:
resnet_50_backbone = tf.keras.applications.ResNet50(
                                input_shape=(550, 550, 3),
                                include_top=False,
                                weights='imagenet'
                            )

A local file was found, but it seems to be incomplete or outdated because the auto file hash does not match the original value of 4d473c1dd8becc155b73f8504c6f6626 so we will re-download the data.
Downloading data from https://github.com/keras-team/keras-applications/releases/download/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [6]:
resnet_50_backbone.summary()

Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 550, 550, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 556, 556, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 275, 275, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
conv1_bn (BatchNormalization)   (None, 275, 275, 64) 256         conv1_conv[0][0]                 
___________________________________________________________________________________________

In [7]:
c3 = resnet_50_backbone.layers[80].output
c4 = resnet_50_backbone.layers[142].output
c5 = resnet_50_backbone.layers[174].output

yolact_resnet_50_backbone = tf.keras.Model(
                                inputs=resnet_50_backbone.input,
                                outputs=[c3, c4, c5]
                            )

In [38]:
resnet_output_layers = [80, 142, 174]

conv3_block4_out = resnet_50_backbone.layers[80]
conv4_block6_out = resnet_50_backbone.layers[142]
conv4_block6_out = resnet_50_backbone.layers[174]

yolact_resnet_50_backbone = tf.keras.Model(inputs=resnet_50_backbone.input,
                                           outputs=[resnet_50_backbone.layers[layer_num].output for layer_num in resnet_output_layers])

In [33]:
resnet_50_backbone.layers[174].name

'activation_195'

In [26]:
def get_backbone():
    """Builds ResNet50 with pre-trained imagenet weights"""
    backbone = tf.keras.applications.ResNet50(
        include_top=False, input_shape=[None, None, 3]
    )
    c3_output, c4_output, c5_output = [
        backbone.get_layer(layer_name).output
        for layer_name in ["conv3_block4_out", "conv4_block6_out", "conv5_block3_out"]
    ]
    return keras.Model(
        inputs=[backbone.inputs], outputs=[c3_output, c4_output, c5_output]
    )

In [29]:
resnet50_backbone = get_backbone()

ValueError: No such layer: conv3_block4_out