In [None]:
import torch
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using {device} device")

Using cpu device


In [None]:
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, models
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt

MobileNet v1 is implemented below.
Reference links: 
1. https://towardsdatascience.com/review-mobilenetv1-depthwise-separable-convolution-light-weight-model-a382df364b69

2. https://arxiv.org/pdf/1704.04861v1.pdf

For MobileNet v2, a pretrained model is available directly from torchvision:

```
mobilenet = models.mobilenet_v2(pretrained=True)
```
![img.png](https://drive.google.com/uc?export=view&id=1elEnrh_MAWIk1s5eOF7zffn-Fb0fWl45)

**Note:** there is a mistake in the image above: the stride of the last depthwise_conv layer should be 1, not 2.

In [None]:
def standard_conv(input_channels , output_channels, stride):
    """Build a 3x3 convolution -> batch norm -> ReLU block.

    Args:
        input_channels: number of channels of the incoming feature map.
        output_channels: number of channels produced by the convolution.
        stride: stride of the 3x3 convolution.

    Returns:
        nn.Sequential of Conv2d (padding 1, no bias), BatchNorm2d and an
        in-place ReLU, in that order.
    """
    conv = nn.Conv2d(
        input_channels,
        output_channels,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
    norm = nn.BatchNorm2d(output_channels)
    relu = nn.ReLU(inplace=True)
    return nn.Sequential(conv, norm, relu)

In [None]:
def depthwise_conv(input_channels , output_channels, stride):
    """Build a depthwise-separable convolution block.

    The block is a 3x3 depthwise convolution followed by a 1x1 pointwise
    convolution; each convolution is followed by BatchNorm2d and an in-place
    ReLU.

    Args:
        input_channels: number of channels of the incoming feature map.
        output_channels: number of channels produced by the pointwise step.
        stride: stride of the depthwise 3x3 convolution.

    Returns:
        nn.Sequential with six layers (conv, bn, relu, conv, bn, relu).
    """
    # groups == in_channels gives every input channel its own 3x3 filter,
    # which is what makes this convolution "depthwise".
    depthwise_stage = [
        nn.Conv2d(
            input_channels,
            input_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=input_channels,
            bias=False,
        ),
        nn.BatchNorm2d(input_channels),
        nn.ReLU(inplace=True),
    ]

    # The 1x1 convolution mixes channels and sets the output channel count.
    pointwise_stage = [
        nn.Conv2d(
            input_channels,
            output_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        ),
        nn.BatchNorm2d(output_channels),
        nn.ReLU(inplace=True),
    ]

    return nn.Sequential(*depthwise_stage, *pointwise_stage)

In [None]:
class MobileNet(nn.Module):
    """MobileNet v1 classifier built from `standard_conv` and `depthwise_conv` blocks.

    Layout: one standard 3x3 convolution, thirteen depthwise-separable blocks,
    a 7x7 average pool, a 1024 -> 1000 linear layer and a softmax over the
    class dimension.

    The attribute names `conv_layers` and `fully_connected` appear in the
    state-dict keys, so they must not be renamed if existing checkpoints are
    to keep loading.
    """

    def __init__(self):
        super(MobileNet, self).__init__()

        # (input_channels, output_channels, stride) of each depthwise block,
        # in network order.
        depthwise_cfg = [
            (32, 64, 1),
            (64, 128, 2),
            (128, 128, 1),
            (128, 256, 2),
            (256, 256, 1),
            (256, 512, 2),
            (512, 512, 1),
            (512, 512, 1),
            (512, 512, 1),
            (512, 512, 1),
            (512, 512, 1),
            (512, 1024, 2),
            (1024, 1024, 1),
        ]
        self.conv_layers = nn.Sequential(
            standard_conv(input_channels = 3, output_channels = 32, stride = 2),
            *[
                depthwise_conv(input_channels = c_in, output_channels = c_out, stride = s)
                for c_in, c_out, s in depthwise_cfg
            ],
            nn.AvgPool2d(kernel_size=7)
        )
        self.fully_connected = nn.Linear(in_features=1024, out_features=1000)
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, x):
        """Return class probabilities for a batch of images.

        Args:
            x: image batch; with 224x224 inputs (e.g. torch.randn(1, 3, 224, 224))
                the pooled feature map is 1x1 per channel.

        Returns:
            Tensor of shape (batch, 1000) holding softmax probabilities.

        To inspect intermediate sizes, loop over `self.conv_layers`, apply each
        layer in turn and print `x.size()`.
        """
        x = self.conv_layers(x)
        # Flatten everything except the batch axis. The previous
        # reshape(-1, 1024) silently moved spatial positions into the batch
        # axis whenever the pooled map was larger than 1x1; with flatten such
        # inputs now fail loudly in the linear layer instead.
        x = torch.flatten(x, start_dim=1)
        x = self.fully_connected(x)
        x = self.softmax(x)

        return x

In [None]:
# Returns the MobileNet v1 backbone initialized from weights pretrained on ImageNet
# (original note: lr = 0.1 bs = 256).
def load_pretrained_mobilenet(model_filename='drive/MyDrive/Saidl Assignment/model_best.pth.tar'):
    """Load pretrained MobileNet weights and return the frozen convolutional backbone.

    Args:
        model_filename: path of the checkpoint file. The checkpoint must
            contain a 'state_dict' entry whose keys use the 'module.model.'
            and 'module.fc.' prefixes.

    Returns:
        nn.Sequential holding the first 14 blocks of `MobileNet.conv_layers`
        (the average pool and classifier are dropped), with every parameter
        set to requires_grad = False.

    Raises:
        Whatever `torch.load` raises for an unreadable file, KeyError if the
        checkpoint has no 'state_dict', and RuntimeError from
        `load_state_dict` when the keys do not match the model.
    """
    model = MobileNet().to(device)

    # NOTE: torch.load unpickles the file, which can execute arbitrary code.
    # Only load checkpoints from a trusted source.
    checkpoint = torch.load(model_filename, map_location = device)

    # Translate checkpoint key names to this module's attribute names.
    key_renames = (
        ('module.model.', 'conv_layers.'),
        ('module.fc.', 'fully_connected.'),
    )
    # Build a fresh dict rather than editing the checkpoint in place. The old
    # loop tested for 'model.' but replaced 'module.model.', so a key that
    # matched only the test was re-assigned to itself and then deleted.
    state_dict = {}
    for key, value in checkpoint['state_dict'].items():
        for old, new in key_renames:
            if old in key:
                key = key.replace(old, new)
        state_dict[key] = value

    model.load_state_dict(state_dict)

    # Freeze mobilenet
    # NOTE(review): requires_grad = False stops weight updates, but BatchNorm
    # layers still update their running statistics while the module is in
    # training mode. Call .eval() on the backbone if they must stay fixed.
    for p in model.parameters():
        p.requires_grad = False

    # remove the average pooling layer
    # now the output after the last layer will be of shape 7 x 7 x 1024
    model = nn.Sequential(*(model.conv_layers[i] for i in range(14)))

    return model

In [None]:
# Doubles the spatial dimensions and changes the number of channels using a
# transposed convolution.
def upsample_layer(input_channels, output_channels):
    """Build a depthwise transposed-conv + pointwise-conv upsampling block.

    The depthwise ConvTranspose2d (kernel 5, stride 2, padding 2,
    output_padding 1) doubles height and width while keeping the channel
    count. The following 1x1 convolution maps `input_channels` to
    `output_channels`. Both convolutions are bias-free and each is followed
    by BatchNorm2d and an in-place ReLU.

    Args:
        input_channels: channels of the incoming feature map.
        output_channels: channels of the returned feature map.

    Returns:
        nn.Sequential with six layers (transconv, bn, relu, conv, bn, relu).
    """
    transconv_kwargs = dict(kernel_size=5, stride=2, padding=2, output_padding=1)

    # Approach 1 (in use): depthwise transposed conv grows the spatial size,
    # then a pointwise conv changes the channel count.
    # Approach 2 (not used): a single ungrouped ConvTranspose2d from
    # input_channels to output_channels followed by BatchNorm2d + ReLU does
    # both in one step, but has more parameters.
    depthwise_up = nn.ConvTranspose2d(
        input_channels,
        input_channels,
        groups=input_channels,  # one filter per channel -> depthwise
        bias=False,
        **transconv_kwargs,
    )
    channel_mix = nn.Conv2d(
        input_channels,
        output_channels,
        kernel_size=1,
        stride=1,
        padding=0,
        bias=False,
    )

    return nn.Sequential(
        depthwise_up,
        nn.BatchNorm2d(input_channels),
        nn.ReLU(inplace=True),
        channel_mix,
        nn.BatchNorm2d(output_channels),
        nn.ReLU(inplace=True),
    )

In [None]:
def get_layer_activations(model, activation):
    """Register forward hooks that record the output of every top-level layer.

    After each forward pass of `model`, `activation['conv_layers<i>']` holds
    the detached output of `model[i]`.

    Args:
        model: an iterable container of modules (e.g. nn.Sequential).
        activation: dict that the hooks write into; it is mutated on every
            forward pass.

    Returns:
        List of hook handles, one per layer, in layer order. Call `.remove()`
        on a handle to detach its hook. Earlier versions returned None, so
        callers that ignore the return value are unaffected.
    """
    def get_activation(name):
        def hook(module, inputs, output):
            # detach() keeps the stored tensor out of the autograd graph.
            activation[name] = output.detach()
        return hook

    # Iterate over the layers that actually exist rather than assuming 14.
    return [
        layer.register_forward_hook(get_activation('conv_layers' + str(i)))
        for i, layer in enumerate(model)
    ]

In [None]:
class FastDepth(nn.Module):
    """Encoder-decoder network with a frozen MobileNet encoder.

    The encoder comes from `load_pretrained_mobilenet`. Forward hooks
    registered by `get_layer_activations` store each encoder block's output
    in `self.activation`. The decoder is five `upsample_layer` blocks
    (1024 -> 512 -> 256 -> 128 -> 64 -> 32 channels) followed by a bias-free
    1x1 convolution down to one channel. The outputs of encoder blocks 5, 3
    and 1 are added to the inputs of the third, fourth and fifth upsample
    blocks.
    """

    def __init__(self):
        super().__init__()

        # Encoder: pretrained backbone plus hooks that capture skip features.
        self.mobilenet = load_pretrained_mobilenet()
        self.activation = {}
        get_layer_activations(self.mobilenet, self.activation)

        # Decoder: each block doubles height/width and halves the channels.
        self.upsample1 = upsample_layer(1024, 512)
        self.upsample2 = upsample_layer(512, 256)
        self.upsample3 = upsample_layer(256, 128)
        self.upsample4 = upsample_layer(128, 64)
        self.upsample5 = upsample_layer(64, 32)

        # Final 1x1 projection to a single output channel.
        self.pointwise = nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0, bias=False)

    def forward(self, x):
        """Run the encoder, then decode with additive skip connections.

        Args:
            x: image batch accepted by the MobileNet encoder (the notebook
                uses 1 x 3 x 224 x 224).

        Returns:
            Single-channel tensor produced by the final 1x1 convolution.
        """
        skips = self.activation  # refreshed by the hooks during the encoder pass

        decoded = self.upsample2(self.upsample1(self.mobilenet(x)))
        decoded = self.upsample3(decoded + skips['conv_layers5'])
        decoded = self.upsample4(decoded + skips['conv_layers3'])
        decoded = self.upsample5(decoded + skips['conv_layers1'])
        return self.pointwise(decoded)

In [None]:
# Build the full network, move it to the selected device and print the
# module tree to check the layer layout.
model = FastDepth()
model = model.to(device)
print(model)

FastDepth(
  (mobilenet): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 128, kernel_size=(1,

In [None]:
# Smoke test: push one random image through the model and print the output shape.
# The tensors are created on `device` so they live where the model does;
# CPU tensors would raise a device-mismatch error when running on CUDA.
x = torch.randn(1, 3, 224, 224, device=device)
target = torch.randn(1, 1, 224, 224, device=device)
output = model(x)
print(output.shape)

In [None]:
# Mean-squared error between the model output and the target.
loss_fn = nn.MSELoss()
# Adam is handed every parameter of the model, including the backbone
# parameters that were set to requires_grad = False.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# One optimization step on the single (x, target) pair defined above.
# Clear gradients left over from any previous step.
optimizer.zero_grad()
# Forward pass.
output = model(x)
# Loss between the model output and the target.
loss = loss_fn(output, target)
# Backpropagate to populate .grad on the trainable parameters.
loss.backward()
# Apply the Adam update.
optimizer.step()

In [None]:
# Test Block: print the output shape of every backbone layer for a 224x224 input.
# The backbone and its input get their own names (`backbone`, `probe_input`)
# so that the FastDepth `model`, `x` and `output` from the earlier cells are
# not overwritten. Re-running the training cell after this one therefore
# still trains the model the optimizer was built for.
activation = {}
backbone = load_pretrained_mobilenet()
get_layer_activations(backbone, activation)

# Create the input on the same device the backbone was loaded onto.
probe_input = torch.randn(1, 3, 224, 224, device=device)
backbone(probe_input)

for i in range(len(backbone)):
    print(activation['conv_layers' + str(i)].shape)

torch.Size([1, 32, 112, 112])
torch.Size([1, 64, 112, 112])
torch.Size([1, 128, 56, 56])
torch.Size([1, 128, 56, 56])
torch.Size([1, 256, 28, 28])
torch.Size([1, 256, 28, 28])
torch.Size([1, 512, 14, 14])
torch.Size([1, 512, 14, 14])
torch.Size([1, 512, 14, 14])
torch.Size([1, 512, 14, 14])
torch.Size([1, 512, 14, 14])
torch.Size([1, 512, 14, 14])
torch.Size([1, 1024, 7, 7])
torch.Size([1, 1024, 7, 7])


In [None]:
# Dataset Load: NYU Depth

%matplotlib inline 
from matplotlib import pyplot as plt
import h5py
import numpy as np

In [None]:
filepath = 'nyu_depth_v2_labeled.mat'
arrays = {}  # not used in the visible cells; kept so the name stays defined

# Open explicitly read-only. Without a mode, h5py versions before 3.0 open
# the file writable and create an empty file if the path does not exist.
f = h5py.File(filepath, 'r')

# `images` and `depths` are lazy h5py datasets, so `f` must stay open until
# they have been copied into memory. A missing key leaves an empty list,
# as before.
images = f['images'] if 'images' in f else []
depths = f['depths'] if 'depths' in f else []

In [None]:
# Copy both datasets out of the HDF5 file into in-memory NumPy arrays.
depths = np.array(depths)
images = np.array(images)
# The arrays no longer depend on the file, so release the handle instead of
# leaving it open for the rest of the kernel session.
f.close()
print(images.shape)
print(depths.shape)

In [None]:
# Append a singleton axis to `depths`, then move the axis at position 3 to
# position 1. Assuming `depths` is 3-D, the new singleton axis ends up right
# after the first axis; confirm against the shape printed above.
d = np.moveaxis(np.expand_dims(depths, axis=-1), source=3, destination=1)
print(d.shape)

In [None]:
# Move the axis at position 1 to position 3 (presumably channels-first to
# channels-last; confirm against the printed shape).
im = np.moveaxis(images, source=1, destination=3)
print(im.shape)

In [None]:
# Display the depth map at index 1 without interpolation smoothing.
fig, ax = plt.subplots()
ax.imshow(depths[1], interpolation='nearest')
plt.show()