## Deep Learning Course

### Homework 2

#### Anastasiia Kasprova

    Link to github: https://github.com/kasprova/DL_UCU/tree/master/tasks/hw2
    Link to colab: https://colab.research.google.com/drive/1dYLb2ZRpyKemuOmglM5rKUoEhXJftWa2

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from __future__ import print_function
import argparse
import numpy as np
import torch.optim as optim
from torchvision import datasets, transforms
import time

#### 1. Custom Functions Implementation (simple_conv_net_func.py)

In [2]:
def diff_mse(x, y):
    """Return the mean squared difference between two tensors as a Python float.

    Both tensors are flattened first, so only their element counts have to
    match, not their shapes.
    """
    flat_x = x.view(1, -1).squeeze()
    flat_y = y.view(1, -1).squeeze()
    residual = flat_x - flat_y
    return (residual ** 2).mean().item()
  

def conv2d_scalar(x_in, conv_weight, conv_bias, device):
    """Loop-based ("scalar") 2-D convolution with stride 1 and no padding.

    Args:
        x_in: input tensor of shape [batch, C_in, H, W].
        conv_weight: kernel tensor of shape [C_out, C_in, K, K].
        conv_bias: bias tensor of shape [C_out].
        device: device the tensors are moved to before computing.

    Returns:
        Tensor of shape [batch, C_out, H - K + 1, W - K + 1]. It takes part in
        autograd whenever one of the inputs requires grad.
    """
    # read dimensions of the input tensor and of the weights
    batch_size, n_channels_in, height_in, width_in = x_in.shape
    n_channels_out, _, kernel_size, _ = conv_weight.shape

    # calculate the dimensions of the output tensor
    height_out = height_in - kernel_size + 1
    width_out = width_in - kernel_size + 1

    # move to device
    x_in = x_in.to(device)
    conv_weight = conv_weight.to(device)
    conv_bias = conv_bias.to(device)

    # Output buffer. It is deliberately NOT flagged with requires_grad: the
    # element-wise writes below are in-place operations, which autograd
    # rejects on a leaf tensor that requires grad. Gradients still flow to
    # the inputs through the values written into the buffer.
    z = torch.zeros([batch_size, n_channels_out, height_out, width_out], device=device)

    # fill z element by element
    for n in range(batch_size):
        for c_out in range(n_channels_out):
            for m in range(height_out):
                for l in range(width_out):
                    # the bias is added once, then every input channel
                    # contributes its own windowed dot product
                    acc = conv_bias[c_out]
                    for c_in in range(n_channels_in):
                        patch = x_in[n, c_in, m:m + kernel_size, l:l + kernel_size]
                        acc = acc + (patch * conv_weight[c_out, c_in]).sum()
                    z[n, c_out, m, l] = acc

    return z

  
def im2col(X, kernel_size, device, stride = 1):
    """Unroll every kernel_size x kernel_size window of an image into a column.

    Args:
        X: tensor of shape [C_in, H, W].
        kernel_size: side length K of the square window.
        device: device X is moved to.
        stride: step between neighbouring windows (default 1).

    Returns:
        Tensor of shape [C_in*K*K, H_out*W_out]. Each column holds one window
        flattened channel-first (the same order conv_weight2rows uses for the
        kernels); columns are ordered row-major over the output positions.
    """
    # read dimensions of the 3-dimensional input tensor
    C_in, H_in, W_in = X.shape

    # number of window positions along each axis
    H_out = (H_in - kernel_size)//stride + 1
    W_out = (W_in - kernel_size)//stride + 1

    # move to device
    X = X.to(device)

    columns = []
    for i in range(H_out):
        for j in range(W_out):
            # window origin advances by `stride`, not by 1
            top, left = i*stride, j*stride
            window = X[:, top:top + kernel_size, left:left + kernel_size]
            columns.append(window.reshape(-1))

    if not columns:
        # input smaller than the kernel: no window fits
        return X.new_zeros((C_in*kernel_size*kernel_size, 0))

    return torch.stack(columns, dim=1) # [C_in*K*K x H_out*W_out]


def conv_weight2rows(conv_weight):
    """Flatten a [C_out, C_in, K, K] kernel tensor into [C_out, C_in*K*K] rows."""
    C_out = conv_weight.shape[0]

    # one row per output channel; all input channels are kept
    return conv_weight.reshape(C_out, -1) # [C_out x C_in*K*K]


def conv2d_vector(x_in, conv_weight, conv_bias, device):
    """Vectorised 2-D convolution (stride 1, no padding) via im2col + matmul.

    Args:
        x_in: input tensor of shape [batch, C_in, H, W].
        conv_weight: kernel tensor of shape [C_out, C_in, K, K].
        conv_bias: bias tensor of shape [C_out].
        device: device the tensors are moved to before computing.

    Returns:
        Tensor of shape [batch, C_out, H - K + 1, W - K + 1]. It takes part in
        autograd whenever one of the inputs requires grad.
    """
    # read dimensions of the input tensor and of the weights
    batch_size, _, H_in, W_in = x_in.shape
    C_out, _, kernel_size, _ = conv_weight.shape

    # calculate the dimensions of the output tensor
    H_out = H_in - kernel_size + 1
    W_out = W_in - kernel_size + 1

    # move to device
    x_in = x_in.to(device)
    conv_weight = conv_weight.to(device)
    conv_bias = conv_bias.to(device)

    # Output buffer; not flagged with requires_grad because the per-sample
    # writes below are in-place operations, which autograd rejects on a leaf
    # tensor that requires grad.
    z = torch.zeros([batch_size, C_out, H_out, W_out], device=device)

    # kernels as rows and bias as a column, computed once for the whole batch
    conv_weight_rows = conv_weight2rows(conv_weight)
    bias_column = conv_bias.reshape(-1, 1)

    for n in range(batch_size):
        # W_rows @ X_cols + b has shape [C_out x H_out*W_out]
        x_cols = im2col(x_in[n], kernel_size, device, stride=1)
        z[n] = (conv_weight_rows.matmul(x_cols) + bias_column).view(C_out, H_out, W_out)

    return z

  
def pool2d_scalar(a, device, stride = 2):
    """Loop-based ("scalar") 2x2 max pooling.

    Args:
        a: input tensor of shape [batch, C, H, W].
        device: device the input is moved to.
        stride: step between neighbouring pooling windows (default 2).

    Returns:
        Tensor of shape [batch, C, (H - 2)//stride + 1, (W - 2)//stride + 1].
    """
    # read dimensions of the input tensor
    batch_size, n_channels_in, height_in, width_in = a.shape
    pooling_size = 2

    # calculate the dimensions of the output tensor
    height_out = (height_in - pooling_size)//stride + 1
    width_out = (width_in - pooling_size)//stride + 1
    n_channels_out = n_channels_in

    # move to device
    a = a.to(device)

    # Output buffer; not flagged with requires_grad because the element-wise
    # writes below are in-place operations, which autograd rejects on a leaf
    # tensor that requires grad.
    z = torch.zeros([batch_size, n_channels_out, height_out, width_out], device=device)

    # fill z element by element
    for n in range(batch_size):
        for c_out in range(n_channels_out):
            for i in range(height_out):
                for j in range(width_out):
                    # the window origin follows `stride`, so the windows
                    # match the output size computed above
                    top, left = i*stride, j*stride
                    z[n, c_out, i, j] = a[n, c_out, top:top + pooling_size, left:left + pooling_size].max()

    return z
   
def pool2d_vector(a, device, stride = 2):
    """Vectorised 2x2 max pooling, applied to every channel independently.

    Args:
        a: input tensor of shape [batch, C, H, W].
        device: device the input is moved to.
        stride: step between neighbouring pooling windows (default 2).

    Returns:
        Tensor of shape [batch, C, (H - 2)//stride + 1, (W - 2)//stride + 1].
    """
    pooling_size = 2

    # move to device
    a = a.to(device)
    batch_size, n_channels = a.shape[0], a.shape[1]

    # Slide the window along the height and then along the width axis; each
    # unfold appends one window axis, giving
    # [batch, C, H_out, W_out, pooling_size, pooling_size].
    windows = a.unfold(2, pooling_size, stride).unfold(3, pooling_size, stride)
    height_out, width_out = windows.shape[2], windows.shape[3]

    # flatten each window and keep its maximum
    flat_windows = windows.reshape(batch_size, n_channels, height_out, width_out,
                                   pooling_size*pooling_size)
    return flat_windows.max(dim=-1).values
  

def relu_scalar(a, device):
    """Loop-based ("scalar") ReLU for a 2-D tensor.

    Args:
        a: input matrix of shape [batch, n_inputs].
        device: device the input is moved to.

    Returns:
        Tensor of the same shape with every negative entry replaced by 0.
    """
    # read dimensions of the input matrix
    batch_size, n_inputs = a.shape

    # move to device
    a = a.to(device)

    # Output buffer; not flagged with requires_grad because the element-wise
    # writes below are in-place operations, which autograd rejects on a leaf
    # tensor that requires grad.
    z = torch.zeros([batch_size, n_inputs], device=device)

    for n in range(batch_size):
        for i in range(n_inputs):
            if a[n, i] < 0:
                # negative input: keep the zero the buffer was created with
                continue
            z[n, i] = a[n, i]

    return z
  

def relu_vector(a, device):
    """Vectorised ReLU: return a copy of `a` with negative entries set to 0.

    The input is moved to `device` first and is never modified.
    """
    on_device = a.to(device)

    # out-of-place masked write: a new tensor in which every element
    # below zero has been replaced by 0
    return on_device.masked_fill(on_device < 0, 0)

  
def reshape_scalar(a, device):
    """Loop-based ("scalar") flattening of [batch, C, H, W] into [batch, C*H*W].

    Args:
        a: input tensor of shape [batch, C, H, W].
        device: device the input is moved to.

    Returns:
        Tensor of shape [batch, C*H*W] whose rows list the elements of each
        sample in channel, row, column order.
    """
    # read dimensions of the input tensor
    batch_size, n_channels_in, height_in, width_in = a.shape

    # calculate the dimensions of the output tensor
    n_outputs = n_channels_in * height_in * width_in

    # move to device
    a = a.to(device)

    # Output buffer; not flagged with requires_grad because the element-wise
    # writes below are in-place operations, which autograd rejects on a leaf
    # tensor that requires grad.
    z = torch.zeros([batch_size, n_outputs], device=device)

    for n in range(batch_size):
        for c_in in range(n_channels_in):
            for m in range(height_in):
                for l in range(width_in):
                    # a row holds width_in elements, so the row offset is
                    # m*width_in (this matters for non-square feature maps)
                    z[n, c_in*height_in*width_in + m*width_in + l] = a[n, c_in, m, l]

    return z
  
def reshape_vector(a, device):
    """Vectorised flattening of [batch, ...] into [batch, features].

    Returns a copy of the input (moved to `device`) that is explicitly
    flagged as requiring grad.
    """
    n_rows = a.shape[0]

    # move to device, then copy so the input tensor is left untouched
    copied = a.to(device).clone()

    flat = copied.view(n_rows, -1)
    flat.requires_grad_(True)

    return flat
  
def fc_layer_scalar(a, weight, bias, device):
    """Loop-based ("scalar") fully connected layer: z = a @ weight.T + bias.

    Args:
        a: input matrix of shape [batch, n_inputs].
        weight: weight matrix of shape [n_outputs, n_inputs].
        bias: bias vector of shape [n_outputs].
        device: device the tensors are moved to before computing.

    Returns:
        Tensor of shape [batch, n_outputs]. It takes part in autograd
        whenever one of the inputs requires grad.
    """
    # read dimensions of the input matrix
    batch_size, n_inputs = a.shape
    n_outputs = bias.shape[0]

    # move to device
    a = a.to(device)
    weight = weight.to(device)
    bias = bias.to(device)

    # Output buffer; not flagged with requires_grad because the element-wise
    # writes below are in-place operations, which autograd rejects on a leaf
    # tensor that requires grad.
    z = torch.zeros([batch_size, n_outputs], device=device)

    for n in range(batch_size):
        for j in range(n_outputs):
            # accumulate out of place and write the finished sum once
            acc = bias[j]
            for i in range(n_inputs):
                acc = acc + weight[j, i]*a[n, i]
            z[n, j] = acc

    return z
  
    
def fc_layer_vector(a, weight, bias, device):
    """Vectorised fully connected layer: z = a @ weight.T + bias.

    Args:
        a: input matrix of shape [batch, n_inputs].
        weight: weight matrix of shape [n_outputs, n_inputs].
        bias: bias vector of shape [n_outputs].
        device: device the tensors are moved to before computing.

    Returns:
        Tensor of shape [batch, n_outputs], explicitly flagged as requiring
        grad.
    """
    # move to device, like every other layer helper in this notebook
    a = a.to(device)
    weight = weight.to(device)
    bias = bias.to(device)

    # the sum is already a fresh tensor, so no extra copy is needed
    z = (a.matmul(weight.t()) + bias).requires_grad_(True)

    return z

#### 2. Using the MSE metric, check whether the forward-pass outputs of the functions below match those of the native PyTorch implementations:
    conv2d_scalar
    pool2d_scalar
    relu_scalar
    fc_layer_scalar

In [15]:
class SimpleConvNet(nn.Module):
    """Reference network: one 5x5 convolution followed by two linear layers.

    Only the layers are declared here; they are moved to `device` at the end
    of construction.
    """

    def __init__(self, device):
        super().__init__()
        self.device = device

        # single-channel input, 20 feature maps, 5x5 kernel, no padding
        conv_settings = dict(in_channels=1,
                             out_channels=20,
                             kernel_size=5,
                             stride=1,
                             padding=0,
                             dilation=1,
                             groups=1,
                             bias=True)
        self.conv_layer = nn.Conv2d(**conv_settings)

        # 20 feature maps of 12x12 flattened into one vector
        flat_features = 20 * 12 * 12
        hidden_features = 500
        self.fc_layer1 = nn.Linear(in_features=flat_features, out_features=hidden_features)
        self.fc_layer2 = nn.Linear(in_features=hidden_features, out_features=10)

        self.to(device)

#### Comments: Since training on the MNIST data crashes the OS when running on 'cpu', the following tests use small, randomly generated toy data instead.

In [7]:
# Reproducibility: seed the global PyTorch and NumPy random generators
torch.manual_seed(17)
np.random.seed(17)
# device string passed to the model and to every layer helper below
device = 'cpu'

In [16]:
# initialization: build the reference network (it moves itself to `device`)
model = SimpleConvNet(device)
# bare expression so the module summary is shown as the cell output
model

SimpleConvNet(
  (conv_layer): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc_layer1): Linear(in_features=2880, out_features=500, bias=True)
  (fc_layer2): Linear(in_features=500, out_features=10, bias=True)
)

In [18]:
# Random toy tensors shaped like the inputs and parameters of SimpleConvNet.
# The order of the torch.rand calls determines the values drawn after seeding.

#conv_layer: a batch with one single-channel 28x28 input, plus 20 kernels of size 5x5
data = torch.rand([1,1,28,28])
w_conv = torch.rand([20,1,5,5])
b_conv = torch.rand([20])

#fc1_layer: 2880 inputs (20 * 12 * 12) -> 500 outputs
w_fc1 = torch.rand([500, 2880])
b_fc1 = torch.rand([500])

#fc2_layer: 500 inputs -> 10 outputs
w_fc2 = torch.rand([10, 500])
b_fc2 = torch.rand([10])

#### conv2d_scalar

In [19]:
# reference output of the model's native convolution layer
z_conv2d_torch = model.conv_layer(data)
# same input, weights and bias passed through the loop-based implementation
z_conv2d_scalar = conv2d_scalar(data, model.conv_layer.weight, model.conv_layer.bias, device)

# mean squared difference between the two outputs
mse_conv2d_scalar = diff_mse(z_conv2d_torch,z_conv2d_scalar)

In [35]:
# report the conv2d comparison computed in the previous cell
print("MSE difference between conv2d_scalar output and conv_layer torch native: ", mse_conv2d_scalar)

MSE difference between conv2d_scalar output and conv_layer torch native:  1.0867831877549067e-15


#### pool2d_scalar

In [21]:
# reference: native max pooling with a 2x2 window and stride 2
z_pool2d_torch = F.max_pool2d(z_conv2d_torch, 2, 2)
# loop-based pooling, fed with the output of the loop-based convolution
z_pool2d_scalar = pool2d_scalar(z_conv2d_scalar, device)

# mean squared difference between the two outputs
mse_pool2d_scalar = diff_mse(z_pool2d_torch,z_pool2d_scalar)

In [22]:
# report the max-pooling comparison computed in the previous cell
print("MSE difference between pool2d_scalar output and max_pool2d torch native: ", mse_pool2d_scalar)

MSE difference between pool2d_scalar output and max_pool2d torch native:  1.0151745988775545e-15


#### reshape_scalar

In [23]:
# reference: flatten each sample into 20 * 12 * 12 features with a native view
z_pool2d_reshape_torch = z_pool2d_torch.view(-1, 20 * 12 * 12)
# loop-based flattening of the loop-based pooling output
z_pool2d_reshape_scalar = reshape_scalar(z_pool2d_scalar, device)

# mean squared difference between the two outputs
mse_reshape_scalar = diff_mse(z_pool2d_reshape_torch,z_pool2d_reshape_scalar)

In [24]:
# report the reshape comparison computed in the previous cell
print("MSE difference between reshape_scalar output and torch.view(-1, 20 * 12 * 12): ", mse_reshape_scalar)

MSE difference between reshape_scalar output and torch.view(-1, 20 * 12 * 12):  1.0151745988775545e-15


#### fc_layer_scalar

In [25]:
# reference output of the model's first native linear layer
z_fc1_torch = model.fc_layer1(z_pool2d_reshape_torch)
# loop-based linear layer using the same weights and bias
z_fc1_scalar = fc_layer_scalar(z_pool2d_reshape_scalar, model.fc_layer1.weight, model.fc_layer1.bias, device)

# mean squared difference between the two outputs
mse_fc1_scalar = diff_mse(z_fc1_torch,z_fc1_scalar)

In [26]:
# report the fully connected layer comparison computed in the previous cell
print("MSE difference between fc_layer_scalar output and fc_layer1 torch native: ", mse_fc1_scalar)

MSE difference between fc_layer_scalar output and fc_layer1 torch native:  4.632291390207176e-14


#### 3. Compare the performance of  scalar and vector implementations

In [33]:
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and may be adjusted mid-run
start_scalar = time.perf_counter()
#scalar: one full forward pass built from the loop-based layers
z_conv = conv2d_scalar(data, w_conv, b_conv, device)
z_pool = pool2d_scalar(z_conv, device)
z_pool_reshaped = reshape_scalar(z_pool, device)
z_fc1 = fc_layer_scalar(z_pool_reshaped, w_fc1, b_fc1, device)
z_relu = relu_scalar(z_fc1, device)
z_fc2 = fc_layer_scalar(z_relu, w_fc2, b_fc2, device)
y = F.softmax(z_fc2, dim=1)
end_scalar = time.perf_counter()

print(end_scalar-start_scalar, "sec")

100.0195803642273 sec


In [34]:
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and may be adjusted mid-run
start_vector = time.perf_counter()
#vector: the same forward pass built from the vectorised layers
z_conv = conv2d_vector(data, w_conv, b_conv, device)
z_pool = pool2d_vector(z_conv, device)
z_pool_reshaped = reshape_vector(z_pool, device)
z_fc1 = fc_layer_vector(z_pool_reshaped, w_fc1, b_fc1, device)
z_relu = relu_vector(z_fc1, device)
z_fc2 = fc_layer_vector(z_relu, w_fc2, b_fc2, device)
y = F.softmax(z_fc2, dim=1)

end_vector = time.perf_counter()

print(end_vector-start_vector, "sec")

0.09873318672180176 sec


#### Comments: The vector implementation is roughly 1000 times faster than the scalar one (about 0.1 s versus about 100 s for a single forward pass).