In [None]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.optim import Adam
from torchvision.datasets import CIFAR10
import torchvision.transforms as transforms
from torchvision.transforms import Compose, ToTensor, Normalize, Lambda
from torch.utils.data import DataLoader, Dataset
import numpy as np
from math import floor
import torchvision
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.autograd import Variable

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")





In [None]:
# The new version of FF - every layer has an untrainable predictor layer
# We train layer by layer

# The basic linear layer
class FFLinearLayer(nn.Linear):
    def __init__(self, in_features, out_features, layer_lr = 0.001,
                 num_classes = 10, num_epochs = 25,
                 bias=True, device=None, dtype=None):
        super().__init__(in_features, out_features, bias, device, dtype)
        self.relu = torch.nn.ReLU()
        self.opt = Adam(self.parameters(), lr=layer_lr)
        self.num_epochs = num_epochs
        self.predictor = nn.Linear(out_features, num_classes, device=device)
        self.predictor.requires_grad = False
    
    def forward(self, x):
        layerOutput = self.relu(super().forward(x))
        predictorOutput = self.predictor.forward(layerOutput)
        return layerOutput, predictorOutput
    
    def predict(self, x):
        layerOutput, predictorOutput = self.forward(x)
        return F.softmax(predictorOutput, dim=1)
    
    def trainLayer(self, dataloader, previousLayers):
        for epoch in range(self.num_epochs):
              criterion = nn.CrossEntropyLoss()
              for i, data in enumerate(dataloader):
                  originalInputs, labels = data
                  originalInputs = originalInputs.to(device)
                  labels = labels.to(device)
                  inputs = originalInputs
                  for previous in previousLayers:
                      if isinstance(previous, nn.MaxPool2d) or isinstance(previous, nn.Flatten):
                          inputs = previous.forward(inputs)
                      else:
                          inputs,_ = previous.forward(inputs)
                  self.opt.zero_grad()
                  layerOutput, predictorOutput = self.forward(inputs)
                  layerLoss = criterion(predictorOutput, labels)
                  # This is a local layer update, not a backprop through the net
                  layerLoss.backward()
                  self.opt.step()


In [None]:
# Helper function to dynamically compute the output size
def conv2d_output_size(input_size, out_channels, padding, kernel_size, stride, dilation=None):
    """According to https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
    """
    if dilation is None:
        dilation = (1, ) * 2
    if isinstance(padding, int):
        padding = (padding, ) * 2
    if isinstance(kernel_size, int):
        kernel_size = (kernel_size, ) * 2
    if isinstance(stride, int):
        stride = (stride, ) * 2

    output_size = (
        out_channels,
        np.floor((input_size[1] + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) /
                 stride[0] + 1).astype(int),
        np.floor((input_size[2] + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) /
                 stride[1] + 1).astype(int)
    )
    return output_size


# A convolutional Layer for FF

class FFConv2D(nn.Conv2d):
    def __init__(self, in_channels, out_channels, kernel_size,
                 sampleInput,
                 stride=1, padding=1, dilation=1, groups=1, 
                 num_epochs = 25, layer_lr = 0.001, num_classes = 10,
                 bias=True, padding_mode='zeros', device=None, dtype=None):
        super().__init__(in_channels, out_channels, kernel_size, stride, padding, 
                        dilation, groups, bias, padding_mode, device, dtype)
        self.opt = Adam(self.parameters(), lr=layer_lr)
        self.num_epochs = num_epochs
        self.num_classes = num_classes
        self.device = device
        self.getandSetPredictorWeightShape(sampleInput, device)

    def getandSetPredictorWeightShape(self, sampleInput, device):
        convOutput = F.relu(super().forward(sampleInput.to(self.device)))
        self.predictor = nn.Linear(convOutput.shape[1]*convOutput.shape[2]*convOutput.shape[3],
                                   self.num_classes, 
                                   device=device)
        self.predictor.requires_grad = False
    
    def conv_output_shape(self, h_w, kernel_size=1, stride=1, pad=0, dilation=1):
        if type(kernel_size) is not tuple:
            kernel_size = (kernel_size, kernel_size)
        h = floor( ((h_w[0] + (2 * pad) - ( dilation * (kernel_size[0] - 1) ) - 1 )/ stride) + 1)
        w = floor( ((h_w[1] + (2 * pad) - ( dilation * (kernel_size[1] - 1) ) - 1 )/ stride) + 1)
        return h, w

    def forward(self, x):
        convOutput = F.relu(super().forward(x))
        predInput = convOutput.view(convOutput.size(0), -1)
        predOutput = self.predictor(predInput)
        return convOutput, predOutput
    
    def predict(self, x):
        layerOutput, predictorOutput = self.forward(x)
        return F.softmax(predictorOutput, dim=1)
    
    def trainLayer(self, dataloader, previousLayers):
        for epoch in range(self.num_epochs):
              criterion = nn.CrossEntropyLoss()
              for i, data in enumerate(dataloader):
                  originalInputs, labels = data
                  originalInputs = originalInputs.to(device)
                  labels = labels.to(device)
                  inputs = originalInputs
                  for previous in previousLayers:
                      if isinstance(previous, nn.MaxPool2d) or isinstance(previous, nn.Flatten):
                          inputs = previous.forward(inputs)
                      else:
                          inputs,_ = previous.forward(inputs)
                  self.opt.zero_grad()
                  layerOutput, predictorOutput = self.forward(inputs)
                  layerLoss = criterion(predictorOutput, labels)
                  # This is a local layer update, not a backprop through the net
                  layerLoss.backward()
                  self.opt.step()


In [None]:

# The overall FF Network
class FFNet(torch.nn.Module):
    def __init__(self, device):
        super().__init__()
        self.layers = []
        self.pool = nn.MaxPool2d(2, 2)

        sample_input = torch.rand(1,3,32,32).to(device)
        self.conv1 = FFConv2D(3, 64, 3, sample_input, device=device)
        conv1_output_eg, eg_preds = self.conv1.forward(sample_input)
        pooledEg = self.pool.forward(conv1_output_eg)
        self.conv2 = FFConv2D(64, 128, 3, pooledEg, device=device)
        conv2_output_eg, eg_preds = self.conv2.forward(pooledEg)
        pooledEg = self.pool.forward(conv2_output_eg)
        self.conv3 = FFConv2D(128, 256, 3, pooledEg, device=device)
        conv3_output_eg, eg_preds = self.conv3.forward(pooledEg)
        self.conv4 = FFConv2D(256, 256, 3, conv3_output_eg, device=device)
        conv4_output_eg, eg_preds = self.conv4.forward(conv3_output_eg)
        pooledEg = self.pool.forward(conv4_output_eg)
        self.conv5 = FFConv2D(256, 512, 3, pooledEg, device=device)
        conv5_output_eg, eg_preds = self.conv5.forward(pooledEg)
        self.conv6 = FFConv2D(512, 512, 3, conv5_output_eg, device=device)
        conv6_output_eg, eg_preds = self.conv6.forward(conv5_output_eg)
        pooledEg = self.pool.forward(conv6_output_eg)
        self.conv7 = FFConv2D(512, 512, 3, pooledEg, device=device)
        conv7_output_eg, eg_preds = self.conv7.forward(pooledEg)
        self.conv8 = FFConv2D(512, 512, 3, conv7_output_eg, device=device)
        conv8_output_eg, eg_preds = self.conv7.forward(conv7_output_eg)
        pooledEg = self.pool.forward(conv8_output_eg)
        self.fc1 = FFLinearLayer(pooledEg.flatten(start_dim=1).shape[1], 4096, device=device) # was 16*5*5
        self.fc2 = FFLinearLayer(4096,4096, device=device) 
        self.fc3 = FFLinearLayer(4096,10, device=device)
        self.flat = nn.Flatten()
        self.layers = [self.conv1, 
                       self.pool,
                       self.conv2,
                       self.pool,
                       self.conv3,
                       self.conv4,
                       self.pool,
                       self.conv5,
                       self.conv6,
                       self.pool,
                       self.conv7,
                       self.conv8,
                       self.pool,
                       self.flat, 
                       self.fc1, 
                       self.fc2, 
                       self.fc3]
        self.trainableList = [self.conv1, 
                              self.conv2, 
                              self.conv3,
                              self.conv4,
                              self.conv5,
                              self.conv6,
                              self.conv7,
                              self.conv8,
                              self.fc1, 
                              self.fc2, 
                              self.fc3]
    
    def trainNet(self, dataloader):
        for layer in range(len(self.layers)):
            if self.layers[layer] not in self.trainableList:
                print("Skipping untrainable layer ", layer + 1)
            else:
                print("Training Layer", layer + 1)
                previousLayers = self.layers[:layer]
                self.layers[layer].trainLayer(dataloader, previousLayers)
    
    # Predict on a batch
    def predict(self, x):
        # get per layer logits
        layerPreds = []
        layerInput = x
        for layer in self.layers:
            if layer not in self.trainableList:
                layerInput = layer.forward(layerInput)
            else:
                layerOutput, layerPred = layer.forward(layerInput)
                # Get per layer softmax
                layerPreds.append(F.softmax(layerPred, dim=1))
                layerInput = layerOutput
        layerPreds = torch.stack(layerPreds)
        # Add up per layer softmax
        combinedPred = torch.sum(layerPreds, dim=0)
        finalPred = F.softmax(combinedPred, dim=1)
        return finalPred
    
    # Predict on a batch (better)
    def predictFinal(self, x):
        # get per layer logits
        layerPreds = []
        layerInput = x
        for layer in self.layers:
            if layer not in self.trainableList:
                layerInput = layer.forward(layerInput)
            else:
                layerOutput, layerPred = layer.forward(layerInput)
                # Get per layer softmax
                layerPreds.append(F.softmax(layerPred, dim=1))
                layerInput = layerOutput
        #layerPreds = torch.stack(layerPreds)
        # Add up per layer softmax
        #combinedPred = torch.sum(layerPreds, dim=0)
        finalPred = layerPreds[-1] #F.softmax(combinedPred, dim=1)
        return finalPred
    
    # Evaluate on loader
    def evaluate(self, loader):
        correct = 0
        total = 0
        with torch.no_grad():
            for data in loader:
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)
                preds = self.predictFinal(inputs) #self.predict(inputs)
                _, predicted = torch.max(preds, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        return correct/total


In [None]:
torch.manual_seed(1234)
net = FFNet(device)

In [None]:
class ILLSizeEstimator(object):

    def __init__(self, model, input_size=(1,3,32,32), bits=32):
        '''
        Estimates the size of PyTorch models in memory
        for a given input size
        '''
        self.model = model
        self.input_size = input_size
        self.bits = bits
        return

    def get_parameter_sizes(self):
        '''Get sizes of all parameters in `model`'''
        mods = self.model.layers
        print(len(mods))
        sizes = []
        predictors = []
        
        for i in range(len(mods)):
            m = mods[i]
            p = list(m.parameters())
            if isinstance(m, FFConv2D) or isinstance(m, FFLinearLayer):
              predictors.append(m.predictor.parameters())
            else:
              predictors.append(None)
            for j in range(len(p)):
                sizes.append(np.array(p[j].size()))
        print(predictors)
        # Add predictor of final layer
        sizes.append(np.array(predictors[-1].size()))
        param_sizes = sizes
        return param_sizes

    def get_output_sizes(self, active_layers):
        '''Run sample input through each layer to get output sizes'''
        input_ = torch.rand(self.input_size[0],self.input_size[1],
                            self.input_size[2],self.input_size[3]).to(device)
        mods = active_layers #self.model.layers #list(self.model.modules())
        out_sizes = []
        preds = None
        for i in range(len(mods)):
            m = mods[i]
            res = m.forward(input_)
            if type(res) == tuple:
              out = res[0]
              preds = res[1]
            else:
              out = res
            out_sizes.append(np.array(out.size()))
            input_ = out
        if preds is not None:
          out_sizes.append(np.array(preds.size()))
        out_sizes = out_sizes
        return out_sizes
    
    def get_param_sizes(self, active_layers):
        mods = active_layers
        #print(len(mods))
        sizes = []
        last_pred = None
        
        for i in range(len(mods)):
            m = mods[i]
            p = list(m.parameters())
            if isinstance(m, FFConv2D) or isinstance(m, FFLinearLayer):
              last_pred = m.predictor
            for j in range(len(p)):
              sizes.append(np.array(p[j].size()))

        #gradient for BP
        bp_size = sizes[-1]
        
        # Add predictor of final layer
        p = list(last_pred.parameters())
        for j in range(len(p)):
            sizes.append(np.array(p[j].size()))

        param_sizes = sizes
        return param_sizes, bp_size

    def calc_param_bits(self, param_sizes):
        '''Calculate total number of bits to store `model` parameters'''
        total_bits = 0
        for i in range(len(param_sizes)):
            s = param_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        param_bits = total_bits
        return param_bits
    
    def get_bit_sizes(self, arr):
        total_bits = 0
        for i in range(len(arr)):
            s = arr[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        return total_bits

    # Iterate through layers
    # If is trainable - get static size, and double param size of last layer
    def calc_max_bits(self):
        input_bits = np.prod(np.array(self.input_size))*self.bits
        all_layers = self.model.layers
        sizes_bits = []
        sizes_mb = []
        for i in range(len(all_layers)):
          if isinstance(all_layers[i], FFConv2D) or isinstance(all_layers[i], FFLinearLayer):
            active_layers = all_layers[:i+1]
            active_op_sizes = self.get_output_sizes(active_layers)
            active_param_sizes, backprop_size = self.get_param_sizes(active_layers)
            layer_bits = self.get_bit_sizes(active_op_sizes) + self.get_bit_sizes(active_param_sizes) + self.get_bit_sizes(backprop_size)
            layer_mb = (layer_bits/8)/(1024**2)
            sizes_bits.append(layer_bits)
            sizes_mb.append(layer_mb)

            
        return sizes_bits, sizes_mb

    def calc_forward_bits(self):
        '''Calculate bits to store forward and backward pass'''
        total_bits = 0
        for i in range(len(self.out_sizes)):
            s = self.out_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        self.forward_bits = (total_bits)
        return    
    
    def calc_backward_bits(self):
        '''Calculate bits to store forward and backward pass'''
        total_bits = 0
        for i in range(len(self.out_sizes)):
            s = self.out_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        self.backward_bits = (total_bits)
        return

    def calc_input_bits(self):
        '''Calculate bits to store input'''
        self.input_bits = np.prod(np.array(self.input_size))*self.bits
        return

    def estimate_size(self):
        '''Estimate model size in memory in megabytes and bits'''
        self.get_parameter_sizes()
        self.get_output_sizes()
        self.calc_param_bits()
        self.calc_forward_bits()
        self.calc_backward_bits()
        self.calc_input_bits()
        final_total_output = self.param_bits + self.forward_bits + self.backward_bits #self.param_bits + self.forward_bits + self.backward_bits + self.input_bits

        total_megabytes = (total/8)/(1024**2)
        return total_megabytes, total

In [None]:
sizeEstimator = ILLSizeEstimator(net)

In [None]:
bits, mb = sizeEstimator.calc_max_bits()

In [None]:
print("Max memory usage (ILL):", np.max(mb))

Max memory usage (ILL): 114.18643188476562


In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

class SizeEstimatorBP(object):

    def __init__(self, model, input_size=(1,3,32,32), bits=32):
        '''
        Estimates the size of PyTorch models in memory
        for a given input size
        '''
        self.model = model
        self.input_size = input_size
        self.bits = bits
        self.gradient_sizes = 0
        self.grad_bits =0 
        return
    
    def get_gradient_sizes(self):
        '''Get sizes of all gradients in `model`'''
        mods = self.model.layers #list(self.model.modules())
        print(len(mods))
        sizes = []
        for i in range(len(mods)):
            m = mods[i]
            p = list(m.parameters())
            for j in range(len(p)):
                if p[j].requires_grad:
                  sizes.append(np.array(p[j].size()))

        self.gradient_sizes = sizes
        return

    def get_parameter_sizes(self):
        '''Get sizes of all parameters in `model`'''
        mods = self.model.layers #list(self.model.modules())
        print(len(mods))
        sizes = []
        
        for i in range(len(mods)):
            m = mods[i]
            p = list(m.parameters())
            for j in range(len(p)):
                sizes.append(np.array(p[j].size()))

        self.param_sizes = sizes
        return

    def get_output_sizes(self):
        '''Run sample input through each layer to get output sizes'''
        input_ = Variable(torch.FloatTensor(*self.input_size), volatile=True)
        mods = self.model.layers #list(self.model.modules())
        out_sizes = []
        for i in range(len(mods)):
            m = mods[i]
            out = m(input_)
            out_sizes.append(np.array(out.size()))
            input_ = out

        self.out_sizes = out_sizes
        return

    def calc_param_bits(self):
        '''Calculate total number of bits to store `model` parameters'''
        total_bits = 0
        for i in range(len(self.param_sizes)):
            s = self.param_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        self.param_bits = total_bits
        return

    def calc_gradient_bits(self):
        '''Calculate total number of bits to store `model` parameters'''
        total_bits = 0
        for i in range(len(self.gradient_sizes)):
            s = self.param_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        self.grad_bits = total_bits
        return

#     def calc_forward_backward_bits(self):
#         '''Calculate bits to store forward and backward pass'''
#         total_bits = 0
#         for i in range(len(self.out_sizes)):
#             s = self.out_sizes[i]
#             bits = np.prod(np.array(s))*self.bits
#             total_bits += bits
#         # multiply by 2 for both forward AND backward
#         self.forward_backward_bits = (total_bits*2)
#         return
    def calc_forward_bits(self):
        '''Calculate bits to store forward and backward pass'''
        total_bits = 0
        for i in range(len(self.out_sizes)):
            s = self.out_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        self.forward_bits = (total_bits)
        return    
    
    def calc_backward_bits(self):
        '''Calculate bits to store forward and backward pass'''
        total_bits = 0
        for i in range(len(self.out_sizes)):
            s = self.out_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        self.backward_bits = (total_bits)
        return

    def calc_input_bits(self):
        '''Calculate bits to store input'''
        self.input_bits = np.prod(np.array(self.input_size))*self.bits
        return

    def to_mb(self, bitcount):
      return (bitcount/8)/(1024**2)

    def estimate_size(self):
        '''Estimate model size in memory in megabytes and bits'''
        self.get_parameter_sizes()
        self.get_gradient_sizes()
        self.get_output_sizes()
        self.calc_param_bits()
        self.calc_forward_bits()
        self.calc_backward_bits()
        self.calc_gradient_bits()
        self.calc_input_bits()

        param_mb = self.to_mb(self.param_bits)
        print("Params:", param_mb)

        forward_mb = self.to_mb(self.forward_bits)
        print("Forward:", forward_mb)

        #backward_mb = self.to_mb(self.backward_bits)
        #print("Backward pass:", backward_mb)

        grad_mb = self.to_mb(self.grad_bits)
        print("Gradients:", grad_mb)

        return param_mb + forward_mb + grad_mb

In [None]:
class VGG11NetBP(nn.Module):
    def __init__(self):
        super().__init__()

        self.layers = [nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(in_features=512, out_features=4096),
            nn.ReLU(),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(),
            nn.Linear(in_features=4096, out_features=10)
            ]

In [None]:
bpnet = VGG11NetBP()


In [None]:
bpsize_estimator = SizeEstimatorBP(bpnet)
total = bpsize_estimator.estimate_size()

27
27
Params: 107.36087799072266
Forward: 1.3398818969726562
Gradients: 107.36087799072266


  input_ = Variable(torch.FloatTensor(*self.input_size), volatile=True)


In [None]:
print("Max usage:", total)

Max usage: 216.06163787841797
