<a href="https://colab.research.google.com/github/krishnajakodali/ml_lab3/blob/main/mlcodsgn_lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import easydict
from torchsummary import summary

# argument parser
import easydict

args = easydict.EasyDict({
        "batch_size": 32,
        "epochs": 10,
        "lr": 0.01,
})
# Hyper Parameters
input_size = 784
num_classes = 10
num_epochs = args.epochs
batch_size = args.batch_size
learning_rate = args.lr

# MNIST Dataset (Images and Labels)
train_set = dsets.FashionMNIST(
    root = './data/FashionMNIST',
    train = True,
    download = True,
    transform = transforms.Compose([
        transforms.ToTensor()
    ])
)
test_set = dsets.FashionMNIST(
    root = './data/FashionMNIST',
    train = False,
    download = True,
    transform = transforms.Compose([
        transforms.ToTensor()
    ])
)


# Dataset Loader (Input Pipeline)
train_loader = torch.utils.data.DataLoader(dataset = train_set,
        batch_size = batch_size,
        shuffle = True)

test_loader = torch.utils.data.DataLoader(dataset = test_set,
        batch_size = batch_size,
        shuffle = False)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26.4M/26.4M [00:02<00:00, 9.40MB/s]


Extracting ./data/FashionMNIST/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29.5k/29.5k [00:00<00:00, 200kB/s]


Extracting ./data/FashionMNIST/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4.42M/4.42M [00:01<00:00, 3.68MB/s]


Extracting ./data/FashionMNIST/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5.15k/5.15k [00:00<00:00, 8.89MB/s]

Extracting ./data/FashionMNIST/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw






In [3]:
class MyConvNet(nn.Module):
    def __init__(self, args):
        super(MyConvNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.act1  = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.act2  = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=2)
        self.lin2  = nn.Linear(7*7*32, 10, bias=False)

    def forward(self, x):
        x = self.conv1(x)
        x = self.act1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.act2(x)
        x = self.pool2(x)
        x = x.view(x.size(0), -1)
        x = self.lin2(x)
        return x

model = MyConvNet(args)
model = model.cuda()

criterion = nn.CrossEntropyLoss()
criterion=criterion.cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 5e-4, momentum=0.9)


In [4]:
summary(model, (1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 28, 28]             144
              ReLU-2           [-1, 16, 28, 28]               0
         MaxPool2d-3           [-1, 16, 14, 14]               0
            Conv2d-4           [-1, 32, 14, 14]           4,608
              ReLU-5           [-1, 32, 14, 14]               0
         MaxPool2d-6             [-1, 32, 7, 7]               0
            Linear-7                   [-1, 10]          15,680
Total params: 20,432
Trainable params: 20,432
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.32
Params size (MB): 0.08
Estimated Total Size (MB): 0.40
----------------------------------------------------------------


In [5]:
print("---Training started")
# Training the Model
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.cuda()
        labels = Variable(labels).cuda()

        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        L1norm = model.parameters()
        arr = []
        for name,param in model.named_parameters():
          if 'weight' in name.split('.'):
            arr.append(param)
        L1loss = 0
        for Losstmp in arr:
          L1loss = L1loss+Losstmp.abs().mean()

        loss.backward()
        optimizer.step()

        if (i + 1) % 600 == 0:
            print('Epoch: [% d/% d], Step: [% d/% d], Loss: %.4f'
                    % (epoch + 1, num_epochs, i + 1,
                       len(train_set) // batch_size, loss.data.item()))

correct = 0
total = 0
model.eval()
for images, labels in test_loader:
    images = images.cuda()
    labels = labels.cuda()
    outputs = model(images)
    testloss = criterion(outputs, labels)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()

print('Accuracy for test images: % d %%' % (100 * correct / total))

---Training started
Epoch: [ 1/ 10], Step: [ 600/ 1875], Loss: 0.1728
Epoch: [ 1/ 10], Step: [ 1200/ 1875], Loss: 0.3289
Epoch: [ 1/ 10], Step: [ 1800/ 1875], Loss: 0.1153
Epoch: [ 2/ 10], Step: [ 600/ 1875], Loss: 0.1044
Epoch: [ 2/ 10], Step: [ 1200/ 1875], Loss: 0.5728
Epoch: [ 2/ 10], Step: [ 1800/ 1875], Loss: 0.2086
Epoch: [ 3/ 10], Step: [ 600/ 1875], Loss: 0.3175
Epoch: [ 3/ 10], Step: [ 1200/ 1875], Loss: 0.1801
Epoch: [ 3/ 10], Step: [ 1800/ 1875], Loss: 0.4170
Epoch: [ 4/ 10], Step: [ 600/ 1875], Loss: 0.1844
Epoch: [ 4/ 10], Step: [ 1200/ 1875], Loss: 0.1444
Epoch: [ 4/ 10], Step: [ 1800/ 1875], Loss: 0.3252
Epoch: [ 5/ 10], Step: [ 600/ 1875], Loss: 0.2698
Epoch: [ 5/ 10], Step: [ 1200/ 1875], Loss: 0.3423
Epoch: [ 5/ 10], Step: [ 1800/ 1875], Loss: 0.3684
Epoch: [ 6/ 10], Step: [ 600/ 1875], Loss: 0.2713
Epoch: [ 6/ 10], Step: [ 1200/ 1875], Loss: 0.1629
Epoch: [ 6/ 10], Step: [ 1800/ 1875], Loss: 0.3022
Epoch: [ 7/ 10], Step: [ 600/ 1875], Loss: 0.5730
Epoch: [ 7/ 10], S

In [6]:
torch.save(model.state_dict(), './madu_saved_lab4.pt')

In [7]:
load_model = MyConvNet(args)
load_model.load_state_dict(torch.load('./madu_saved_lab4.pt'))

load_model = load_model.cuda()
correct = 0
total = 0
load_model.eval()
for images, labels in test_loader:
    images = images.cuda()
    labels = labels.cuda()
    outputs = load_model(images)
    testloss = criterion(outputs, labels)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()

print('Accuracy for test images: % .2f %%' % (100 * correct / total))

  load_model.load_state_dict(torch.load('./madu_saved_lab4.pt'))


Accuracy for test images:  90.32 %


In [8]:

def USquantize(x,bits=4):
  # Uniform symetric quantizer that
  # quantize x into sf * qx
  # sf: scaling factor
  # qx: integer in range [-2^(bits-1)+1, 2^(bits-1)-1]
  # note: only 2^bits - 1 different values can be represented, bits >= 2

  max_value = torch.max(torch.abs(x))
  sf = max_value / (2**(bits-1) - 1)
  qx = torch.round(x/sf)
  qx = torch.clip(qx,min=-2**(bits-1) + 1,max=2**(bits-1) - 1)
  dqx = qx * sf
  return dqx

def UASquantize(x,bits=4):
  # Uniform Asymetric quantizer that
  # quantize x into min_value + sf * qx
  # sf: scaling factor
  # qx: integer in range [0,2^bits-1]
  # note: 2^bits different values can be represented, bits >= 1

  max_value = torch.max(x)
  min_value = torch.min(x)
  sf = (max_value - min_value) / (2**bits - 1)
  qx = torch.round((x-min_value)/sf)
  qx = torch.clip(qx,min=0,max=2**(bits) - 1)
  dqx = min_value + qx * sf
  return dqx

def UASquantizeMinOffset(x,p,bits=4):
  # Uniform Asymetric quantizer that
  # quantize x into min_value + sf * qx
  # sf: scaling factor
  # p: percent offset away from the native min. Can be negative or positive
  # qx: integer in range [0,2^bits-1]
  # note: 2^bits different values can be represented, bits >= 1

  max_value = torch.max(x)
  min_value = torch.min(x)
  # update min_value to be set by p
  min_value = min_value * p
  sf = (max_value - min_value) / (2**bits - 1)
  qx = torch.round((x-min_value)/sf)
  qx = torch.clip(qx,min=0,max=2**(bits) - 1)
  dqx = min_value + qx * sf
  return dqx


def USquantize_clipped(x,bits=4,quantile=0.999):

  # Uniform Symetric quantizer with clipped representation range
  # range covers 'quantile' percent of FP32 x
  max_value = torch.quantile(x, 1 - 0.5 * (1 - quantile))  # Upper quantile
  min_value = torch.quantile(x, 0.5 * (1 - quantile))      # Lower quantile
  quantile_max = max(abs(max_value), abs(min_value))  # Ensure symmetry around zero
  sf = quantile_max / (2 ** (bits - 1) - 1)
  qx = torch.round(x/sf)
  qx = torch.clip(qx,min=-2**(bits-1) + 1,max=2**(bits-1) - 1)
  dqx = qx * sf
  return dqx

def UASquantize_clipped(x,bits=4,quantile=0.999):

  # Uniform Asymetric quantizer with clipped representation range
  # range covers 'quantile' percent of FP32 x

  max_value = torch.quantile(x,1-0.5*(1-quantile))
  min_value = torch.quantile(x,0.5*(1-quantile))
  sf = (max_value - min_value) / (2**bits - 1)
  qx = torch.round((x-min_value)/sf)
  qx = torch.clip(qx,min=0,max=2**(bits) - 1)
  dqx = min_value + qx * sf
  return dqx

## ***SYMMETRIC SIGNED QUANTIZATION***

1.   *List item*
2.   List item



In [9]:
from copy import deepcopy
q_model = deepcopy(model)
sd = model.state_dict()
q_sd = q_model.state_dict()
cuda_available =torch.cuda.is_available()

# weight quantization only
# different bitwidth
for n_bits in (4,8, 12, 16):
  print(f'quantizing model into {n_bits} bits')
  for name,_ in model.named_parameters():
    # print('quantizing ',name)
    q_sd[name] = USquantize(sd[name],bits=n_bits)

  q_model.load_state_dict(q_sd)

  # Test the Model
  correct = 0
  total = 0
  for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))

quantizing model into 4 bits
Accuracy on the 10000 test images:  90.10 %
quantizing model into 8 bits
Accuracy on the 10000 test images:  90.35 %
quantizing model into 12 bits
Accuracy on the 10000 test images:  90.34 %
quantizing model into 16 bits
Accuracy on the 10000 test images:  90.32 %


# **ASSYMETRIC QUANTIZATION**

In [22]:
q_model = deepcopy(model)
sd = model.state_dict()
q_sd = q_model.state_dict()
cuda_available =torch.cuda.is_available()
# weight quantization only
# different bitwidth
for n_bits in (4,8, 12, 16):
  print(f'quantizing model into {n_bits} bits')
  for name,_ in model.named_parameters():
    # print('quantizing ',name)
    q_sd[name] = UASquantize(sd[name],bits=n_bits)

  q_model.load_state_dict(q_sd)

  # Test the Model
  correct = 0
  total = 0
  for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))

quantizing model into 4 bits
Accuracy on the 10000 test images:  90.13 %
quantizing model into 8 bits
Accuracy on the 10000 test images:  90.33 %
quantizing model into 12 bits
Accuracy on the 10000 test images:  90.34 %
quantizing model into 16 bits
Accuracy on the 10000 test images:  90.32 %


# **OPTIMAL RANGE QUANTIZATION TO REMOVE OUTLIERS**

symmetric

In [11]:
q_model = deepcopy(model)
sd = model.state_dict()
q_sd = q_model.state_dict()
cuda_available =torch.cuda.is_available()

# weight quantization only
# different bitwidth
for n_bits in (4,8, 12, 16):
  print(f'quantizing model into {n_bits} bits')
  for name,_ in model.named_parameters():
    # print('quantizing ',name)
    q_sd[name] = USquantize_clipped(sd[name],bits=n_bits)
  q_model.load_state_dict(q_sd)

  # Test the Model
  correct = 0
  total = 0
  for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))

quantizing model into 4 bits
Accuracy on the 10000 test images:  90.06 %
quantizing model into 8 bits
Accuracy on the 10000 test images:  90.24 %
quantizing model into 12 bits
Accuracy on the 10000 test images:  90.20 %
quantizing model into 16 bits
Accuracy on the 10000 test images:  90.20 %


# **OPTIMAL RANGE QUANTIZATION TO REMOVE OUTLIERS**

assymetric
:

In [12]:
q_model = deepcopy(model)
sd = model.state_dict()
q_sd = q_model.state_dict()
cuda_available =torch.cuda.is_available()

# weight quantization only
# different bitwidth
for n_bits in (4,8, 12, 16):
  print(f'quantizing model into {n_bits} bits')
  for name,_ in model.named_parameters():
    # print('quantizing ',name)
    q_sd[name] = UASquantize_clipped(sd[name],bits=n_bits)
  q_model.load_state_dict(q_sd)

  # Test the Model
  correct = 0
  total = 0
  for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))

quantizing model into 4 bits
Accuracy on the 10000 test images:  90.31 %
quantizing model into 8 bits
Accuracy on the 10000 test images:  90.19 %
quantizing model into 12 bits
Accuracy on the 10000 test images:  90.22 %
quantizing model into 16 bits
Accuracy on the 10000 test images:  90.22 %


# Per layer symmetric

In [13]:
q_model = deepcopy(model)
sd = model.state_dict()
q_sd = q_model.state_dict()

# Weight quantization only for different bitwidths
for n_bits in (4, 8, 12, 16):
    print(f'Quantizing model into {n_bits}-bit weights')

    for name, param in model.named_parameters():
        if 'weight' in name:  # Quantize only the weights, not biases
            # Print layer being quantized
            #print(f'Quantizing layer: {name}')

            # Apply per-layer quantization (can use USquantize or per_layer_quantize function)
            q_sd[name] = USquantize(sd[name], bits=n_bits)

    # Load the quantized weights into the model
    q_model.load_state_dict(q_sd)

    # Test the Model on the test set
    correct = 0
    total = 0
    for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

    print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))


Quantizing model into 4-bit weights
Accuracy on the 10000 test images:  90.10 %
Quantizing model into 8-bit weights
Accuracy on the 10000 test images:  90.35 %
Quantizing model into 12-bit weights
Accuracy on the 10000 test images:  90.34 %
Quantizing model into 16-bit weights
Accuracy on the 10000 test images:  90.32 %


# Per layer - Assymetric

In [14]:
q_model = deepcopy(model)
sd = model.state_dict()
q_sd = q_model.state_dict()

# Weight quantization only for different bitwidths
for n_bits in (4, 8, 12, 16):
    print(f'Quantizing model into {n_bits}-bit weights')

    for name, param in model.named_parameters():
        if 'weight' in name:  # Quantize only the weights, not biases
            # Print layer being quantized
            #print(f'Quantizing layer: {name}')

            # Apply per-layer quantization (can use USquantize or per_layer_quantize function)
            q_sd[name] = UASquantize(sd[name], bits=n_bits)

    # Load the quantized weights into the model
    q_model.load_state_dict(q_sd)

    # Test the Model on the test set
    correct = 0
    total = 0
    for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

    print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))


Quantizing model into 4-bit weights
Accuracy on the 10000 test images:  90.13 %
Quantizing model into 8-bit weights
Accuracy on the 10000 test images:  90.33 %
Quantizing model into 12-bit weights
Accuracy on the 10000 test images:  90.34 %
Quantizing model into 16-bit weights
Accuracy on the 10000 test images:  90.32 %


# Per layer symmetric clipped

In [15]:
q_model = deepcopy(model)
sd = model.state_dict()
q_sd = q_model.state_dict()

# Weight quantization only for different bitwidths
for n_bits in (4, 8, 12, 16):
    print(f'Quantizing model into {n_bits}-bit weights')

    for name, param in model.named_parameters():
        if 'weight' in name:  # Quantize only the weights, not biases
            # Print layer being quantized
            #print(f'Quantizing layer: {name}')

            # Apply per-layer quantization (can use USquantize or per_layer_quantize function)
            q_sd[name] = UASquantize_clipped(sd[name], bits=n_bits)

    # Load the quantized weights into the model
    q_model.load_state_dict(q_sd)

    # Test the Model on the test set
    correct = 0
    total = 0
    for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

    print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))


Quantizing model into 4-bit weights
Accuracy on the 10000 test images:  90.31 %
Quantizing model into 8-bit weights
Accuracy on the 10000 test images:  90.19 %
Quantizing model into 12-bit weights
Accuracy on the 10000 test images:  90.22 %
Quantizing model into 16-bit weights
Accuracy on the 10000 test images:  90.22 %


# Per layer asymmetric cliiped

In [20]:
q_model = deepcopy(model)
sd = model.state_dict()
q_sd = q_model.state_dict()

# Weight quantization only for different bitwidths
for n_bits in (4, 8, 12, 16):
    print(f'Quantizing model into {n_bits}-bit weights')

    for name, param in model.named_parameters():
        if 'weight' in name:  # Quantize only the weights, not biases
            # Print layer being quantized
            #print(f'Quantizing layer: {name}')

            # Apply per-layer quantization (can use USquantize or per_layer_quantize function)
            q_sd[name] = USquantize(sd[name], bits=n_bits)

    # Load the quantized weights into the model
    q_model.load_state_dict(q_sd)

    # Test the Model on the test set
    correct = 0
    total = 0
    for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

    print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))


Quantizing model into 4-bit weights
Accuracy on the 10000 test images:  90.10 %
Quantizing model into 8-bit weights
Accuracy on the 10000 test images:  90.35 %
Quantizing model into 12-bit weights
Accuracy on the 10000 test images:  90.34 %
Quantizing model into 16-bit weights
Accuracy on the 10000 test images:  90.32 %


In [23]:
summary(q_model, (1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 28, 28]             144
              ReLU-2           [-1, 16, 28, 28]               0
         MaxPool2d-3           [-1, 16, 14, 14]               0
            Conv2d-4           [-1, 32, 14, 14]           4,608
              ReLU-5           [-1, 32, 14, 14]               0
         MaxPool2d-6             [-1, 32, 7, 7]               0
            Linear-7                   [-1, 10]          15,680
Total params: 20,432
Trainable params: 20,432
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.32
Params size (MB): 0.08
Estimated Total Size (MB): 0.40
----------------------------------------------------------------


# Activation quantization

In [18]:
class QuantizedModel(nn.Module):
    def __init__(self, original_model, bits=4):
        super(QuantizedModel, self).__init__()
        self.model = original_model
        self.bits = bits

        # Quantizing the weights of each layer
        self.quantized_weights = {}
        for name, param in self.model.named_parameters():
            if 'weight' in name:
                self.quantized_weights[name] = UASquantize(param, bits=self.bits)

    def forward(self, x):
        for name, layer in self.model.named_children():
            x = layer(x)
            if isinstance(layer, (nn.Conv2d, nn.Linear)):
                # Quantize activations after each Conv/FC layer
                x = UASquantize(x, bits=self.bits)
        return x

q_model = QuantizedModel(model, bits=4)

sd = model.state_dict()
q_sd = q_model.state_dict()

# weight quantization only
# different bitwidth
for n_bits in (4,8, 12, 16):
  print(f'quantizing model into {n_bits} bits')
  #q_model.load_state_dict(q_sd)
  q_model.bits = n_bits
  # Test the Model
  correct = 0
  total = 0
  for images, labels in test_loader:
      if cuda_available: images = images.cuda();labels = labels.cuda()
      outputs = q_model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  print('Accuracy on the 10000 test images: % .2f %%' % (100 * correct/ total))

quantizing model into 4 bits


RuntimeError: mat1 and mat2 shapes cannot be multiplied (7168x7 and 1568x10)