In [17]:
import torch
import torch.nn as nn
from torch.autograd import Variable, Function
import torch.nn.functional as F

from torchvision import datasets, transforms
import numpy as np

batch_size = 128
n_epochs = 1000
validation_steps = 10
learning_rate = 5e-3
stochastic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [24]:
x = np.random.randn(16)
layer = nn.Linear(16, 32).to(device)
x = torch.FloatTensor(x).to(device)

In [115]:
def roundpass(x):
    yOut = torch.round(x)
    yGrad = x
    y = torch.detach(yOut - yGrad) + yGrad
    return y

class QuantizedLinear(nn.Module):
    def __init__(self, original_linear, bitwidth=16):
        super(QuantizedLinear, self).__init__()
        self.weight = original_linear.weight
        self.bias = original_linear.bias
        self.Qn = - (2 ** (bitwidth - 1))
        self.Qp = 2 ** (bitwidth - 1) - 1
        
        # Initialize weights
        detached_weights = ql.weight.data.cpu().numpy()
        step = np.array((2 * np.mean(np.abs(detached_weights))) / np.sqrt(self.Qp))
        step = torch.from_numpy(step)
        self.register_buffer('step_size', step)
        self.step_size.requires_grad = True
        
    def forward(self, inputs):
        quantized_weights = self.weight / self.step_size
        quantized_weights = torch.clamp(quantized_weights, self.Qn, self.Qp)
        quantized_weights = roundpass(quantized_weights)
        output = F.linear(inputs, quantized_weights, self.bias)
        return output

In [116]:
ql = QuantizedLinear(layer).to(device)

In [120]:
output = ql.forward(x)
loss = output.sum()
loss.backward()

In [127]:
ql.step_size.requires_grad

True

In [126]:
ql.step_size.grad

In [124]:
ql.weight.grad

tensor([[-11446.0273,   3006.7192,   3917.5645,    129.7033,  -1610.7875,
           2397.1521,  -2264.2502,   4144.9526,   4077.1482,  -2125.9324,
          -1056.8867,  11465.2686,   1488.8735,  -3967.8799,   5313.6401,
           3208.3643],
        [-11446.0273,   3006.7192,   3917.5645,    129.7033,  -1610.7875,
           2397.1521,  -2264.2502,   4144.9526,   4077.1482,  -2125.9324,
          -1056.8867,  11465.2686,   1488.8735,  -3967.8799,   5313.6401,
           3208.3643],
        [-11446.0273,   3006.7192,   3917.5645,    129.7033,  -1610.7875,
           2397.1521,  -2264.2502,   4144.9526,   4077.1482,  -2125.9324,
          -1056.8867,  11465.2686,   1488.8735,  -3967.8799,   5313.6401,
           3208.3643],
        [-11446.0273,   3006.7192,   3917.5645,    129.7033,  -1610.7875,
           2397.1521,  -2264.2502,   4144.9526,   4077.1482,  -2125.9324,
          -1056.8867,  11465.2686,   1488.8735,  -3967.8799,   5313.6401,
           3208.3643],
        [-11446.0273

In [113]:
output.backward()

RuntimeError: grad can be implicitly created only for scalar outputs

In [79]:
output.backward()

AttributeError: 'NoneType' object has no attribute 'backward'

AttributeError: 'NoneType' object has no attribute 'backward'

In [21]:
np.mean(np.abs(detached_weights))

NameError: name 'detached_weights' is not defined

TypeError: new(): data must be a sequence (got numpy.float64)

32767

array([[-0.23882818,  0.13489422,  0.15865955,  0.19368696,  0.18919614,
         0.23255193,  0.18171448,  0.09192187, -0.22875121, -0.06435812,
         0.06657663,  0.14900512,  0.07796848,  0.15100685, -0.24461892,
         0.0107131 ],
       [ 0.02207163, -0.15304941,  0.07893223, -0.16988519,  0.00978947,
         0.15209594,  0.07984811,  0.22528166,  0.24292108,  0.09890577,
         0.10650149, -0.23040971, -0.06547421,  0.07239476,  0.10172284,
        -0.01587471],
       [ 0.18991166, -0.24439281,  0.11326149, -0.14285457, -0.09417358,
         0.14739051,  0.23646489,  0.2156525 , -0.23469839, -0.00440988,
         0.08157218, -0.01371172,  0.02116072, -0.11603403, -0.1272282 ,
        -0.02461731],
       [ 0.21519399, -0.22859225, -0.14644727,  0.04571882, -0.01975226,
         0.2181378 ,  0.09051257,  0.12682173, -0.23262504, -0.04794139,
         0.03930071,  0.19086707,  0.11573836,  0.2486201 , -0.15298477,
        -0.21225253],
       [-0.02800304,  0.2136139 , -0

In [5]:
# bl = BinarizedLinear(28 * 28, 32, stochastic=stochastic).to(device)
# model = dumdums(image_size=28, stochastic=stochastic).to(device)
# loss_function = L2SVMLoss().to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)