In [5]:
from utilities import Utilities
from models import *
from data_loader import testloader, trainloader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
model = VGG16_quant()
model_name = "VGG16_quant"
Utilities.test_model(model, model_name, testloader, device)


device =  cuda
Test: [0/79]	Time 0.538 (0.538)	Loss 0.1848 (0.1848)	Prec 94.531% (94.531%)
 * Prec 90.130% 


In [None]:
from utilities import Utilities
weight_decay = 1e-4
epochs = 20
lr = 2e-4
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
pre_best_prec = Utilities.train_model(model, model_name, optimizer, trainloader, criterion, epochs, testloader, pre_best_prec=pre_best_prec, reg_alpha=None)

In [4]:
class SaveOutput:
    def __init__(self):
        self.outputs = []
    def __call__(self, module, module_in):
        self.outputs.append(module_in)
    def clear(self):
        self.outputs = []  
        
######### Save inputs from selected layer ##########
save_output = SaveOutput()
i = 0

for layer in model.modules():
    i = i+1
    if isinstance(layer, QuantConv2d):
        print(i,"-th layer prehooked")
        layer.register_forward_pre_hook(save_output)             
####################################################

dataiter = iter(testloader)
images, labels = next(dataiter)
images = images.to(device)
out = model(images)

3 -th layer prehooked
7 -th layer prehooked
12 -th layer prehooked
16 -th layer prehooked
21 -th layer prehooked
25 -th layer prehooked
29 -th layer prehooked
34 -th layer prehooked
38 -th layer prehooked
42 -th layer prehooked
47 -th layer prehooked
51 -th layer prehooked
55 -th layer prehooked


In [2]:
weight_q = model.features[3].weight_q
w_alpha = model.features[3].weight_quant.wgt_alpha
w_bit = 4

weight_int = weight_q / (w_alpha / (2**(w_bit-1)-1))
# print(weight_int)

In [6]:
act = save_output.outputs[1][0]
act_alpha  = model.features[3].act_alpha
act_bit = 4
act_quant_fn = act_quantization(act_bit)

act_q = act_quant_fn(act, act_alpha)

act_int = act_q / (act_alpha / (2**act_bit-1))
# print(act_int)

In [7]:


conv_int = torch.nn.Conv2d(in_channels = 64, out_channels=64, kernel_size = 3, padding=1)
conv_int.weight = torch.nn.parameter.Parameter(weight_int)
conv_int.bias = model.features[3].bias
output_int = conv_int(act_int)
output_recovered = output_int * (act_alpha / (2**act_bit-1)) * (w_alpha / (2**(w_bit-1)-1))
# print(output_recovered)

In [8]:


conv_ref = torch.nn.Conv2d(in_channels = 64, out_channels=64, kernel_size = 3, padding=1)
conv_ref.weight = model.features[3].weight_q
conv_ref.bias = model.features[3].bias
output_ref = conv_ref(act)
#print(output_ref)

# print(abs((output_ref - output_recovered)).mean())

In [10]:


# act_int.size = torch.Size([128, 64, 32, 32])  <- batch_size, input_ch, ni, nj
a_int = act_int[0,:,:,:]  # pick only one input out of batch
# a_int.size() = [64, 32, 32]

# conv_int.weight.size() = torch.Size([64, 64, 3, 3])  <- output_ch, input_ch, ki, kj
w_int = torch.reshape(weight_int, (weight_int.size(0), weight_int.size(1), -1))  # merge ki, kj index to kij
# w_int.weight.size() = torch.Size([64, 64, 9])
                      
padding = 1
stride = 1
array_size = 8 # row and column number

nig = range(a_int.size(1))  ## ni group
njg = range(a_int.size(2))  ## nj group

icg = range(int(w_int.size(1)))  ## input channel 
ocg = range(int(w_int.size(0)))  ## output channel

ic_tileg = range(int(len(icg)/array_size))
oc_tileg = range(int(len(ocg)/array_size))

kijg = range(w_int.size(2))
ki_dim = int(math.sqrt(w_int.size(2)))  ## Kernel's 1 dim size

######## Padding before Convolution #######
a_pad = torch.zeros(len(icg), len(nig)+padding*2, len(nig)+padding*2).cuda()
# a_pad.size() = [64, 32+2pad, 32+2pad]
a_pad[ :, padding:padding+len(nig), padding:padding+len(njg)] = a_int.cuda()
a_pad = torch.reshape(a_pad, (a_pad.size(0), -1))
# a_pad.size() = [64, (32+2pad)*(32+2pad)]


a_tile = torch.zeros(len(ic_tileg), array_size,    a_pad.size(1)).cuda() 
w_tile = torch.zeros(len(oc_tileg)*len(ic_tileg), array_size, array_size, len(kijg)).cuda() 
with torch.no_grad():
    for ic_tile in ic_tileg:
        a_tile[ic_tile,:,:] = a_pad[ic_tile*array_size:(ic_tile+1)*array_size,:]
with torch.no_grad():
    for ic_tile in ic_tileg:
        for oc_tile in oc_tileg:
            w_tile[oc_tile*len(oc_tileg) + ic_tile,:,:,:] = w_int[oc_tile*array_size:(oc_tile+1)*array_size, ic_tile*array_size:(ic_tile+1)*array_size, :]



###########################################

p_nijg = range(a_pad.size(1)) ## psum nij group

psum = torch.zeros(len(ic_tileg), len(oc_tileg), array_size, len(p_nijg), len(kijg)).cuda() 
with torch.no_grad():
    for kij in kijg:
        for ic_tile in ic_tileg:       # Tiling into array_sizeXarray_size array
            for oc_tile in oc_tileg:   # Tiling into array_sizeXarray_size array        
                for nij in p_nijg:       # time domain, sequentially given input
                        m = nn.Linear(array_size, array_size, bias=False)
                        #m.weight = torch.nn.Parameter(w_int[oc_tile*array_size:(oc_tile+1)*array_size, ic_tile*array_size:(ic_tile+1)*array_size, kij])
                        m.weight = torch.nn.Parameter(w_tile[len(oc_tileg)*oc_tile+ic_tile,:,:,kij])
                        psum[ic_tile, oc_tile, :, nij, kij] = m(a_tile[ic_tile,:,nij]).cuda()
 

In [17]:
m = nn.Linear(array_size, array_size, bias=False)
w_tile[0,:,:,0]

tensor([[ 2.0000,  2.0000, 15.0000,  0.0000, 14.0000, 13.0000, 12.0000, 15.0000],
        [ 3.0000, 13.0000, 14.0000,  1.0000,  3.0000, -0.0000,  0.0000,  5.0000],
        [15.0000, 14.0000,  0.0000, 15.0000, -0.0000, 15.0000, 14.0000, 14.0000],
        [ 0.0000,  0.0000, 15.0000, 14.0000,  2.0000,  2.0000,  1.0000,  2.0000],
        [ 2.0000,  4.0000, 14.0000, -0.0000,  1.0000,  1.0000,  0.0000,  1.0000],
        [ 1.0000,  4.0000,  0.0000, 15.0000, 15.0000,  2.0000,  1.0000, 14.0000],
        [ 0.0000,  1.0000, 14.0000, 14.0000,  2.0000, 11.0000, -0.0000, 13.0000],
        [ 3.0000,  2.0000, 12.0000,  1.0000, 15.0000, 11.0000,  1.0000, 14.0000]],
       device='cuda:0')

In [18]:
a_tile[0,:,200]

tensor([1.0000, 0.0000, 4.0000, 0.0000, 0.0000, 1.0000, 3.0000, 4.0000],
       device='cuda:0')

In [None]:
import math

a_pad_ni_dim = int(math.sqrt(a_pad.size(1))) # 32

o_ni_dim = int((a_pad_ni_dim - (ki_dim- 1) - 1)/stride + 1)
o_nijg = range(o_ni_dim**2)    
    
out = torch.zeros(len(ocg), len(o_nijg)).cuda()
  
   
### SFP accumulation ###
with torch.no_grad():
    for o_nij in o_nijg: 
        for kij in kijg:
            for ic_tile in ic_tileg:    
                for oc_tile in oc_tileg:   
                    out[oc_tile*array_size:(oc_tile+1)*array_size, o_nij] = out[oc_tile*array_size:(oc_tile+1)*array_size, o_nij] + \
                    psum[ic_tile, oc_tile, :, int(o_nij/o_ni_dim)*a_pad_ni_dim + o_nij%o_ni_dim + int(kij/ki_dim)*a_pad_ni_dim + kij%ki_dim, kij]
                ## 4th index = (int(o_nij/30)*32 + o_nij%30) + (int(kij/3)*32 + kij%3)

In [None]:
out_2D = torch.reshape(out, (out.size(0), o_ni_dim, -1))
difference = (out_2D - output_int[0,:,:,:])
# print(difference.sum())

In [11]:
### show this cell partially. The following cells should be printed by students ###
tile_id = 0 
nij = 200 # just a random number
X = a_tile[tile_id,:,nij:nij+64]  # [tile_num, array row num, time_steps]

bit_precision = 4
file = open('activation.txt', 'w') #write to file
file.write('#time0row7[msb-lsb],time0row6[msb-lst],....,time0row0[msb-lst]#\n')
file.write('#time1row7[msb-lsb],time1row6[msb-lst],....,time1row0[msb-lst]#\n')
file.write('#................#\n')

for i in range(X.size(1)):  # time step
    for j in range(X.size(0)): # row #
        X_bin = '{0:04b}'.format(round(X[7-j,i].item()))
        for k in range(bit_precision):
            file.write(X_bin[k])        
        #file.write(' ')  # for visibility with blank between words, you can use
    file.write('\n')
file.close() #close file    


In [None]:
X.shape

In [14]:
### Complete this cell ###
tile_id = 0 
kij = 0
W = w_tile[tile_id,:,:,kij]  # w_tile[tile_num, array col num, array row num, kij]


bit_precision = 4
file = open('weight.txt', 'w') #write to file
file.write('#col0row7[msb-lsb],col0row6[msb-lst],....,col0row0[msb-lst]#\n')
file.write('#col1row7[msb-lsb],col1row6[msb-lst],....,col1row0[msb-lst]#\n')
file.write('#................#\n')
for col in range(W.size(0)):
    for row in range(W.size(1)-1, -1, -1):
        weight = W[col,row]
        if weight < 0:
            weight += 16
        file.write(int_to_fixed_binary_alt(int(weight), 4))
    file.write('\n')
file.close()
        

...

Ellipsis

In [13]:
def int_to_fixed_binary_alt(number, width):
    binary_representation = bin(number)[2:]  # Remove the '0b' prefix
    return binary_representation.zfill(width)

int_to_fixed_binary_alt(7, 4)

'0111'

In [15]:
W[0,:] # check this number with your 2nd line in weight.txt

tensor([ 2.,  2., 15.,  0., 14., 13., 12., 15.], device='cuda:0')

In [16]:
### Complete this cell ###
ic_tile_id = 0 
oc_tile_id = 0 


kij = 0
nij = 200
psum_tile = psum[ic_tile_id,oc_tile_id,:,nij:nij+64,kij]  

# psum[len(ic_tileg), len(oc_tileg), array_size, len(p_nijg), len(kijg)]

        
bit_precision = 16
file = open('psum.txt', 'w') #write to file
file.write('#time0col7[msb-lsb],time0col6[msb-lst],....,time0col0[msb-lst]#\n')
file.write('#time1col7[msb-lsb],time1col6[msb-lst],....,time1col0[msb-lst]#\n')
file.write('#................#\n')

neg_offset = 2**bit_precision

for time in range(psum_tile.size(1)):
    for col in range(psum_tile.size(0) - 1, -1, -1):
        curr_psum = psum_tile[col,time]
        if curr_psum < 0:
            curr_psum += neg_offset
        file.write(int_to_fixed_binary_alt(int(curr_psum), bit_precision))
    file.write('\n')
file.close()        
...

Ellipsis

In [None]:
psum_tile.shape