In [1]:
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import Amazon
import torch_geometric.transforms as T
#from torch_geometric.nn import GATConv, GATv2Conv
from decimal import Decimal
import os
import numpy as np
import time
#%load_ext autoreload
#%autoreload 2

import sys
sys.path.insert(1, '/home/xilinx/jupyter_notebooks/sgrace_lib')

import config
from sgrace import init_SGRACE,GATConv_SGRACE, Relu_SGRACE

# Fix the PyTorch RNG seed so weight initialisation is repeatable between runs.
torch.manual_seed(12345)

# The node degree is calculated from adj; if a row has a node degree of zero, the features of that node are set to zero.
# I expected deep quantization to produce more all-zero rows, but this is not the case. The normalization
# seems to make the adj values larger, so after quantization they are still nonzero. This is probably not a bad thing, since
# removing nodes that were initially connected would hurt accuracy.
# In summary, quantization reduces the number of connections for a node, but nodes with just a single connection remain connected.
norm_adj = 1 # use normalized adjacency (NOTE(review): not read anywhere in the visible notebook - confirm it is still needed)
custom = 1 # 0 => train the full-precision GAT baseline; any other value => train GAT_PYNQ (SGRACE layers)
full_graph = 0 # 1 => Planetoid dataset with DataLoader; any other value => Amazon dataset with NeighborLoader

batch_value = 128 # not relevant for Planetoid, which is a single graph; relevant for Amazon
num_epochs = 200 # number of training epochs; also used as the x-axis length in the plotting cell

#gnn max size

# Imported from the sgrace library; presumably sets up the accelerator/overlay state - TODO confirm.
init_SGRACE()



In [2]:
import math
# No feature transform is applied when loading the dataset.
transform = None

# `full_graph` picks the dataset family: Planetoid (citation graphs) when it is 1,
# Amazon co-purchase graphs otherwise.
if full_graph == 1:
    # Other Planetoid options: "Pubmed", "Citeseer"
    dataset_sel = "Cora"
    dataset = Planetoid(root="data/Planetoid", name=dataset_sel, split="full", transform=transform)
else:
    # Other Amazon option: 'Computers'
    dataset_sel = 'Photo'
    dataset = Amazon(root="data/Amazon", name=dataset_sel, transform=transform)

# Dataset-level summary.
print(
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

data = dataset[0]  # Get the first graph object.

# Edges per node; sym_norm2 later derives its self-loop weight from this module-level value.
average_node_degree = data.num_edges / data.num_nodes

# Statistics of the first graph.
print(
    '',
    data,
    '=============================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {average_node_degree:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)

print("average node degree")
print(average_node_degree)
print("Fill value")
print(math.log2(average_node_degree))




Dataset: AmazonPhoto():
Number of graphs: 1
Number of features: 745
Number of classes: 8

Data(x=[7650, 745], edge_index=[2, 238162], y=[7650])
Number of nodes: 7650
Number of edges: 238162
Average node degree: 31.13
Has isolated nodes: True
Has self-loops: False
Is undirected: True
average node degree
31.132287581699348
Fill value
4.960339683950122


In [3]:
# Build the train/test loaders. Planetoid uses the masks shipped with the dataset and a plain
# DataLoader; Amazon gets hand-made masks and neighbour-sampled mini-batches.
if full_graph == 1:
    from torch_geometric.loader import DataLoader

    train_loader = DataLoader(dataset, batch_size=batch_value, shuffle=True)
    test_loader = DataLoader(dataset, batch_size=batch_value, shuffle=False)
else:
    from torch_geometric.loader import NeighborLoader

    data = dataset[0]
    node_count = data.num_nodes

    # Split by node index: everything except the last 1000 nodes is used for training ...
    train_mask = torch.zeros(node_count, dtype=torch.bool)
    train_mask[:node_count - 1000] = True
    data.train_mask = train_mask

    # ... and the first 500 of the last 1000 nodes for testing (the final 500 stay unused).
    test_mask = torch.zeros(node_count, dtype=torch.bool)
    test_mask[node_count - 1000:node_count - 500] = True
    data.test_mask = test_mask

    # One sampling hop with up to 10 neighbours per seed node; no shuffling for either loader.
    train_loader = NeighborLoader(
        data,
        batch_size=batch_value,
        num_neighbors=[10],
        input_nodes=data.train_mask,
        shuffle=False,
    )
    test_loader = NeighborLoader(
        data,
        batch_size=batch_value,
        num_neighbors=[10],
        input_nodes=data.test_mask,
        shuffle=False,
    )

   



In [4]:
from torch.nn import Linear
import torch.nn.functional as F
from torch.nn import LeakyReLU
from torch_geometric.nn import GATConv
from torch_geometric.nn import global_mean_pool
#from ogb.graphproppred.mol_encoder import AtomEncoder

class GAT(torch.nn.Module):
    """Full-precision baseline: two GATConv layers followed by a linear classifier.

    Layer sizes are taken from the module-level `dataset`
    (`num_node_features` in, `num_classes` out).

    Args:
        hidden_channels: output width of both GATConv layers.
    """

    def __init__(self, hidden_channels):
        super(GAT, self).__init__()
        self.att = GATConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch=None):
        """Return per-node class logits.

        Args:
            x: node feature tensor (cast to float before the first layer).
            edge_index: edge index tensor passed to both GATConv layers.
            batch: unused. It is optional so that `model(x, edge_index)`, which is how
                `train()` and `test()` call the model, no longer raises a TypeError;
                callers that still pass it positionally keep working.

        Returns:
            The output of the final linear layer (raw logits, no softmax applied).
        """
        # 1. Obtain node embeddings
        x = x.float()

        layer_start = time.time()
        x = self.att(x, edge_index)
        x = x.relu()
        if (config.profiling == 1):
            print('conv1 layer timing : {:.5f}s'.format(time.time() - layer_start))

        layer_start = time.time()
        x = self.conv2(x, edge_index)
        if (config.profiling == 1):
            print('conv2 layer timing : {:.5f}s'.format(time.time() - layer_start))

        # 2. No readout/pooling: predictions are made per node.
        x = x.relu()

        # 3. Apply a final classifier (dropout is only active in training mode).
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x


# Instantiate the full-precision baseline and print its layer summary.
# Note: `model` is rebound further down (after the GAT_PYNQ definition and again in the training cell).
model = GAT(hidden_channels=config.hidden_channels)
print(model)

GAT(
  (att): GATConv(745, 16, heads=1)
  (conv2): GATConv(16, 16, heads=1)
  (lin): Linear(in_features=16, out_features=8, bias=True)
)


In [5]:
from torch.nn import Linear
from torch.nn import LeakyReLU
import torch.nn.functional as F
import math
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_scatter import scatter_add
from torch_geometric.utils import add_remaining_self_loops,add_self_loops,sort_edge_index,degree


import pandas as pd

def sym_norm(edge_index, num_nodes, edge_weight=None, improved=False, dtype=None):
    """Add missing self-loops and scale every edge by deg^-1/2 of both of its endpoints.

    Args:
        edge_index: edge index tensor; its first row is used as the index for the degree sum.
        num_nodes: number of nodes (size of the degree vector).
        edge_weight: optional per-edge weights; defaults to all ones.
        improved: when true, added self-loops get weight 2 instead of 1.
        dtype: dtype of the default edge weights.

    Returns:
        (edge_index with self-loops, normalised edge weights).
    """
    if edge_weight is None:
        n_edges = edge_index.size(1)
        edge_weight = torch.ones((n_edges, ), dtype=dtype, device=edge_index.device)

    loop_weight = 2 if improved else 1
    edge_index, edge_weight = add_remaining_self_loops(edge_index, edge_weight, loop_weight, num_nodes)

    src, dst = edge_index
    weighted_degree = scatter_add(edge_weight, src, dim=0, dim_size=num_nodes)
    inv_sqrt_degree = weighted_degree.pow(-0.5)
    # A zero degree gives inf after pow(-0.5); map those entries to 0.
    inv_sqrt_degree.masked_fill_(inv_sqrt_degree == float('inf'), 0)

    normalised = inv_sqrt_degree[src] * edge_weight * inv_sqrt_degree[dst]
    return edge_index, normalised

def sym_norm2(edge_index, num_nodes, edge_weight=None, improved=False, dtype=None):
    """Symmetric normalisation with a degree-dependent self-loop weight and sorted edges.

    Differs from `sym_norm` in two ways: added self-loops are weighted with the truncated
    log2 of the average node degree (instead of 1), and the edges are sorted after the
    self-loops have been appended.

    Note: reads the module-level `average_node_degree` computed in the dataset cell, so that
    cell must have been run first. The value describes the whole dataset graph, not the
    mini-batch passed in here.

    Args:
        edge_index: edge index tensor; its first row is used as the index for the degree sum.
        num_nodes: number of nodes (size of the degree vector).
        edge_weight: optional per-edge weights; defaults to all ones.
        improved: when true, added self-loops get weight 2.
        dtype: dtype of the default edge weights.

    Returns:
        (sorted edge_index with self-loops, normalised edge weights).
    """
    if edge_weight is None:
        edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype, device=edge_index.device)

    # The per-node degree vector that used to be computed here was never read, so it is gone.
    fill_value = math.trunc(math.log2(average_node_degree)) if not improved else 2

    edge_index, edge_weight = add_remaining_self_loops(edge_index, edge_weight, fill_value, num_nodes)

    # Make sure that self loops are in order.
    edge_index, edge_weight = sort_edge_index(edge_index, edge_weight)

    row, col = edge_index
    deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
    deg_inv_sqrt = deg.pow(-0.5)
    # A zero degree gives inf after pow(-0.5); map those entries to 0.
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0

    return edge_index, deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]



class GAT_PYNQ(torch.nn.Module):
    """Two SGRACE attention layers with an SGRACE ReLU in between, then a linear classifier.

    Layer sizes come from the module-level `dataset`; run-time switches
    (`profiling`, `compute_attention`, `min_output`) are read from `config`.

    Args:
        hidden_channels: width passed to both attention layers and the classifier input.
        head_count: number of heads passed to the first attention layer.
    """

    def __init__(self, hidden_channels,head_count):
        super().__init__()
        print("GAT_PYNQ INIT")

        self.att2 = GATConv_SGRACE(
            dataset.num_node_features, hidden_channels, head_count,
            dropout=0.1, alpha=0.2, concat=False,
        )
        # NOTE(review): the first layer is built with concat=False while this layer expects
        # hidden_channels*head_count inputs - confirm this is intended for head_count > 1.
        self.conv22 = GATConv_SGRACE(hidden_channels*head_count, hidden_channels, 1)
        self.reluh = Relu_SGRACE()
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        """Return per-node class logits for the (sub)graph given by `x` and `edge_index`.

        When `config.profiling == 1` the time of each stage is printed in milliseconds.
        """
        profiling = (config.profiling == 1)
        if profiling:
            model_start = time.time()

        # Normalised edge weights (self-loops added, edges sorted) and the matching sparse adjacency.
        edge_index, norm = sym_norm2(edge_index, x.size(0), improved=False)
        adj = torch.sparse_coo_tensor(edge_index, norm)

        # ---- Layer 1: dense flag 0, relu flag 1 ----
        dense = 0
        relu = 1
        if profiling:
            stage_start = time.time()
        x = self.att2(config.compute_attention, dense, relu, x, edge_index, norm, adj)
        if profiling:
            print('L1 layer time: {:.5f}ms'.format(1000*(time.time() - stage_start)))

        # ---- ReLU between the layers ----
        # Original author's note: enable this to unmerge relu and take into account that
        # relu is done in hardware.
        if profiling:
            stage_start = time.time()
        x = self.reluh(x)
        dense = 1  # hardware execution mode for layer 2: 1 => features are dense
        if config.min_output == 0:
            print("SECOND LAYER ON")
        relu = 0
        if profiling:
            print('Relu time: {:.5f}ms'.format(1000*(time.time() - stage_start)))

        # ---- Layer 2: dense flag 1, relu flag 0 ----
        if profiling:
            stage_start = time.time()
        x = self.conv22(config.compute_attention, dense, relu, x, edge_index, norm, adj)
        if profiling:
            print('L2 layer time: {:.5f}ms'.format(1000*(time.time() - stage_start)))

        # ---- Classifier head (dropout is only active in training mode) ----
        if profiling:
            stage_start = time.time()
        x = x.float()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        if profiling:
            print('Readout time: {:.5f}ms'.format(1000*(time.time() - stage_start)))
            print('Model time {:.5f}ms'.format(1000*(time.time() - model_start)))

        return x

# Instantiate once to print the layer summary. The training cell below rebinds `model`
# depending on `custom`, so this particular instance is not the one that gets trained.
model = GAT_PYNQ(config.hidden_channels,config.head_count)
print(model)

GAT_PYNQ INIT
GAT_PYNQ(
  (att2): GATConv_SGRACE (745 -> 16)
  (conv22): GATConv_SGRACE (16 -> 16)
  (reluh): Relu_SGRACE()
  (lin): Linear(in_features=16, out_features=8, bias=True)
)


In [None]:
#from IPython.display import Javascript
#display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))
from torch_geometric.utils.convert import to_scipy_sparse_matrix
from scipy import sparse
from torch_geometric.utils import to_dense_adj


# Pick the model under test: the full-precision GAT baseline (custom == 0) or the SGRACE model.
if custom == 0:
    model = GAT(config.hidden_channels)
else:
    model = GAT_PYNQ(config.hidden_channels, config.head_count)
    model_path = "models/model_Cora_8bit_gat.ptx"
    #model.load_state_dict(torch.load(model_path),strict=False)

# The optimizer and loss are created for BOTH model choices. They used to live inside the
# `else` branch above, so running with custom == 0 failed in train() with a NameError.
#
# Final results: the learning rate matters a lot for the different quantizations.
#   8-4 bit weights -> low learning rate (0.005 was also tried)
#   2-1 bit weights -> high learning rate (0.05 / 0.1)
# The full-precision baseline uses the low learning rate as well.
if custom == 0 or config.w_qbits > 2 or config.acc_deep == 0:
    print("Using low learning rate")
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
else:
    print("Using high learning rate")
    # When starting from a loaded state dict, a low learning rate is the better choice.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # GAT benefits from this ?

criterion = torch.nn.CrossEntropyLoss()

def zero_count(array, n):
    """Report how many of the first `n` entries of `array` are zero.

    Prints the number of inspected values and the number of zeros.

    Returns:
        True when strictly more than n // 2 of the inspected entries are zero.
    """
    zeros = sum(1 for idx in range(n) if array[idx] == 0)
    print("total values ",n)
    print("zero values ",zeros)
    majority_threshold = n // 2
    return zeros > majority_threshold


def accuracy(x, labels, dataset_sel):
    """Fraction of positions where the predictions `x` equal `labels`.

    Args:
        x: predicted class ids, either a Python list or a NumPy array.
        labels: reference class ids with a shape comparable to `x`.
        dataset_sel: accepted for interface compatibility; not used.

    Returns:
        Number of matching entries divided by the number of predictions.
    """
    hits = np.sum(np.equal(x, labels))
    total = len(x) if isinstance(x, list) else x.size
    return hits / total


def train():
    """Run one pass over `train_loader`, updating the global `model` after every batch.

    Uses the module-level `model`, `criterion`, `optimizer`, `custom` and `config`.
    With `config.profiling == 1` the backward time is printed for every batch; the forward
    time is printed only when the custom (SGRACE) model is selected.
    """
    model.train()
    for batch in train_loader:  # Iterate in batches over the training dataset.
        clock = time.time()
        out = model(batch.x, batch.edge_index)  # Perform a single forward pass.
        if custom != 0 and config.profiling == 1:
            print('Forward train time: {:.5f}s'.format(time.time() - clock))

        # Only the nodes flagged in the batch's train mask contribute to the loss.
        train_nodes = batch.train_mask
        loss = criterion(out[train_nodes], batch.y[train_nodes])

        clock = time.time()
        loss.backward()  # Derive gradients.
        if config.profiling == 1:
            print('backward time: {:.5f}s'.format(time.time() - clock))

        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

def test(loader,split):
    """Evaluate the global `model` on `loader` and return its accuracy.

    Args:
        loader: iterable of batches exposing `x`, `edge_index`, `y` and the mask for `split`.
        split: "train" to score the nodes in `batch.train_mask`, "test" for `batch.test_mask`.

    Returns:
        Ratio of correct predictions over all selected nodes of all batches.

    Raises:
        ValueError: if `split` is neither "train" nor "test". Previously this surfaced
            indirectly as a ValueError from np.concatenate on an empty list.
    """
    if split not in ("train", "test"):
        raise ValueError('split must be "train" or "test", got {!r}'.format(split))

    model.eval()

    preds_l = []
    labels_l = []

    for batch in loader:  # Iterate in batches over the training/test dataset.
        if (config.profiling == 1):
            tmult = time.time()
        out = model(batch.x, batch.edge_index)
        if (config.profiling == 1):
            print('Forward test time: {:.5f}s'.format(time.time() - tmult))

        mask = batch.train_mask if split == "train" else batch.test_mask
        preds_l.append(out[mask].detach().numpy())
        labels_l.append(batch.y[mask].detach().numpy())

    # Concatenate and take the argmax once, after the loop. Doing it inside the loop
    # re-processed every accumulated batch on each iteration (quadratic work).
    preds = np.argmax(np.concatenate(preds_l), axis=1)
    pred_acc = accuracy(preds, np.concatenate(labels_l), dataset_sel)

    return pred_acc  # Derive ratio of correct predictions.


print('Running inference only with train and test data sets')

# Baseline: score the freshly initialised model once on both splits before any training.
for epoch in range(1):
    epoch_start = time.time()
    test_acc = test(test_loader, "test")
    train_acc = test(train_loader, "train")
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Time: {(time.time() - epoch_start):.4f}')

print("Training")

# Best test accuracy seen so far and the epoch it was reached in.
best_acc = 0
best_epoch = 0

for epoch in range(num_epochs):
    epoch_start = time.time()
    train()
    # Train accuracy is skipped here to keep the epochs fast.
    test_acc = test(test_loader, "test")

    if test_acc > best_acc:
        best_acc, best_epoch = test_acc, epoch

    print(f'Epoch: {epoch:03d}, Test Acc: {test_acc:.4f}, Time: {(time.time() - epoch_start):.4f}')

print(' ')

print('Best accuracy: ', best_acc)
print('Best epoch: ', best_epoch)




GAT_PYNQ INIT
Using low learning rate
Running inference only with train and test data sets
Epoch: 000, Train Acc: 0.1084, Test Acc: 0.0961, Time: 3.3050
Training
Epoch: 000, Test Acc: 0.3142, Time: 4.7042
Epoch: 001, Test Acc: 0.4809, Time: 4.6638
Epoch: 002, Test Acc: 0.5540, Time: 4.6946
Epoch: 003, Test Acc: 0.5760, Time: 4.6868
Epoch: 004, Test Acc: 0.6307, Time: 4.6839
Epoch: 005, Test Acc: 0.6764, Time: 4.6645
Epoch: 006, Test Acc: 0.7252, Time: 4.6592
Epoch: 007, Test Acc: 0.7549, Time: 4.6688
Epoch: 008, Test Acc: 0.7617, Time: 4.6643
Epoch: 009, Test Acc: 0.7817, Time: 4.6684
Epoch: 010, Test Acc: 0.8239, Time: 4.6611
Epoch: 011, Test Acc: 0.8371, Time: 4.6586
Epoch: 012, Test Acc: 0.8521, Time: 4.6900
Epoch: 013, Test Acc: 0.8496, Time: 4.6755
Epoch: 014, Test Acc: 0.8510, Time: 4.6638
Epoch: 015, Test Acc: 0.8756, Time: 4.6810
Epoch: 016, Test Acc: 0.8669, Time: 4.6630
Epoch: 017, Test Acc: 0.8735, Time: 4.7836
Epoch: 018, Test Acc: 0.8729, Time: 4.6608
Epoch: 019, Test Acc:

In [None]:
# Best test accuracy recorded by the training loop in the previous cell.
print(best_acc)

In [None]:
from matplotlib import pyplot as plt
import pandas as pd

#fig, ax = plt.subplots()
# Plot test accuracy per epoch for seven runs stored in `accuracies_bits`.
#
# NOTE(review): `accuracies_bits` is not defined anywhere in the visible notebook; this cell
# only works if an earlier session left it in the kernel. It looks like it should hold one
# accuracy series per row (at least 7 rows of at least `num_epochs` values) - confirm and
# load it explicitly.
plt.figure(figsize=(10, 10))

# Reference GAT curve. It is loaded but currently not drawn (see the commented plot call below).
accuracy_gat = pd.read_csv('accuracies_gat_2a2f2w_cora.csv', sep=' ', header=None)
accuracy_gat = accuracy_gat.to_numpy()
accuracy_gat = accuracy_gat[0][0:num_epochs]

# One curve per row, labelled 'at0' ... 'at6'. All rows are sliced before anything is
# drawn, replacing seven copy-pasted slice/plot statement pairs.
accuracy_series = [accuracies_bits[series_idx][0:num_epochs] for series_idx in range(7)]
for series_idx, series in enumerate(accuracy_series):
    plt.plot(range(num_epochs), series, label=f'at{series_idx}')
#plt.plot(range(num_epochs), accuracy_gat, label='GAT')

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
from matplotlib import pyplot as plt

# Debug cell: histogram of the weight buffer, one test pass, then read-back of the
# accelerator's max-feature register and release of the hardware resources.
#
# NOTE(review): B_buffer, f_align, cur_max_fea, cur_max_adj, cur_min_adj, w_qbits, a_qbits,
# f_qbits, w_min/w_max, a_min/a_max, f_min/f_max, generate_quantization_qbits_constants,
# generate_quantization_uqbits_constants, my_ip, ol and the *_buffer objects are not defined
# in the visible notebook. Presumably they were once module-level names of the SGRACE
# library or an earlier cell - confirm before running this cell on a fresh kernel.
# NOTE(review): the literal 16 below presumably matches config.hidden_channels - confirm.
print("Number of values in weight matrix")
print(dataset.num_features*16)
print(B_buffer[0:(dataset.num_features*16)]/2**f_align)

# Histogram bin edges: the integers -128 ... 127.
mybins = []
#for k in range((-2**(w_qbits)),(2**(w_qbits))):
for k in range(-128,128):
#for k in range(-8,7):
    mybins += [k]
#mybins = [-2, -1, 0, 1]
# Weight values divided by 2**f_align.
y = B_buffer[0:(dataset.num_features*16)]/2**f_align
#y = B_buffer[0:(dataset.num_features*16)]


# Count how many weights are exactly -8.
print("Number of selected value")
print(np.count_nonzero(y == -8))

plt.figure(figsize=(10, 10))
plt.xlabel('Weights')
plt.ylabel('Frequency')
#counts, bins, bars = plt.hist(y, bins=256)
counts, bins, bars = plt.hist(y,mybins)
#plt.yticks(np.arange(0, 12000, step=500))
#plt.xticks(mybins,horizontalalignment='center',fontsize=12,rotation=90)
plt.xticks(horizontalalignment='center',fontsize=12,rotation=90)
plt.show()

print("MAX FEA INTERNAL VALUE ", cur_max_fea)
print("MAX ADJ OUT VALUE ", cur_max_adj)
print("MIN ADJ OUT VALUE ", cur_min_adj)
print("Use this to adjust your hardware ITYPE width")
#print(bins)
#print(counts)


#test inference
# write_file is assigned here but not read again in this cell.
write_file = 0
print("Single TEST with ",w_qbits)
print("generating qbits w constants with bits: ",w_qbits)
#signed w
w_s_o,w_s,w_z=generate_quantization_qbits_constants(w_min, w_max,w_qbits)
#unsigned a and f
#print("generating qbits a constants")
a_s_o,a_s,a_z=generate_quantization_uqbits_constants(a_min, a_max,a_qbits)
#print("generating qbits f constants")
f_s_o,f_s,f_z=generate_quantization_uqbits_constants(f_min, f_max,f_qbits)
# Product of the first value returned for weights, features and adjacency;
# not used again in this cell.
deq_o = w_s_o*f_s_o*a_s_o

#8 bit 
#my_ip.register_map.scale_fea = 0x00000200 #scale fea
#internal_quantization = 0x0000007F #8 bit (-127,127)
#deq_o=deq_o*pow(2, 23)
    

#internal_quantization = 0x7FFFFFFF #all bits
#internal_quantization = 0x01FFE000 #8 bit (-127,127)
#internal_quantization = 0x01FFE0000 #8 bit (-127,127)
#internal_quantization = 0x01C00000 #4 bit (-7,7)
#internal_quantization = 0x00100000 #2 bit (-1,1)
# Time a single evaluation pass over the test loader.
amult = time.time()
print("Running test")
test_acc = test(test_loader,"test")
#test_acc = test(test_loader) 
print(f'Test Acc: {test_acc:.4f}, Time: {(time.time() - amult):.4f}')
#np.set_printoptions(formatter={'int':hex})
#print(B_buffer[0:10] & (2**32-1))
#isSparse(B_buffer,1,(weights.shape[0]*weights.shape[1]))
#print(B_buffer[0:10] & (2**32-1))
#my_ip.register_map.max_fea_i = 0xFF
# The control register is read once; the polling loop below is commented out, so the
# value is not checked before max_fea is read.
max_fea_done = my_ip.register_map.max_fea_ctrl
#while max_fea_done == 0:
# max_fea_done = my_ip.register_map.max_fea_ctrl
print("MAX FEA")
max_fea=my_ip.register_map.max_fea
print(max_fea)
#if (f_align==18):
# max_fea_short = (int(max_fea) >> 11) #this is the real max value with 8bit (f_align 18)
#else:
# max_fea_short = (int(max_fea) >> 19) #this is the real max value with 4bit (f_align 22)
#max_fea_short = (int(max_fea) >> 19) #this is the real max value with 4bit
#print("real max fea ",max_fea_short)
#print("Scaling factor to fit in 16 bits")
#print(0x7FFF/max_fea_short)
#print("Scaling factor to fit in 8 bits")
#print(0x7F/max_fea_short)
#print("Scaling factor to fit in 4 bits")
#print(0x7/max_fea_short)
#print("Scaling factor to fit in 2 bits")
#print(0x1/max_fea_short)
#print(f'Mask: {hex(frac_mask_w)} Stage: {stage:03d} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f},Test Acc: {test_acc:.4f}, Time: {(time.time() - amult):.4f}')
# Teardown: drop the IP handle, free the overlay, then release every buffer.
# After this cell the model can presumably no longer run on the hardware - confirm.
del my_ip
ol.free() 
bias_buffer.freebuffer()
profiling_buffer.freebuffer()
rowPtr_fea_buffer.freebuffer()
columnIndex_fea_buffer.freebuffer()
values_fea_buffer.freebuffer()
rowPtr_adj_buffer.freebuffer()
columnIndex_adj_buffer.freebuffer()
values_adj_buffer.freebuffer()
B_buffer.freebuffer()

## (Optional) Exercise

Can we do better than this?
As multiple papers pointed out ([Xu et al. (2018)](https://arxiv.org/abs/1810.00826), [Morris et al. (2018)](https://arxiv.org/abs/1810.02244)), applying **neighborhood normalization decreases the expressivity of GNNs in distinguishing certain graph structures**.
An alternative formulation ([Morris et al. (2018)](https://arxiv.org/abs/1810.02244)) omits neighborhood normalization completely and adds a simple skip-connection to the GNN layer in order to preserve central node information:

$$
\mathbf{x}_v^{(\ell+1)} = \mathbf{W}^{(\ell + 1)}_1 \mathbf{x}_v^{(\ell)} + \mathbf{W}^{(\ell + 1)}_2 \sum_{w \in \mathcal{N}(v)} \mathbf{x}_w^{(\ell)}
$$

This layer is implemented under the name [`GraphConv`](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GraphConv) in PyTorch Geometric.

As an exercise, you are invited to complete the following code to the extent that it makes use of PyG's `GraphConv` rather than `GCNConv`.
This should bring you close to **82% test accuracy**.

from torch_geometric.nn import GraphConv


class GNN(torch.nn.Module):
    """Exercise skeleton: three graph convolutions, mean pooling and a linear classifier.

    The three convolution layers are intentionally left as `...` placeholders for the
    reader to fill in (the exercise text suggests `GraphConv`).

    Args:
        hidden_channels: input width of the final linear layer.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = ...  # TODO
        self.conv2 = ...  # TODO
        self.conv3 = ...  # TODO
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch."""
        # Node embeddings: ReLU after the first two convolutions, none after the third.
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.conv3(h, edge_index)

        # Readout: average the node embeddings per graph.
        pooled = global_mean_pool(h, batch)

        # Classifier (dropout is only active in training mode).
        pooled = F.dropout(pooled, p=0.5, training=self.training)
        return self.lin(pooled)

# Build the exercise model and print its structure (its conv layers are TODO placeholders).
model = GNN(hidden_channels=64)
print(model)

from IPython.display import Javascript
# Colab-only helper that caps the output cell height; it has no effect outside Google Colab.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

model = GNN(hidden_channels=64)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# NOTE(review): this loop still cannot run as-is. GNN's conv layers are TODO placeholders,
# and GNN.forward requires a `batch` argument that train()/test() do not pass.
for epoch in range(1, 201):
    train()
    # test() takes the split name as a required second argument; the original calls
    # omitted it and raised a TypeError.
    train_acc = test(train_loader, "train")
    test_acc = test(test_loader, "test")
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

## Conclusion

In this chapter, you have learned how to apply GNNs to the task of graph classification.
You have learned how graphs can be batched together for better GPU utilization, and how to apply readout layers for obtaining graph embeddings rather than node embeddings.

In the next session, you will learn how you can utilize PyTorch Geometric to let Graph Neural Networks scale to single large graphs.

[Next: Scaling Graph Neural Networks](https://colab.research.google.com/drive/1XAjcjRHrSR_ypCk_feIWFbcBKyT4Lirs)