In [1]:
import torch
import random
from collections import deque
from torch.nn.functional import relu, prelu

In [2]:
#todo: collect all incoming edges for coalescence (search how they do it so it's consistent)
#todo: after implementing message pass algorithm, move it to forward pass (if possible... and if it doesn't overcomplicate modularly changing out attention structures...)
#todo: try graph step passing new node and computational dependency subtrees--> trim subtrees and extend undiscovered. intuition: like a net with a ball (at new node), 
# cut off at k-hop depth

# GraphSearch for single hop message passing
### new approach: graph class with nodes with child and parent links (for step and computational dependency subtree creation)
###               step() can be a random (for now uniform random) selection from children. 
#### todo: implement k-hop dependency subtree creation and think about computational efficiency... Don't want to have exponential search overhead with hops...

In [170]:
#idea: could it be possible to completely decouple the procedurality of the network and just assume it converges to something useful? 
#... Just randomly (as a function of local in/out degree) sample a layer from the network each time and just keep walking... 
#... Maybe this still allows for multi-granularity analysis, just less organized. This idea relies on some notion of convergence I think. 
#todo: inverse relations are added to the predicate set...
#todo: make training script
#todo: can enhance expressiveness by making transformation & aggregation steps 3 layer mlp's each. (in the case of the r-gcn these are the embedding layers)
# other option: can add mlp layers before and after the gnn, as pre-processing layers.
# skip connections can be used to reduce oversmoothing I.e. k-hop 3 gets preprocessed and passed past k-hop 2 as well as into it (duplication) and just gets summed together with k-hop2 outputs
# note that R-GCN uses normalized sum aggregation Also has edge dropout before batch norm
# R-GCN uses full batch adam (rmsprop+momentum) for 400 epochs
# R-GCN activation = relu, but relu-->relu-->softmax for entity classification makes sense ofcourse.
# can kind of see embeddings as eigenvectors of the implications of the structure of the graph under random walk
# check spectral node representation... its equal to the svd
# graph laplacian  adj matrix (alternative repr for adj matrix)--> decomp
# inverse relations and equality relations 

import torch
from torch.nn.functional import prelu, relu
import torch.nn as nn
from collections import defaultdict, deque
import random
from torch.nn.functional import one_hot

EMBEDDING_SIZE = 5
DIM_W = 5
MAX_K_HOP = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# def message():

#end to end... some choice for type(input) in inputs module dict(input)(input)

class NodeClassifier(nn.Module):
    def __init__(self, x_dim, num_classes):
        super(NodeClassifier, self).__init__()
        mlp_layers = []
        for _ in range(2):
            mlp_layers.append(nn.Linear(x_dim, x_dim))
            mlp_layers.append(nn.PReLU())
            mlp_layers.append(nn.Dropout(p=0.2))
            mlp_layers.append(nn.LayerNorm(x_dim))
        self.mlp = nn.Sequential(*mlp_layers)
        self.linear = nn.Linear(x_dim, num_classes)
        # self.softmax = nn.Softmax(num_classes)

    def forward(self, x):
        mlp = self.mlp(x)
        return self.linear(mlp)
        

class R_GCNLayer(nn.Module):
    #todo: handle size 0 batches
    def __init__(self, x_dim ,y_dim, unique_labels, num_mlp_layers=1):
        super(R_GCNLayer, self).__init__()
        self.edge_label_weights = nn.ModuleDict({label: nn.Linear(x_dim, y_dim) for label in unique_labels})
        self.y_dim = y_dim
        self.x_dim = x_dim
        self.unique_labels = list(unique_labels)
        mlp_layers = []
        # for _ in range(1):
        #     mlp_layers.append(nn.Linear(y_dim, y_dim))
        #     mlp_layers.append(nn.PReLU())
        #     mlp_layers.append(nn.Dropout(p=0.2))
        #     mlp_layers.append(nn.LayerNorm(y_dim))
        self.lin = nn.Linear(x_dim, y_dim)
        self.act = nn.ReLU()
        self.do = nn.Dropout(p=0.2)
        self.ln = nn.LayerNorm(y_dim)
        # self.mlp = nn.Sequential(*mlp_layers)

    def forward(self, layer_node_batch, agg_method=torch.sum):
            device = next(self.parameters()).device
            non_linear = []

            for i, node in enumerate(layer_node_batch):
                messages = node.collect_neighbours()
                transformed_messages = defaultdict(list)

                if messages:
                    for message in messages: #< this is a bit slow probably... don't know how to change it yet because of the different edge weights.
                        for parent_id, (embedding, edge_label, receiver_node_id) in message.items():
                            transformed = self.edge_label_weights[edge_label](embedding)
                            transformed_messages[i].append(transformed)

                    aggregated = agg_method(torch.stack(transformed_messages[i])) + self.mlp(node.embedding)
                    non_linear_i = self.prelu(aggregated)
                else:
                    transformed = torch.zeros(self.y_dim, dtype=torch.float32, device=device)
                    aggregated = self.mlp(node.embedding)
                    non_linear_i = self.prelu(aggregated)

                non_linear.append(non_linear_i)
            return torch.stack(non_linear)

    def forward(self, layer_node_batch, agg_method=torch.sum):
        #vectorized a bit further...
        device = next(self.parameters()).device
        outputs = []

        for i, node in enumerate(layer_node_batch):
            message_tensors, all_labels = node.collect_neighbour_tensors()
            
            if message_tensors:
                aggregated_all = torch.zeros((1, self.x_dim),dtype = torch.float32).to(device)
                for label in all_labels:
                    message_tensors[label] = torch.stack(message_tensors[label])

                    transformed = self.edge_label_weights[label](message_tensors[label])
                    activated = self.act(transformed)
                    
                    aggregated = agg_method(activated, dim=0)
                    aggregated = aggregated.unsqueeze(0)
                    aggregated_all = torch.cat((aggregated_all,aggregated),dim=0)
                                               
                personal_message = self.lin(node.embedding)
                personal_message_activated = self.act(personal_message)
                aggregated_all = agg_method(aggregated_all,dim=0)

                aggregated_all +=  personal_message
                dropout = self.do(aggregated_all)
                normalized = self.ln(dropout)

            else:
                transformed = torch.zeros(self.y_dim, dtype=torch.float32, device=device)
                aggregated = self.act(self.lin(node.embedding))
                dropout = self.do(aggregated)
                normalized = self.ln(dropout)
                

            outputs.append(normalized)
        return torch.stack(outputs)



class R_GCN(nn.Module): #change to include GCN layers and assign them batches (split them up in forward and handle them sequentially including node update)
    #it's a dependency bottleneck
    def __init__(self, x_dim, y_dim, graph, max_k_hop):
        super(R_GCN, self).__init__()
        self.graph = graph
        self.max_k_hop =max_k_hop
        self.x_dim = x_dim
        self.layers = nn.ModuleList([
            R_GCNLayer(x_dim, y_dim, graph.unique_labels) 
            for _ in range(max_k_hop + 1)
        ])
        
    
    def forward(self, node_batch, agg_method=torch.sum): #maybe handle k_hops outside? have it be a single layer...
        mini_batch = defaultdict(list)
        target_node = node_batch[0][0]
        
        for node, k in node_batch:
            layer = k
            mini_batch[layer].append(node)
            
        updated_embeddings = {}
        for layer in reversed(list(mini_batch.keys())):
            batch = mini_batch[layer]
            batch_size = len(batch)
            # print(f'layer {layer} processing {batch_size} items...')
            embeddings = self.layers[layer](batch)
            for i, node in enumerate(batch):
                try:
                    updated_embeddings[node.id] = embeddings[i]
                except:
                    pass
            for node in mini_batch[layer]:
                node.embedding = updated_embeddings[node.id]
        return target_node.embedding, torch.stack([updated_embeddings[node.id] for node,k in node_batch], dim=0)


class Node:
    def __init__(self, id_):
        self.id = id_
        self.embedding = torch.rand(EMBEDDING_SIZE, dtype=torch.float32, requires_grad=True, device=device)
        self.features = {}
        self.parents = {}
        self.children = {}
        self.out_edges = []


    def collect_neighbours(self):
        local_tree = []
        for parent, labels in self.parents.items():
            for label in labels:
                local_tree.append({parent.id: (parent.embedding, label, self.id)})
        return local_tree

    def collect_neighbour_tensors(self):
        embedding_dict = defaultdict(tuple)
        unique_labels_seen = set()
        for parent, labels in self.parents.items():
            for label in labels:
                embedding_dict[label] += (parent.embedding,)
                unique_labels_seen.add(label)
        return embedding_dict, list(unique_labels_seen)
                
                
                
    
    def print_out_degree(self):
        out_edges = [(label,target.id) for label,target in self.out_edges]
        print(f"Out edges: {out_edges}") if out_edges else print(f'Node {self.id} has no children')

    def adjust_embeddings(self, new_embedding):
        self.embedding = new_embedding

    def add_parent(self, parent_node, label):
        if parent_node not in self.parents:
            self.parents[parent_node] = []
        self.parents[parent_node].append(label)

    def add_child(self, child_node, label):
        if child_node not in self.children:
            self.children[child_node] = []
        self.children[child_node].append(label)
        self.add_out_edge(label, child_node)

    def add_out_edge(self, label, target):
        self.out_edges.append((label, target))

    def step(self):
        if not self.children:
            return None
        edge_label, target_node = random.choice(self.out_edges)
        return (target_node, edge_label)

class Graph:
    def __init__(self):
        self.nodes = {}
        self.label_weights = {}
        self.unique_labels = set()
        self.inverse_labels = True

    def add_node(self, id_, label):
        if id_ not in self.nodes:
            self.nodes[id_] = Node(id_)
            self.nodes[id_].label = label

    def add_edge(self, from_id, to_id, label):
        #todo: switch to tensors... requires some different logic.
        if from_id not in self.nodes:
            self.add_node(from_id)
        if to_id not in self.nodes:
            self.add_node(to_id)
            
        
        from_node = self.nodes[from_id]
        to_node = self.nodes[to_id]
        from_node.add_child(to_node, label)
        to_node.add_parent(from_node, label)
        self.unique_labels.add(label)
        if self.inverse_labels: #learnable inverse predicate relations from the R-GCN paper.
            inverse_label = label + ':inv:'
            from_node.add_child(to_node, inverse_label)
            to_node.add_parent(from_node, inverse_label)
            self.unique_labels.add(inverse_label)

    def initialize_label_weights(self):
        for label in list(self.unique_labels):
            self.label_weights[label] = torch.rand(DIM_W,dtype=torch.float32, requires_grad=True, device=device)
    
    def get_parents(self, id_):
        node = self.nodes.get(id_)
        return {parent.id: labels for parent, labels in node.parents.items()}

    def get_children(self, id_):
        node = self.nodes.get(id_)
        return {child.id: labels for child, labels in node.children.items()}

# pipeline: bfs_dep_tree --> loop (embedding = forward(collect_neighbours(bfs_dep_tree.get.pop()))  )
# where the first item in bfs_dep_tree list is the node and the second is the gcn layer to be used.
# collect neighbours also collects the edges for edge weights...
#todo: create inverse relations for all relations in the graph (for example by annotating them with '-' and using - weights for them. 
#set rdfs:a weights to identity matrix??

#idea save a queue snapshot when depth k-1 switches to depth k to pass with step() to next node.... i don't know how to find the edges after that...
def bfs_dep_tree(start_node, max_depth):
    queue = deque([(start_node, 0)])
    dep_tree = []
    visited = set()
    
    
    while queue:
        current_node, depth = queue.popleft()
        if depth > max_depth:
            break
        # if depth == max_depth-1:
        #     save_visited = visited
        visited.add(current_node)
        dep_tree.append((current_node, max(1,depth)))
        for parent in current_node.parents:
            if parent not in visited:
                queue.append((parent, depth + 1))
    return dep_tree, start_node.label#, save_visited

def bfs_dep_tree_computation_skip(start_tree, start_node, start_edge, max_depth): 
    #same as bfs_dep_tree, but skipping redundant computation.
    #intuition: like taking the dependency tree of the last node which is connected 
    #           and trimming the leaves and adding it to the next node, allowing us to skip
    #           computation for the edge we just came from.
    skip_edge = start_edge
    queue = deque([(start_node,0)])
    visited = set()
    dep_tree = []

    while queue:
        current_node, depth =  queue.popleft()
        if depth > max_depth-1:
            save_visited = visited
        if depth > max_depth:
            break
        visited.add(current_node)
        if current_node.id in skip_edge: #logic incomplete... now it skips some edges to the node it came from???? im not sure about this...
            continue
        dep_tree.append((current_node, max(1,depth)))
        for parent in current_node.parents:
            if parent not in visited:
                queue.append((parent, depth + 1))
    return dep_tree, save_visited

            
    

In [69]:
a = torch.rand(5)
b = torch.rand(5)
torch.stack((a,b)

tensor([[0.1974, 0.3270, 0.4556, 0.0978, 0.9606],
        [0.7984, 0.7639, 0.9763, 0.3604, 0.0515]])

In [7]:
torch.eye(NUM_CLASSES, dtype=torch.long, device=device)[random.choice(unique_classes)]

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

## Test some stuff:

In [35]:
graph=Graph()
nodes=[]
num_nodes = 100
NUM_CLASSES = 15
unique_classes = range(NUM_CLASSES)
for n in range(0, num_nodes):
    # graph.add_node(n, torch.eye(NUM_CLASSES, dtype=torch.torch.float32, device=device)[random.choice(unique_classes)])
    graph.add_node(n, random.choice(unique_classes))
    nodes.append(n)

preds = ["geometry:triangle","rdfs:subClassOf","FOAF:likes","rdfs:subClassOf","rdfs:domain","FOAF:knows","rdfs:isDefinedBy"]
for e in range(400):
    obj = random.choice(nodes)
    subj = random.choice([node for node in nodes if node != obj])
    pred = random.choice(preds)
    graph.add_edge(obj,subj,pred)
graph.initialize_label_weights()

In [5]:
# A = graph.nodes[0]
# node = A
# k_hops = 3

# dep_tree = bfs_dep_tree(node, k_hops)
# # dep_tree[-1][0].parents
# # set([dep[0].id for dep in dep_tree])
# print(dep_tree)

In [91]:
graph.initialize_label_weights()
target_node = graph.nodes[0]
k_hops = 5
r_gcn = R_GCN(x_dim=EMBEDDING_SIZE, y_dim=EMBEDDING_SIZE, graph=graph, max_k_hop=k_hops)
WALK = 50
node_list = [node for i, node in graph.nodes.items()]
#I think node.step() can be replaced by a uniform random selection from the tree. This would also limit thingy... 
#  the computation skip on the dep tree. Just fix this later maybe. 
#Can also just construct the dataset and load it using a generator. 
#Dataset creation should be paralellizable with distributed or mpi if python has that.
###check if dataset creation for GCN uses (informed)random walk or some random selection from list of nodes.
###I think it's parralel full batch creation... 
#for i in range(WALK):
while node_list:
    target_node = node_list.pop()
    r =  random.random() < 0.01
    #if i == 0:
    batch = bfs_dep_tree(target_node, k_hops)#,start_tree
    # else:
    #     skip_edge = (start_edge, last_node)
    #     batch, step_queue = bfs_dep_tree_computation_skip(start_tree, target_node, skip_edge, k_hops)
    print(f"node: {target_node.id}: embedding: {target_node.embedding}") if r else 5
    updated_embedding = r_gcn(batch)
    print(f"node: {target_node.id}: embedding: {target_node.embedding}") if r else 5
    if r:
        break
    # try:
    #     last_node = target_node
    #     target_node, edge_label = target_node.step()
    # except:
    #     try:
    #         last_node = target_node
    #         target_node, edge_label = target_node.step()
    #     except:
    #         print(target_node.children)

ValueError: too many values to unpack (expected 2)

In [55]:
ml4g/dataset

'/home/royal-cookings/Downloads'

In [None]:
folder_loc = "./ml4g/dataset" 
import os
os.makedirs(folder_loc, exist_ok=True)
import pickle
from tqdm import tqdm

graph.initialize_label_weights()
predicted_node = graph.nodes[0]
k_hop = 5
r_gcn = R_GCN(x_dim=EMBEDDING_SIZE, y_dim=EMBEDDING_SIZE, graph=graph, max_k_hop=k_hop)
WALK = 50
node_list = [node for i, node in graph.nodes.items()]
NUM_CLASSES = 15

def pkll(path):
    with open(path, 'rb') as f:
        var = pickle.load(f)
    return var

def pkld(var,path):
    with open(path, 'wb') as f:
        pickle.dump(var, f)
 

def create_dataset(graph):
    while node_list:
        predicted_node = node_list.pop()
        r =  random.randint(0,100)
        batch = bfs_dep_tree(predicted_node, k_hop)
        id_ = predicted_node.id
        x, y = batch
        pkld(x,f"{folder_loc}/{id_}x.pkl")
        pkld(y,f"{folder_loc}/{id_}y.pkl")


class Generator:
    def __init__(self, folder_loc):
        self.folder_loc = folder_loc
        self.len = 0

    def __len__(self):
        return self.len

    def __iter__(self):
        return self.generate_micro_batches()
    
    def generate_micro_batches(self):
        x_files = [os.path.join(self.folder_loc,file) for file in os.listdir(self.folder_loc) if file.endswith('x.pkl')]
        y_files = [os.path.join(self.folder_loc,file) for file in os.listdir(self.folder_loc) if file.endswith('y.pkl')]
        self.len = len(x_files)
        for xpkl, ypkl in zip(x_files,y_files):
            yield pkll(xpkl), pkll(ypkl)

if not os.listdir(folder_loc):
    create_dataset(graph)
    print('creating')

generator = Generator(folder_loc)
r_gcn = R_GCN(x_dim=EMBEDDING_SIZE, y_dim=DIM_W, graph=graph, max_k_hop=k_hop)
classifier = NodeClassifier(x_dim=DIM_W, num_classes=NUM_CLASSES)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

r_gcn.to(device)
classifier.to(device)
criterion = nn.CrossEntropyLoss()
params = list(r_gcn.parameters())
params.extend(list(classifier.parameters()))
optimizer = torch.optim.Adam(params,lr=0.01)

num_epochs = 10
batch_size = 16

def save_gradient_hook(grad):
    gradients.append(grad)
    

for epoch in range(num_epochs):
    r_gcn.train()
    classifier.train()
    epoch_loss = 0.0
    correct = 0
    total = 0
    i = 0
    # gradients = [] # can do some processing here to create alpha score from backpack library.
    for batch, label in tqdm(generator, desc=f"epoch {epoch}/{num_epochs}"):
        label = torch.tensor([label])
        true_label = label.long().to(device)
        predicted_embedding, _ = r_gcn(batch)
        predicted_embedding = predicted_embedding.unsqueeze(0)
        predicted_label = classifier(predicted_embedding)
        
        loss = criterion(predicted_label, true_label)

        # for param in params:
        #     param.register_hook(save_gradient_hook)
        
        if i % batch_size == 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        i += 1
        epoch_loss += loss.item()
        _, predicted = torch.max(predicted_label.data, 1)
        total += 1
        correct += 1 if predicted == true_label else 0
    avg_loss = epoch_loss / generator.len
    accuracy = 100 * correct / total
    print(f"epoch: {epoch}\navg loss: {avg_loss}accuracy: {accuracy}")

epoch 0/10: 17it [00:07,  1.75it/s]

In [164]:
a = torch.tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1026, 0.2904,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0757, 0.0000, 0.4580, 0.0000, 0.7416,
        0.0000, 0.0000, 0.0000, 0.5146, 0.0000, 0.2533, 0.0000, 0.0000, 0.0000,
        0.0000, 0.5674, 0.2455, 0.0000, 0.1071, 0.0000, 0.5363, 0.8724, 0.3041,
        0.4951, 0.0000, 0.0000, 0.2391, 0.1610, 0.5132, 0.0000, 0.0000, 0.4578])

In [165]:
a = a.unsqueeze(0)
b = a
c = torch.cat((a,b),dim=0)
c.size()

torch.Size([2, 45])

In [148]:
torch.sum(a,dim=0)

tensor(6.9352)

In [33]:
# def generate_micro_batches(folder_loc):
#     files = [os.path.join(folder_loc,file) for file in os.listdir(folder_loc) if file.endswith('.pkl')]
#     for pkl in files:
#         yield pkll(pkl)

# generator = generate_micro_batches(folder_loc)
next(generator)[1]

tensor([0, 0, 0, 1])

In [20]:

one_hot(torch.tensor(14), num_classes=NUM_CLASSES)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [32]:
dataset.__init__(folder_loc)


In [37]:
[os.path.join(folder_loc, file) for file in os.listdir(folder_loc) if file.endswith('.pkl')]

[]

In [33]:
dataset.__len__()

0

## will make some parallel processing functions here once I understand the full training process...

In [5]:


folder_loc = "./ml4g/generator_batch" 
from pickle import pickle
from multiprocessing import Process, Queue
import os

def pkld(var,path):
    with open(path, 'wb') as f:
        pickle.dump(var, f)

def pkll(path):
    with open(path, 'rb') as f:
        var = pickle.load(f)
    return var

def create_generator_folder(folder_loc): #consider modular full batch layer-wise processing if memory dep < some safety margin of ram/vram, etc
    #else: parallel process micro batches if feasible
    i = 0
    while node_list:
        target_node = node_list.pop()
        #if i == 0:
        batch, _ = bfs_dep_tree(target_node, k_hops)
        pkld(batch, folder_loc+f"/micro_batch_{i}.pkl")
        i += 1

def generate_full_batch(folder_loc):
    return [batch for batch in generate_micro_batches]


def generate_micro_batches(folder_loc):
    files = [file for file in os.listdir(folder_loc) if file.endswith('.pkl')]
    for pkl in files:
        yield pkll(pkl)

def distribute_work(num_workers, micro_batches):
    tasks = Queue()
    output = Queue()
    for task in micro_batches:
        tasks.put(task)
        forward_micro_batch(worker_num, task)
    workers = [Process(target=worker_num. args=)]

def forward_micro_batch(worker_num, micro_batch):
    updated_embedding = await(r_gcn(micro_batch))


SyntaxError: invalid syntax (2787165626.py, line 40)

## RDFS Process to Graph() object below...

In [None]:
:)

### cashew: try graphgym package :)
# Proposed GNN architecture:
## Transformation block >>>
### * linear
### * batch norm * <  my intuition is that this could replace mean() operation, just use vectorized sum() to reduce computational complexity.
### * dropout * < On linear layer in the message function
### * activation * < parametric relu = max(x,0) + alpha * min(x,0) ... alpha is trainable.
### * attention * < I'm not sure if relational weights as in RGCN fall into this category. If they are complementary in any way.
## <<< End transformation block
### * aggregation by some problem dependant function i.e. mean(), min/max/avg..._pooling(), lstm(cat(edge_embeddings)) ...
#### aggregation note: inverted degree matrix * adjacency matrix = avg(adjacency matrix)

# classical GCN:

## important design choice here... use batch norm after each layer? Normalize explicitly?
## messages = layer weight * normalized messages from prev layers
## aggregation = sum(messages) --> relu

### todo: add weighted average method. I.e. learnable row vector of size feature dim (must vectorize torch.mean explicitly for this)

# GraphSAGE
## aggregate incoming messages (can be mean(messages, dim=0), can be a max pooling on mlp(message), can be LSTM(shuffle(messages) as mini-batch), can be sum without average (maybe this leads to batch norm later?)
## concat current node message --> relu --> send

## Uses L2 Norm as root squared error of embeddings at every layer.

# GAT

# Architectural notes:

## GCN, but vector weighting matrix is learned (which nodes to attend to and ./ unsure./ which parts of the embedding vectors/features to attend to \.\.

### how can we handle permutation invariance?

### seems that attention weights are graph conditional on search algorithm dependant

### ^ wrong. It's a function of (and on) embeddings of different node embeddings at the previous step.

### softmax ()

### parameter matrix a can be a parameter matrix on a learned single layer mlp that processes the concatenated input vectors.

### parameter matrix a is learned together with weight matrix w.

### multi-head attention, multiple relu(a) matrices.

### each a is initialized randomly, then aggregated to produce a single output

### can be parallellized worker per message.

### sparse matrix... fixed number of parameters.

# Implications/discussion

### asymmetric importance weighting

### weights are still independant of graph size, even though more complex analysis of the graph can be performed.

### graph attention mechanism scales linearly in graph size due to locality

### cool visualization of attention mechanism (implicit clustering) cora citation paper

### improved performance over GCN in some cases.




# Test stuff and transform into main() below...