# Graph Neural Network Using Neo4J & PyTorch

### Environment

In [146]:
import pandas as pd

import matplotlib 
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from py2neo import Graph

In [15]:
graph = Graph("bolt://54.87.236.196:37389", auth=("neo4j", "throttle-retailers-jobs"))

#### EDA

In [46]:
query = """
MATCH (p:Person) RETURN count(*) as numNodes
"""
numNodes = graph.run(query).to_data_frame().iloc[0]['numNodes']
numNodes

34

In [6]:
# Check if there are self loops in the graph
query = """
MATCH (p:Person)-[]-(p)
RETURN p.id
"""
graph.run(query).to_data_frame()

In [10]:
# Find all ids of the nodes in the grapf
query = """
MATCH (p:Person)
RETURN collect(p.id) as allIds
"""
graph.run(query).to_data_frame().iloc[0]['allIds']

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34]

## Graph Operations and Functions

Setting features of all the nodes using a dictionary

In [216]:
# aux function 
def string_to_array(word):
    return [float(i) for i in word]

def array_to_string(array):
    word = ""
    for i in range(0, array.size):
        word += str(array[i])
    return word

def tensor_to_string(tensor):
    word = ""
    for i in tensor.tolist():
        word += str(i) + ","
    return word[:len(word)-1]

def string_to_tensor(string_input):
    return torch.tensor(list(map(float, string_input.split(","))))

In [387]:
def set_features(graph, features):
    for i in range(features.shape[0]):
        query = """
        MATCH (p:Person {id: $id_node}) SET p.features = $features
        """
        graph.run(query, {'id_node': i+1, 'features': tensor_to_string(features[i])})


In [511]:
features = torch.eye(34)
set_features(graph, features)

Either add self loops in the graph or take this into account when updating features of each node

Current approach:
- Self-loops present in the graph deleted
- When updating features, take into account own features

Future approach: 
- Add self-loops to the graph
- When updating features, only use neighbor features

In [522]:
def get_features(graph, id_node):
    query_own_features = """
    MATCH (p:Person {id: $id_node}) RETURN p.features as own_features
    """  
    own_features = graph.run(query_own_features, {"id_node": id_node}).to_data_frame().iloc[0]['own_features']
    return own_features

def get_features_tensor(graph, id_node):
    return string_to_tensor(get_features(graph, id_node))

def get_neighbors_features(graph, id_node):
    query_neighbors_features = """
    MATCH (p:Person {id: $id_node})-[:KNOWS]-(neighbor:Person)
    RETURN collect(neighbor.features) as neighbors_features
    """
    neighbor_features = graph.run(query_neighbors_features, {"id_node": id_node}).to_data_frame().iloc[0]['neighbors_features']
    return neighbor_features

def get_neighbors_features_tensor(graph, id_node):
    return list(map(string_to_tensor, get_neighbors_features(graph, id_node)))

def get_all_features(graph): 
    all_features = []
    all_neighbor_features = {}
    for i in range(1, numNodes + 1):
        i_features = get_features_tensor(graph, i)
        all_features.append(i_features)
        neighbor_features = get_neighbors_features_tensor(graph, i)
        all_neighbor_features[i] = neighbor_features
    return torch.stack(all_features, dim=0), all_neighbor_features

def update_features(graph, id_node, all_features, all_neighbor_features):
    own_features = all_features[id_node-1]
    neighbors_features = all_neighbor_features[id_node]
    neighbors_features_sum = torch.stack(neighbors_features, dim=0).sum(dim=0)
    new_features = torch.stack([neighbors_features_sum, own_features], dim=0).sum(dim=0)
    query_update_features = """
    MATCH (p:Person {id: $id_node}) SET p.features = $new_features
    """
    graph.run(query_update_features, {'id_node': id_node, 'new_features': tensor_to_string(new_features)})
    
def update_all_features(graph, all_features, all_neighbor_features):
    for i in range(numNodes):
        update_features(graph, i+1, all_features, all_neighbor_features)

In [311]:
query_own_features = """
    MATCH (p:Person {id: $id_node}) RETURN p.features as own_features
    """  
own_features = graph.run(query_own_features, {"id_node": 34}).to_data_frame()
own_features

Unnamed: 0,own_features
0,


## Neural Network Creation

In [122]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [518]:
class GCNLayer(nn.Module):
    def __init__(self, input_size, output_size):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
    def forward(self, graph, input_features):
        set_features(graph, input_features)
        all_features, all_neighbor_features = get_all_features(graph)
        update_all_features(graph, all_features, all_neighbor_features)
        all_features_updated, _ = get_all_features(graph)
        # Transform into torch format
        return self.linear(all_features_updated)


In [519]:
class GCN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GCN, self).__init__()
        self.gcnlayer1 = GCNLayer(input_size, hidden_size)
        self.gcnlayer2 = GCNLayer(hidden_size, output_size)
    
    def forward(self, graph, input_features):
        outputlayer1 = self.gcnlayer1(graph, input_features)
        outputlayer1_relu = F.relu(outputlayer1)
        outputlayer2 = self.gcnlayer2(graph, outputlayer1_relu)
        return outputlayer2

### Training

Setting labels por node 1 and 34 (semisupervised problem)

In [530]:
query_instructor = """
MATCH (p:Person) WHERE p.id = 1 SET p.label = 0 
"""
query_president = """
MATCH (p:Person) WHERE p.id = 34 SET p.label = 1 
"""
graph.run(query_instructor)
graph.run(query_president)

<py2neo.database.Cursor at 0x7fb6b2659250>

In [550]:
all_logits = []
num_epochs = 2
my_net = GCN(34,5,2)
inputs = torch.eye(34)
logits1 = my_net(graph, inputs)

In [610]:
inputs = torch.eye(34)
set_features(graph, input_features)

In [600]:
inputs = torch.eye(34)
set_features(graph, input_features)
all_features, all_neighbor_features = get_all_features(graph)
update_all_features(graph, all_features, all_neighbor_features)
all_features_updated, _ = get_all_features(graph)

In [609]:
all_features_updated.sum(dim=1)

tensor([17., 10., 11.,  7.,  4.,  5.,  5.,  5.,  6.,  3.,  4.,  2.,  3.,  6.,
         3.,  3.,  3.,  3.,  3.,  4.,  3.,  3.,  3.,  6.,  4.,  4.,  3.,  5.,
         4.,  5.,  5.,  7., 13., 18.])

In [559]:
labels = torch.tensor([0, 1])
labeled_nodes = torch.tensor([0, 33])

In [571]:
my_nn = GCN(34,5,2)
inputs = torch.eye(34)

In [572]:
# Optimizer: Adam
optimizer = torch.optim.Adam(my_nn.parameters(), lr = 0.01)

In [573]:
all_logits = []
num_epochs = 20
for epoch in range(num_epochs):
    logits = my_nn(graph, inputs)
    all_logits.append(logits.detach())
    logp = F.softmax(logits, 1)
    loss = F.nll_loss(logp[labeled_nodes], labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print('Epoch %d | Loss: %.4f' % (epoch, loss.item()))

Epoch 0 | Loss: -0.6186
Epoch 1 | Loss: -0.6291
Epoch 2 | Loss: -0.6400
Epoch 3 | Loss: -0.6510
Epoch 4 | Loss: -0.6623
Epoch 5 | Loss: -0.6736
Epoch 6 | Loss: -0.6850
Epoch 7 | Loss: -0.6963
Epoch 8 | Loss: -0.7076
Epoch 9 | Loss: -0.7188
Epoch 10 | Loss: -0.7297
Epoch 11 | Loss: -0.7404
Epoch 12 | Loss: -0.7508
Epoch 13 | Loss: -0.7608
Epoch 14 | Loss: -0.7704
Epoch 15 | Loss: -0.7797
Epoch 16 | Loss: -0.7885
Epoch 17 | Loss: -0.7968
Epoch 18 | Loss: -0.8047
Epoch 19 | Loss: -0.8122


In [585]:
logp1 = F.softmax(logits, dim=1)

In [587]:
logp1[labeled_nodes]

tensor([[0.7730, 0.2270],
        [0.1486, 0.8514]], grad_fn=<IndexBackward>)

In [590]:
labels

tensor([0, 1])

In [588]:
F.nll_loss(logp1[labeled_nodes], labels)

tensor(-0.8122, grad_fn=<NllLossBackward>)

In [596]:
inp = torch.randn(3, 5, requires_grad=True)
# each element in target has to have 0 <= value < C
target = torch.tensor([1, 0, 4])
output = F.nll_loss(F.log_softmax(inp, 1), target)
F.log_softmax(inp, 1)

tensor([[-2.0304, -0.6356, -1.8768, -2.2739, -2.4874],
        [-0.3519, -2.8261, -1.9232, -3.0716, -3.1026],
        [-3.3987, -0.9454, -3.4870, -1.9780, -0.8938]],
       grad_fn=<LogSoftmaxBackward>)

In [598]:
F.softmax(logits,1)

tensor([[0.7730, 0.2270],
        [0.4679, 0.5321],
        [0.5379, 0.4621],
        [0.6119, 0.3881],
        [0.8177, 0.1823],
        [0.8443, 0.1557],
        [0.8530, 0.1470],
        [0.6300, 0.3700],
        [0.3395, 0.6605],
        [0.4866, 0.5134],
        [0.8121, 0.1879],
        [0.6582, 0.3418],
        [0.6606, 0.3394],
        [0.6183, 0.3817],
        [0.4536, 0.5464],
        [0.5186, 0.4814],
        [0.7006, 0.2994],
        [0.6531, 0.3469],
        [0.4145, 0.5855],
        [0.6335, 0.3665],
        [0.4374, 0.5626],
        [0.6761, 0.3239],
        [0.5190, 0.4810],
        [0.4872, 0.5128],
        [0.5415, 0.4585],
        [0.5129, 0.4871],
        [0.5151, 0.4849],
        [0.4072, 0.5928],
        [0.4790, 0.5210],
        [0.4369, 0.5631],
        [0.2776, 0.7224],
        [0.7350, 0.2650],
        [0.1232, 0.8768],
        [0.1486, 0.8514]], grad_fn=<SoftmaxBackward>)