# Graph Neural Network Using Neo4J & PyTorch

Implementation of a node classifier on the karate club graph. The graph is stored in Neo4J and the graph neural network is implemented using PyTorch.

### Environment

In [1]:
import pandas as pd

import matplotlib 
import matplotlib.pyplot as plt

# Plotting theme applied to the matplotlib figures created after this point.
plt.style.use('fivethirtyeight')
# Show floats in displayed DataFrames with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from py2neo import Graph

In [None]:
# Connection settings are read from the environment so that the password is
# never stored in the notebook:
#   NEO4J_URI       bolt address of the server (defaults to the address used originally)
#   NEO4J_USER      user name (defaults to "neo4j")
#   NEO4J_PASSWORD  password; when it is not set, it is asked for interactively
import os
from getpass import getpass

neo4j_uri = os.environ.get("NEO4J_URI", "bolt://54.87.236.196:37389")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

#### EDA

In [93]:
# Count the Person nodes; the result is kept in `numNodes` and reused by later cells
query = """
MATCH (p:Person) RETURN count(*) as numNodes
"""
node_count_frame = graph.run(query).to_data_frame()
first_row = node_count_frame.iloc[0]
numNodes = first_row['numNodes']
numNodes

34

In [6]:
# Self-loop check: return the id of every Person that has a relationship to itself
query = """
MATCH (p:Person)-[]-(p)
RETURN p.id
"""
self_loops_frame = graph.run(query).to_data_frame()
self_loops_frame

In [7]:
# Collect the ids of all Person nodes of the graph into a single list
query = """
MATCH (p:Person)
RETURN collect(p.id) as allIds
"""
all_ids_frame = graph.run(query).to_data_frame()
all_ids_frame.iloc[0]

allIds    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
Name: 0, dtype: object

In [94]:
# For every node: its id, the ids of its neighbors, and how many neighbors it has
query = """
MATCH (n:Person)-[]-(nn:Person)
RETURN n.id, collect(nn.id) as neighbors, count(nn) as numNeighbors
ORDER BY n.id ASC
"""
neighborhood_frame = graph.run(query).to_data_frame()
neighborhood_frame

Unnamed: 0,n.id,neighbors,numNeighbors
0,1,"[2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 18, 2...",16
1,2,"[1, 3, 4, 8, 14, 18, 20, 22, 31]",9
2,3,"[1, 2, 4, 8, 9, 10, 14, 28, 29, 33]",10
3,4,"[1, 2, 3, 8, 13, 14]",6
4,5,"[1, 7, 11]",3
5,6,"[1, 7, 11, 17]",4
6,7,"[1, 5, 6, 17]",4
7,8,"[1, 2, 3, 4]",4
8,9,"[1, 3, 31, 33, 34]",5
9,10,"[3, 34]",2


## Graph Operations and Functions

Auxiliary functions: conversion from tensor to string, and vice versa.

In [9]:
def tensor_to_string(tensor):
    """Serialize the elements of a tensor as one comma-separated string.

    The elements are taken from ``tensor.tolist()`` and rendered with ``str``,
    so ``torch.tensor([1.0, 2.5])`` becomes ``"1.0,2.5"``. An empty tensor
    gives an empty string.
    """
    return ",".join(str(value) for value in tensor.tolist())

def string_to_tensor(string_input):
    """Parse a comma-separated string of numbers into a 1-D float tensor.

    This is the inverse of ``tensor_to_string`` for 1-D tensors, including the
    empty case: an empty (or whitespace-only) string yields an empty tensor
    instead of failing on ``float("")``.

    Args:
        string_input: text such as ``"1.0,0.0,2.5"``.

    Returns:
        A 1-D tensor holding the parsed values in order.

    Raises:
        ValueError: if one of the comma-separated items is not a number.
    """
    if not string_input.strip():
        # "".split(",") is [""], which float() cannot parse.
        return torch.tensor([])
    return torch.tensor([float(item) for item in string_input.split(",")])

In [10]:
def set_features(graph, features):
    """Store each row of ``features`` in the ``features`` property of a Person node.

    Row ``i`` is serialized with ``tensor_to_string`` and written to the node
    whose ``id`` is ``i + 1``; one query is run per row.
    """
    num_rows = features.shape[0]
    for row_index in range(num_rows):
        query = """
        MATCH (p:Person {id: $id_node}) SET p.features = $features
        """
        serialized_row = tensor_to_string(features[row_index])
        graph.run(query, {'id_node': row_index + 1, 'features': serialized_row})


Functions to interact with the graph:
- get_features(graph, id_node): Obtain features (string) of node id_node
- get_features_tensor(graph, id_node): Obtain features (tensor) of node id_node
- get_neighbors_features(graph, id_node): Obtain features (string) of node id_node neighbors
- get_neighbors_features_tensor(graph, id_node): Obtain features (tensor) of node id_node neighbors
- get_all_neighbor_features(graph): Obtain a dictionary whose keys are the node ids and whose values are the lists of feature tensors of each node's neighbors
- update_features(graph, id_node, all_features, all_neighbor_features): This function performs the "message passing" phase to obtain the new features of node id_node. This is achieved by adding its own features and its neighbor features.
- update_all_features(graph, all_features, all_neighbor_features): This function updates all nodes' features using the previous function.

In [88]:
def get_features(graph, id_node):
    """Return the stored ``features`` property of the Person whose id is ``id_node``.

    The value is returned exactly as read from the first row of the query
    result (a comma-separated string when it was written by ``set_features``).
    """
    query_own_features = """
    MATCH (p:Person {id: $id_node}) RETURN p.features as own_features
    """
    result_frame = graph.run(query_own_features, {"id_node": id_node}).to_data_frame()
    first_row = result_frame.iloc[0]
    return first_row['own_features']

def get_features_tensor(graph, id_node):
    """Return the features of node ``id_node`` as a tensor parsed from the stored string."""
    features_string = get_features(graph, id_node)
    return string_to_tensor(features_string)

def get_neighbors_features(graph, id_node):
    """Return the ``features`` properties of the neighbors of node ``id_node``.

    Neighbors are the Person nodes linked to the node by a ``KNOWS``
    relationship in either direction; their feature values are collected into
    one list by the query.
    """
    query_neighbors_features = """
    MATCH (p:Person {id: $id_node})-[:KNOWS]-(neighbor:Person)
    RETURN collect(neighbor.features) as neighbors_features
    """
    result_frame = graph.run(query_neighbors_features, {"id_node": id_node}).to_data_frame()
    first_row = result_frame.iloc[0]
    return first_row['neighbors_features']

def get_neighbors_features_tensor(graph, id_node):
    """Return the neighbor features of node ``id_node`` as a list of tensors."""
    feature_strings = get_neighbors_features(graph, id_node)
    return [string_to_tensor(feature_string) for feature_string in feature_strings]

def get_all_neighbor_features(graph):
    """Map every node id to the list of feature tensors of its neighbors.

    Node ids run from 1 to the notebook-level ``numNodes`` (inclusive); one
    neighbor lookup is made per node.
    """
    return {
        node_id: get_neighbors_features_tensor(graph, node_id)
        for node_id in range(1, numNodes + 1)
    }

def update_features(graph, id_node, all_features, all_neighbor_features):
    """Run the message-passing step for one node and store the result in the graph.

    The new features are the node's own features plus the sum of its
    neighbors' features. A node without neighbors keeps its own features
    (previously ``torch.stack`` raised on the empty neighbor list).

    Args:
        graph: connection used to write the new features back.
        id_node: 1-based id of the node; its own features are read from row
            ``id_node - 1`` of ``all_features``.
        all_features: tensor with one row of features per node.
        all_neighbor_features: mapping from node id to the list of feature
            tensors of that node's neighbors.

    Returns:
        The new feature tensor of the node.
    """
    own_features = all_features[id_node-1]
    neighbors_features = all_neighbor_features[id_node]
    if len(neighbors_features) > 0:
        neighbors_features_sum = torch.stack(neighbors_features, dim=0).sum(dim=0)
        new_features = torch.stack([neighbors_features_sum, own_features], dim=0).sum(dim=0)
    else:
        # Isolated node: torch.stack rejects an empty sequence and there is
        # nothing to aggregate, so the node keeps its own features.
        new_features = own_features
    query_update_features = """
    MATCH (p:Person {id: $id_node}) SET p.features = $new_features
    """
    graph.run(query_update_features, {'id_node': id_node, 'new_features': tensor_to_string(new_features)})
    return new_features
    
def update_all_features(graph, all_features, all_neighbor_features):
    """Apply ``update_features`` to nodes 1..``numNodes`` and stack the results.

    Returns a tensor whose row ``i`` holds the new features of node ``i + 1``.
    """
    updated_rows = [
        update_features(graph, node_id, all_features, all_neighbor_features)
        for node_id in range(1, numNodes + 1)
    ]
    return torch.stack(updated_rows)

## Neural Network Creation

Using Pytorch

In [119]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [89]:
class GCNLayer(nn.Module):
    """Graph-convolution layer whose neighborhood aggregation goes through the graph database.

    ``forward`` writes the incoming features to the nodes, reads the features
    of every node's neighbors back, adds them to the node's own features and
    applies a linear map to the aggregated result.

    NOTE(review): the neighbor features are rebuilt from strings read out of
    the database, so they carry no autograd history; gradients reach
    ``input_features`` only through each node's own row. Confirm this is
    intended.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, graph, input_features):
        # The features must be in the graph before the neighbor lookup reads them.
        set_features(graph, input_features)
        neighbor_features_by_node = get_all_neighbor_features(graph)
        aggregated_features = update_all_features(graph, input_features, neighbor_features_by_node)
        return self.linear(aggregated_features)

In [90]:
class GCN(nn.Module):
    """Two stacked ``GCNLayer`` modules with a ReLU between them.

    The first layer maps ``input_size`` features to ``hidden_size``, the second
    maps ``hidden_size`` to ``output_size``; the output of the second layer is
    returned without any further activation.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.gcnlayer1 = GCNLayer(input_size, hidden_size)
        self.gcnlayer2 = GCNLayer(hidden_size, output_size)

    def forward(self, graph, input_features):
        hidden = self.gcnlayer1(graph, input_features)
        hidden_activated = torch.relu(hidden)
        return self.gcnlayer2(graph, hidden_activated)

### Data preparation and initialization

Setting labels for nodes 1 and 34 (not necessary)

In [96]:
# Node 1 gets label 0 (instructor query)
query_instructor = """
MATCH (p:Person) WHERE p.id = 1 SET p.label = 0 
"""
graph.run(query_instructor)

# Node 34 gets label 1 (president query)
query_president = """
MATCH (p:Person) WHERE p.id = 34 SET p.label = 1 
"""
graph.run(query_president)

<py2neo.database.Cursor at 0x7f2c32e9e450>

Semisupervised problem, only two nodes are labeled

In [85]:
# 0-based rows of the two labeled nodes (ids 1 and 34) in the logits matrix
labeled_nodes = torch.tensor([0, 33])
# Class of each labeled node, in the same order as `labeled_nodes`
labels = torch.tensor([0, 1])

For this classification problem, we assign each node an input feature in the form of a one-hot vector

In [97]:
# 34 x 34 identity matrix: row i is the one-hot input vector of the node with id i + 1
inputs = torch.eye(34, 34)

### Training

In [91]:
# Seed the RNG before the layers are built so that the randomly initialised
# weights, and therefore the training run below, are the same on every
# Restart & Run All.
torch.manual_seed(0)
# 34 one-hot input features per node, 5 hidden units, 2 output classes
my_nn = GCN(input_size=34, hidden_size=5, output_size=2)

In [92]:
# Adam over all parameters of the network; the loss only looks at the labeled nodes
num_epochs = 20
all_logits = []  # detached logits of every epoch, kept for inspection afterwards
optimizer = torch.optim.Adam(my_nn.parameters(), lr=0.01)

for epoch in range(num_epochs):
    # Forward pass over the whole graph
    logits = my_nn(graph, inputs)
    all_logits.append(logits.detach())

    # Negative log-likelihood restricted to the rows of the labeled nodes
    logp = F.log_softmax(logits, 1)
    loss = F.nll_loss(logp[labeled_nodes], labels)

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    print('Epoch %d | Loss: %.4f' % (epoch, loss_value))

Epoch 0 | Loss: 1.4575
Epoch 1 | Loss: 1.2504
Epoch 2 | Loss: 0.9918
Epoch 3 | Loss: 0.7967
Epoch 4 | Loss: 0.6197
Epoch 5 | Loss: 0.4722
Epoch 6 | Loss: 0.3516
Epoch 7 | Loss: 0.2551
Epoch 8 | Loss: 0.1839
Epoch 9 | Loss: 0.1297
Epoch 10 | Loss: 0.0901
Epoch 11 | Loss: 0.0619
Epoch 12 | Loss: 0.0424
Epoch 13 | Loss: 0.0291
Epoch 14 | Loss: 0.0202
Epoch 15 | Loss: 0.0141
Epoch 16 | Loss: 0.0100
Epoch 17 | Loss: 0.0072
Epoch 18 | Loss: 0.0053
Epoch 19 | Loss: 0.0040


View results

In [100]:
# Logits recorded in the last training epoch
final_logits = all_logits[num_epochs-1]
final_logits

tensor([[ 3.0895, -2.4338],
        [ 1.8576, -1.4835],
        [ 0.2205, -1.1235],
        [ 1.7162, -1.3475],
        [ 1.2488, -1.0694],
        [ 1.5597, -1.2470],
        [ 1.5330, -1.2317],
        [ 1.3443, -1.1809],
        [-2.5461,  0.1329],
        [-1.9983,  0.2589],
        [ 1.2855, -1.0903],
        [ 0.7502, -0.7846],
        [ 1.1428, -0.9904],
        [-0.2649, -0.6602],
        [-2.7042,  0.3531],
        [-2.7665,  0.4524],
        [ 0.5877, -0.6918],
        [ 1.1841, -1.0186],
        [-2.7911,  0.4478],
        [-0.4521, -0.5496],
        [-2.8125,  0.4318],
        [ 1.2280, -1.0436],
        [-2.6913,  0.4875],
        [-3.5494,  0.3833],
        [-0.3287, -0.7285],
        [-0.4953, -0.5865],
        [-2.1963,  0.4251],
        [-2.7082,  0.1931],
        [-2.0248,  0.0602],
        [-3.5252,  0.7154],
        [-2.6696,  0.3072],
        [-1.7158, -0.4900],
        [-4.9799,  0.9063],
        [-4.8018,  0.7359]])

In [101]:
# Predicted class per node row: position of the largest logit in the last epoch
last_epoch_logits = all_logits[num_epochs-1]
pos = {
    node_index: last_epoch_logits[node_index].numpy().argmax()
    for node_index in range(34)
}

In [102]:
# Predicted classes as a plain list, in the insertion order of `pos`
classified_nodes = [predicted_class for predicted_class in pos.values()]

The classification obtained for each node:

In [103]:
# Display the list of predicted classes
classified_nodes

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

### Visualizing in Neo4J

Setting labels as properties for each node

In [104]:
# Write every predicted class to the `label` property of its node; entry k of
# `classified_nodes` belongs to the node with id k + 1.
# NOTE(review): the label is stored as a string here, while the earlier cell
# that labels nodes 1 and 34 stores integers.
for node_index, predicted_class in enumerate(classified_nodes):
    label_node = str(predicted_class)
    query = """
    MATCH (p:Person {id: $id_node}) SET p.label = $label
    """
    graph.run(query, {'id_node': node_index + 1, 'label': label_node})
