
Relationship Prediction
===========================================

Notebook for training and inference of element connectivity prediction using GNNs

In [None]:
%matplotlib inline

In [None]:
import dgl
from dgl.data import DGLDataset
import torch
import time
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import math
from pathlib import Path
import scipy.sparse as sp
#from google.colab import drive
import pickle
from tqdm.notebook import tqdm

from src.graph import IndustrialFacilityDataset

### Load graph dataset




In [None]:
# paths
# Locations used throughout the notebook. `model_path` is joined to file names by plain
# string concatenation, so it must keep its trailing slash.

#model_path = '/content/drive/MyDrive/graph/'
model_path = 'gnn_params/'
# File names of the saved GraphSAGE weights and of the edge-predictor weights
# (both are loaded with load_state_dict in the evaluation section).
model_name = "model_4sage_3mlp.pth"
pred_name = "pred_4sage_3mlp.pth"
# NOTE(review): absolute, machine-specific location of the input data - adjust before running elsewhere.
data_path = "/mnt/c/data/3D_CAD/"


In [None]:
# Element class names; later cells look a name up by the numeric class id
# stored in column 0 of the node features (see the per-class metrics cell).
types = ['FLANGE', 'ELBOW', 'TEE', 'TUBE', 'BEND']

# Seed every random source the notebook uses: NumPy drives the edge split and
# the negative sampling, torch drives the weight initialisation of the models.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# False: train on the west facility. True: evaluate saved weights on the east facility.
test_mode = False
# Forwarded unchanged to IndustrialFacilityDataset.
# NOTE(review): presumably toggles parametric node features - confirm in src.graph.
use_params = True

In [None]:
# Pick the facility by mode: the west deck box for training, the east one for evaluation.
# Only the output directory, the dataset name and the short tag differ between the two.
if test_mode:
    _out_dir, _name, _tag = 'output/east_ref/', "eastdeckbox", 'east'
else:
    _out_dir, _name, _tag = 'output/west_ref/', "westdeckbox_ref", 'west'

path = Path(_out_dir)
dataset = IndustrialFacilityDataset(data_path, _name, use_params, path, _tag)
# The rest of the notebook works on the first graph of the dataset.
g = dataset[0]
print(g)

#### debugging element ids

In [None]:
# node ids don't seem to match - probably the graph dataset was extracted from a different version of ifc file with different element ids
# first, check ids in the merged file - compare with the ids from the param data
# also check ids in the merged file with node dataset

# import ifcopenshell
# from ifcopenshell.util.selector import Selector

# ifc = ifcopenshell.open(data_path +"merged.ifc")

In [None]:
#ifc_tee = ifcopenshell.open(data_path +"deckboxtee_ref.ifc")

In [None]:
# element_type = 'IFCPIPEFITTING'
# #element_type = 'IFCPIPESEGMENT'
# selector = Selector()
# elements = selector.parse(ifc, '.' + element_type)
# print(len(elements))
# ids= []
# for e in elements:
#     ids.append(e.id())
# print(len(ids), ids[:10])

In [None]:
# with open(data_path + 'nodes_westdeckbox_ref.pkl', 'rb') as f:
#     node_info = pickle.load(f)
#     nodes = node_info[0]
    

In [None]:
# ids_tee = [n[4] for n in nodes if n[0]==1]
# #ids_tee = [n[4] for n in nodes]
# print(len(ids_tee))
# # node_ids = [n[4] for n in nodes if n[0]==3 or n[0]==4]
# # print(len(node_ids))

In [None]:

# elements_tee = selector.parse(ifc_tee, '.' + element_type)
# print(len(elements_tee))
# ids_tee= []
# for e in elements_tee:
#     ids_tee.append(e.id())
# print(len(ids_tee), ids_tee[:10], max(ids_tee), max(ids))

In [None]:
# matches, non_matches = [], []

# for id1 in tqdm(ids_tee):
#     found = False
#     for i, id2 in enumerate(ids):
#         if id1 == id2:
#             matches.append((id1, i))
#             found = True
#             break
#     if not found:
#         non_matches.append(id1)
               
# print(len(matches), len(non_matches))


Prepare training and testing sets
---------------------------------

This cell randomly picks 10% of the edges for positive examples in
the test set, and leaves the rest for the training set. It then samples
the same number of edges for negative examples in both sets.

Ignore for evaluation: these cells are skipped when `test_mode` is `True`.



In [None]:
# Split edge set for training and testing
if not test_mode:
    u, v = g.edges()
    num_nodes = g.number_of_nodes()

    # Shuffle the edge ids; the first 10% become the held-out positive test edges.
    # NOTE(review): if g stores every connection in both directions, the reverse of a
    # held-out edge can still be present in the training graph - confirm.
    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * 0.1)
    train_size = g.number_of_edges() - test_size
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Sparse adjacency of the positive edges (kept for inspection).
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())),
                        shape=(num_nodes, num_nodes))

    # Find all negative edges: every ordered pair (i, j), i != j, that is not an edge.
    # A boolean mask is used instead of `1 - adj.todense() - eye`: the arithmetic form
    # yields -1 (i.e. "non-zero") for self-loops and for parallel edges, which would
    # wrongly list them as negatives, and it needs several dense float64 matrices.
    is_negative = np.ones((num_nodes, num_nodes), dtype=bool)
    is_negative[u.numpy(), v.numpy()] = False
    np.fill_diagonal(is_negative, False)
    neg_u, neg_v = np.nonzero(is_negative)  # row-major order, as np.where gave before
    del is_negative


In [None]:
def construct_negative_graph(g, total_train_neg_eids, train_size):
    """Sample a fresh graph of negative (non-existing) training edges.

    Parameters
    ----------
    g :
        Graph whose node count is used for the returned graph.
    total_train_neg_eids : numpy.ndarray
        Indices into the module-level ``neg_u`` / ``neg_v`` arrays that are
        allowed for training.
    train_size : int
        Number of negative edges to draw (drawn with replacement).

    Returns
    -------
    A DGL graph over the same nodes as ``g`` containing the sampled edges.
    """
    # First pick positions inside the pool, then map them to negative-edge ids.
    picks = np.random.choice(len(total_train_neg_eids), train_size)
    sampled_eids = total_train_neg_eids[picks]
    src, dst = neg_u[sampled_eids], neg_v[sampled_eids]
    return dgl.graph((src, dst), num_nodes=g.number_of_nodes())


In [None]:
if not test_mode:
    # Hold out `test_size` distinct negative edges for testing. Sampling without
    # replacement matters: with duplicates the test set is smaller than intended, and
    # the previous merge loop stopped advancing at the first duplicate, so every later
    # test negative also ended up in the training pool.
    # Raises ValueError if test_size exceeds the number of negative edges.
    # Note: legacy np.random.choice permutes the whole population when replace=False.
    test_neg_eids = np.sort(np.random.choice(len(neg_u), test_size, replace=False))

    # Training pool = every negative edge id that is not a test id.
    is_train = np.ones(len(neg_u), dtype=bool)
    is_train[test_neg_eids] = False
    total_train_neg_eids = np.flatnonzero(is_train)

    # Sanity check: the two sets partition the negatives, so this prints 0.
    print(len(test_neg_eids) + len(total_train_neg_eids) - len(neg_u))

    test_neg_u, test_neg_v = neg_u[test_neg_eids], neg_v[test_neg_eids]

In [None]:
# remove edges of testset for training
# `eids[:test_size]` are the shuffled edge ids chosen as positive test edges above;
# `train_g` is the graph the GNN is run on, so the model never sees them.
if not test_mode:
    train_g = dgl.remove_edges(g, eids[:test_size])

In [None]:
# construct the positive graph and the negative graph for the training set and the test set respectively.
# Each graph spans all nodes of `g` and only carries the edges to be scored.
if not test_mode:
    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    # The training negatives are not fixed: they are re-sampled by construct_negative_graph.
    #train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

GraphSAGE model
-------------------





In [None]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a 3-layer GraphSAGE model
class GraphSAGE(nn.Module):
    """Node encoder made of three stacked mean-aggregator SAGEConv layers.

    Parameters
    ----------
    in_feats : int
        Size of the input node features.
    h_feats : int
        Size of the hidden and output node embeddings.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        # The attribute names conv1..conv3 are the state_dict keys - keep them stable.
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')
        self.conv3 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        """Return one ``h_feats``-sized embedding per node of ``g``."""
        hidden = in_feat
        # ReLU follows every layer except the last one.
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(g, hidden))
        return self.conv3(g, hidden)

In [None]:
# compute edge features using dot product

import dgl.function as fn

class DotPredictor(nn.Module):
    """Scores an edge by the dot product of its two endpoint embeddings."""

    def forward(self, g, h):
        """Return one score per edge of ``g`` given node embeddings ``h``."""
        # local_scope keeps the temporary 'h' / 'score' fields from leaking into g.
        with g.local_scope():
            g.ndata['h'] = h
            # Writes an edge feature 'score' = <h[src], h[dst]>.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = g.edata['score']
            # u_dot_v yields a 1-element vector per edge; take that single column.
            return edge_scores[:, 0]

In [None]:
# ALTERNATIVE: compute edge features using an MLP

class MLPPredictor(nn.Module):
    """Scores an edge with an MLP applied to the concatenated endpoint embeddings.

    Layer widths: ``2*h_feats -> h_feats -> h_feats//2 -> 4 -> 1``.
    """

    def __init__(self, h_feats):
        super().__init__()
        half = h_feats // 2
        # The attribute names W1..W4 are the state_dict keys - keep them stable.
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, half)
        self.W3 = nn.Linear(half, 4)
        self.W4 = nn.Linear(4, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.
        """
        hidden = torch.cat((edges.src['h'], edges.dst['h']), dim=1)
        # ReLU after every layer but the last, which emits the raw logit.
        for layer in (self.W1, self.W2, self.W3):
            hidden = F.relu(layer(hidden))
        return {'score': self.W4(hidden).squeeze(1)}

    def forward(self, g, h):
        """Return one logit per edge of ``g`` given node embeddings ``h``."""
        # local_scope keeps the temporary 'h' / 'score' fields from leaking into g.
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

Training setup
-------------



The loss function is binary cross entropy loss.

\begin{align}\mathcal{L} = -\sum_{u\sim v\in \mathcal{D}}\left( y_{u\sim v}\log(\hat{y}_{u\sim v}) + (1-y_{u\sim v})\log(1-\hat{y}_{u\sim v}) \right)\end{align}

The evaluation metric is AUC.




In [None]:
# Width of the input node feature vectors; this is the `in_feats` of the GraphSAGE model built next.
print(g.ndata['feat'].shape[1])

In [None]:
from sklearn.metrics import roc_auc_score

feat_size = 16  # size of the hidden / output node embeddings

# train_g only exists outside test mode; either graph supplies the input feature width.
_feature_graph = g if test_mode else train_g
model = GraphSAGE(_feature_graph.ndata['feat'].shape[1], feat_size).to(torch.double)

# The MLP predictor is active; use DotPredictor() instead for a parameter-free score.
# Both networks are cast to double precision.
pred = MLPPredictor(feat_size).to(torch.double)


In [None]:
def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) over positive and negative edge scores.

    Parameters
    ----------
    pos_score, neg_score : torch.Tensor
        1-D logits of the edges that exist (label 1) and that do not exist (label 0).

    Returns
    -------
    torch.Tensor
        Scalar mean loss.
    """
    scores = torch.cat([pos_score, neg_score])
    # Build the labels with the scores' dtype and device. The models run in double
    # precision, and default float32 labels can be rejected by the in-place arithmetic
    # inside binary_cross_entropy_with_logits.
    labels = torch.cat([
        scores.new_ones(pos_score.shape[0]),
        scores.new_zeros(neg_score.shape[0]),
    ])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    """ROC AUC of the edge scores, positives labelled 1 and negatives 0.

    Parameters
    ----------
    pos_score, neg_score : torch.Tensor
        1-D scores of existing and non-existing edges.

    Returns
    -------
    float
        Area under the ROC curve as computed by sklearn's ``roc_auc_score``.
    """
    # detach/cpu so the call also works on tensors that track gradients or are not on the CPU.
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = np.concatenate([np.ones(pos_score.shape[0]), np.zeros(neg_score.shape[0])])
    return roc_auc_score(labels, scores)

In [None]:
# `numpy.lib.function_base` is a private module path that is not importable on
# NumPy >= 2.0; the same function is available from the public namespace.
from numpy import average
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pos_score, neg_score, threshold=0.5):
    """Thresholded classification metrics for edge logits.

    Parameters
    ----------
    pos_score, neg_score : torch.Tensor
        1-D logits of existing (label 1) and non-existing (label 0) edges.
    threshold : float
        An edge is predicted positive when ``sigmoid(logit) > threshold``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    # detach/cpu so the call also works on tensors that track gradients or are not on the CPU.
    probs = torch.sigmoid(torch.cat([pos_score, neg_score])).detach().cpu().numpy()
    # A single comparison decides the class. The previous two-step in-place assignment
    # kept a probability of exactly 1.0 positive even when it was not above the threshold.
    preds = (probs > threshold).astype(probs.dtype)
    labels = np.concatenate([np.ones(pos_score.shape[0]), np.zeros(neg_score.shape[0])])
    return (accuracy_score(labels, preds), precision_score(labels, preds),
            recall_score(labels, preds), f1_score(labels, preds))


### Evaluation 
Load the pre-trained model and run it on the entire dataset in batches (the set of negative edges is too large to score in one pass).

In [None]:
# Restore the trained encoder (`model`) and edge predictor (`pred`) from disk.
# NOTE(review): the checkpoint names say "4sage_3mlp", while the classes above define
# three SAGEConv layers and four Linear layers - confirm the files match the current
# architecture, otherwise load_state_dict will report missing/unexpected keys.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
if test_mode:

    # load model
    model.load_state_dict(torch.load(model_path + model_name))
    pred.load_state_dict(torch.load(model_path + pred_name))

In [None]:
# validation set
if test_mode:
    u, v = g.edges()
    eids = np.arange(g.number_of_edges())
    num_nodes = g.number_of_nodes()

    # Sparse adjacency of the positive edges (kept for inspection).
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())),
                        shape=(num_nodes, num_nodes))

    # All negative edges: every ordered pair (i, j), i != j, that is not an edge.
    # A boolean mask is used instead of `1 - adj.todense() - eye`: the arithmetic form
    # yields -1 (i.e. "non-zero") for self-loops and for parallel edges, which would
    # wrongly list them as negatives, and it needs several dense float64 matrices.
    is_negative = np.ones((num_nodes, num_nodes), dtype=bool)
    is_negative[u.numpy(), v.numpy()] = False
    np.fill_diagonal(is_negative, False)
    neg_u_full, neg_v_full = np.nonzero(is_negative)  # row-major order, as np.where gave before
    del is_negative
    print(len(neg_u_full))


In [None]:
# Persist the evaluated edge lists (positives, then all negatives) next to the model files.
if test_mode:
    for _fname, _payload in (('eval/pos_edges_test.pkl', [u, v]),
                             ('eval/neg_edges_test.pkl', [neg_u_full, neg_v_full])):
        with open(model_path + _fname, 'wb') as f:
            pickle.dump(_payload, f)
    print(len(u), len(neg_u_full))

In [None]:
# Score every positive and every negative edge of the test facility and store the raw logits.
if test_mode:
    with torch.no_grad():
        h = model(g, g.ndata['feat'])

        # evaluate positive examples
        val_pos_g = dgl.graph((u, v), num_nodes=g.number_of_nodes())
        pos_score = pred(val_pos_g, h).numpy()
        with open(model_path + '/eval/pos_score_test.pkl', 'wb') as f:
            pickle.dump(pos_score, f)

        # evaluate negative examples in chunks of `sample_size` edges
        sample_size = 10000000
        limit = math.ceil(len(neg_u_full)/sample_size)
        neg_scores = []
        for i in range(limit):
            start = i * sample_size
            # Slicing clamps at the end of the array, so the last (shorter) chunk needs no
            # special case. Chunk-local names are used so the module-level neg_u / neg_v
            # read by construct_negative_graph are not overwritten.
            chunk_u = neg_u_full[start:start + sample_size]
            chunk_v = neg_v_full[start:start + sample_size]
            print(i, len(chunk_u), chunk_u[0], chunk_v[0])

            val_neg_g = dgl.graph((chunk_u, chunk_v), num_nodes=g.number_of_nodes())
            neg_scores.append(pred(val_neg_g, h).numpy())

        # np.concatenate rejects an empty list, which happens when there are no negatives.
        neg_score = np.concatenate(neg_scores) if neg_scores else np.empty(0)
        with open(model_path + '/eval/neg_score_test.pkl', 'wb') as f:
            pickle.dump(neg_score, f)

In [None]:
# Reload the logits written by the evaluation cell, so the metrics below can be
# recomputed without re-running the model.
# NOTE(review): pickle.load executes arbitrary code from the file - only read files this notebook wrote.
if test_mode:

    with open(model_path + '/eval/pos_score_test.pkl', 'rb') as f:
        pos_score = pickle.load(f)

    with open(model_path + '/eval/neg_score_test.pkl', 'rb') as f:
        neg_score = pickle.load(f)

In [None]:
# Quick sanity check of the stored logits: counts, mean positive / negative logit, one sample value.
if test_mode:
    # ndarray.mean() is vectorised; the built-in sum() walks the array element by element
    # in Python, which is very slow for the full negative set.
    print(len(pos_score), len(neg_score), pos_score.mean(), neg_score.mean(), neg_score[10])

In [None]:
# Thresholded metrics over the complete positive and negative edge sets of the test facility.
if test_mode:
    metrics = compute_metrics(torch.from_numpy(pos_score), torch.from_numpy(neg_score))
    for _label, _value in zip(('accuracy', 'precision', 'recall', 'f1_score'), metrics):
        print(_label, _value)

### train / test

In [None]:
# Training relies on train_g, train_pos_g, test_pos_g, test_neg_g and the negative
# pool, all of which are only built when test_mode is False.
if not test_mode:
    # ----------- 3. set up loss and optimizer -------------- #
    optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.001)

    # ----------- 4. training -------------------------------- #
    all_logits = []
    accuracies = []  # test AUC, recorded every 5th epoch
    losses = []      # training loss at the same epochs

    for e in tqdm(range(1000)):
        # forward: a fresh set of negative edges is sampled every epoch
        train_neg_g = construct_negative_graph(g, total_train_neg_eids, train_size)

        h = model(train_g, train_g.ndata['feat'])
        pos_score = pred(train_pos_g, h)
        neg_score = pred(train_neg_g, h)
        loss = compute_loss(pos_score, neg_score)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 5 == 0:
            with torch.no_grad():
                # Re-embed with the weights just updated: the `h` above was produced
                # before optimizer.step(), so it no longer matches `pred`.
                h_eval = model(train_g, train_g.ndata['feat'])
                pos_score = pred(test_pos_g, h_eval)
                neg_score = pred(test_neg_g, h_eval)
                auc = compute_auc(pos_score, neg_score)
                metrics = compute_metrics(pos_score, neg_score)
                test_loss = compute_loss(pos_score, neg_score)
                print('epoch {}, training loss: {:.3f}, test loss: {:.3f}, auc: {:.3f}, f1: {:.3f}'.format(
                    e, loss.item(), test_loss.item(), auc, float(metrics[3])))

                accuracies.append(auc)
                losses.append(loss.item())

    # ----------- 5. check results ------------------------ #
    print("max AUC", max(accuracies))
    with torch.no_grad():
        # Final embeddings from the final weights; later cells reuse this `h`.
        h = model(train_g, train_g.ndata['feat'])
        pos_score = pred(test_pos_g, h)
        neg_score = pred(test_neg_g, h)
        print('AUC', compute_auc(pos_score, neg_score))



In [None]:

# Held-out metrics of the trained model. The test graphs only exist after the
# training-mode cells have run, so this cell is skipped in test mode.
if not test_mode:
    with torch.no_grad():
        # Recompute the embeddings so they reflect the final weights and carry no autograd history.
        h = model(train_g, train_g.ndata['feat'])
        pos_score = pred(test_pos_g, h)
        neg_score = pred(test_neg_g, h)
        metrics = compute_metrics(pos_score, neg_score)
        for _label, _value in zip(('accuracy', 'precision', 'recall', 'f1_score'), metrics):
            print(_label, _value)
        print('auc', compute_auc(pos_score, neg_score))

In [None]:
# Number of positive and negative test edges (both are sampled with size `test_size`).
# These arrays are only defined when the training-mode cells above have run.
print(len(test_pos_u), len(test_neg_u))

In [None]:
# calculate separate metrics based on element classes

# Unordered pairs of class ids (indices into `types`); the class id of a node is
# read from column 0 of its feature vector.
type_pairs = [(0., 0.), (0., 1.), (0., 2.), (0., 3.), (0., 4.),
              (1., 1.), (1., 2.), (1., 3.), (1., 4.), (2., 2.),
              (2., 3.), (2., 4.), (3., 3.), (3., 4.), (4., 4.)]

def split_graph(u, v, node_features, type_pairs):
    """Group edges by the classes of their two endpoints.

    Parameters
    ----------
    u, v :
        Source and destination node ids of the edges (CPU tensor or array).
    node_features : numpy.ndarray
        One row per node; column 0 holds the node's class id.
    type_pairs : list of tuple
        Class-id pairs to build a graph for. A pair is treated as unordered:
        an edge matches ``(a, b)`` when its endpoint classes are ``a, b`` in
        either order. (Previously an edge whose classes appeared in the
        reverse order of the listed pair was silently left out.)

    Returns
    -------
    dict
        Maps every pair to a DGL graph with one node per row of
        ``node_features`` and only the matching edges, in input order.
    """
    u = np.asarray(u, dtype=np.int64)
    v = np.asarray(v, dtype=np.int64)
    num_nodes = node_features.shape[0]
    u_types = node_features[u, 0]
    v_types = node_features[v, 0]

    subgraphs = {}
    for pair in type_pairs:
        first, second = pair
        match = ((u_types == first) & (v_types == second)) | \
                ((u_types == second) & (v_types == first))
        subgraphs[pair] = dgl.graph((torch.as_tensor(u[match]), torch.as_tensor(v[match])),
                                    num_nodes=num_nodes)
    return subgraphs

# The test edges and `h` only exist after the training-mode cells have run.
if not test_mode:
    _node_feats = g.ndata["feat"].numpy()
    pos_subgraphs = split_graph(test_pos_u, test_pos_v, _node_feats, type_pairs)
    neg_subgraphs = split_graph(test_neg_u, test_neg_v, _node_feats, type_pairs)

    for pair, pos_sub_g in pos_subgraphs.items():
        neg_sub_g = neg_subgraphs[pair]
        print((types[int(pair[0])], types[int(pair[1])]))
        print("positive edges", pos_sub_g.num_edges())
        print("negative edges", neg_sub_g.num_edges())

        with torch.no_grad():
            pos_score = pred(pos_sub_g, h)
            neg_score = pred(neg_sub_g, h)
            try:
                metrics = compute_metrics(pos_score, neg_score)
                print('accuracy', metrics[0])
                print('precision', metrics[1])
                print('recall', metrics[2])
                print('f1_score', metrics[3])
                print('auc', compute_auc(pos_score, neg_score))
            except Exception as err:
                # Metrics are undefined for some pairs (e.g. AUC needs both classes present);
                # report the reason and carry on with the next pair.
                print("metric error", err)



In [None]:
# save model
# torch.save(model.state_dict(), model_path + model_name)
# torch.save(pred.state_dict(), model_path + pred_name)