In [2]:
import os.path as osp
from tqdm import tqdm
import numpy as np
import random

import torch
from torch_geometric.datasets import DBLP
from torch_geometric.utils import negative_sampling
from modules.edge_predictor import EdgePredictor

import torch_geometric.transforms as T
import torch.nn.functional as F
import graph_polluters
import graph_learning


path = './data/dblp'
# We initialize conference node features with a single one-vector as feature:
dataset = DBLP(path, transform=T.Constant(node_types='conference'))
data = dataset[0]
print(data)

HeteroData(
  author={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057],
  },
  paper={ x=[14328, 4231] },
  term={ x=[7723, 50] },
  conference={
    num_nodes=20,
    x=[20, 1],
  },
  (author, to, paper)={ edge_index=[2, 19645] },
  (paper, to, author)={ edge_index=[2, 19645] },
  (paper, to, term)={ edge_index=[2, 85810] },
  (paper, to, conference)={ edge_index=[2, 14328] },
  (term, to, paper)={ edge_index=[2, 85810] },
  (conference, to, paper)={ edge_index=[2, 14328] }
)


In [3]:
data['author'][''] = torch.zeros_like(data['author'].x)

In [4]:
data

HeteroData(
  author={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057],
    thing=[4057, 334],
  },
  paper={ x=[14328, 4231] },
  term={ x=[7723, 50] },
  conference={
    num_nodes=20,
    x=[20, 1],
  },
  (author, to, paper)={ edge_index=[2, 19645] },
  (paper, to, author)={ edge_index=[2, 19645] },
  (paper, to, term)={ edge_index=[2, 85810] },
  (paper, to, conference)={ edge_index=[2, 14328] },
  (term, to, paper)={ edge_index=[2, 85810] },
  (conference, to, paper)={ edge_index=[2, 14328] }
)

In [10]:
edge_predictor = EdgePredictor(data.metadata(), hidden_channels=10, num_layers=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, edge_predictor = data.to(device), edge_predictor.to(device)

edge_types = [("paper", "to", "author"), ("paper", "to", "term"), ("paper", "to", "conference")]
rev_edge_types = [("author", "to", "paper"), ("term", "to", "paper"), ("conference", "to", "paper")]
split_transform = T.RandomLinkSplit(num_val = 0.2, num_test=0, edge_types=edge_types, rev_edge_types=rev_edge_types)

#todo add train data
train_data, val_data, _ = split_transform(data)

with torch.no_grad():  # Initialize lazy modules.
    out = edge_predictor(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(edge_predictor.parameters(), lr=0.005, weight_decay=0.001)


def train_predictor(train_data):
    edge_predictor.train()
    optimizer.zero_grad()
    embedding = edge_predictor(train_data.x_dict, train_data.edge_index_dict)

    similarities, labels = test_random_edges(embedding, train_data, 1000, 1000)

    loss = F.binary_cross_entropy(similarities, labels)
    loss.backward()
    optimizer.step()
    return float(loss)

def test_random_edges(embedding, data, num_pos, num_neg):
    '''
    Return the similarity values of positive and negative examples of edges
    '''
    similarities, labels = [],[]
    for edge_type in edge_types:
        # Get positive examples
        indices = torch.randperm(data.edge_index_dict[edge_type].size()[1])[:num_pos]
        pos_examples = data.edge_index_dict[edge_type][:,indices]
        pos_labels = torch.ones(num_pos, device=device)
    
        # Get negative examples
        num_nodes = data[edge_type[0]]['x'].size()[0], data[edge_type[2]]['x'].size()[0]
        neg_examples = negative_sampling(
            data.edge_index_dict[edge_type],
            num_nodes=num_nodes,
            force_undirected=True,
            num_neg_samples=num_neg
        )
        neg_labels = torch.rand(neg_examples.size()[1], device=device)

        # concatanate and shuffle
        indices = torch.randperm(num_pos+num_neg)
        edges = torch.cat((pos_examples, neg_examples), 1)[:,indices]
        labels.append(torch.cat((pos_labels, neg_labels))[indices])
        similarities.append(test_edges_similarity(edges, embedding, edge_type[0], edge_type[2]))
    similarities, labels = torch.concat(similarities), torch.concat(labels)
    return similarities, labels

def test_edges_similarity(edges, embedding, node_type1, node_type2):
        '''
        Find the cosine similarity of the given tensor of edges (from node_type1 to node_type2) 
        with regard to the given embedding
        '''
        # get embeddings for the nodes in the edges
        x1 = torch.index_select(embedding[node_type1], 0, edges[0])
        x2 = torch.index_select(embedding[node_type2], 0, edges[1])
        # calculate cosine similarity
        return torch.cosine_similarity(x1, x2, dim=1)/2 + 0.5

@torch.no_grad()
def test_predictor(test_data):
    edge_predictor.eval()
    embedding = edge_predictor(test_data.x_dict, test_data.edge_index_dict)
    similarities, labels = test_random_edges(embedding, test_data, 1000, 1000)
    loss = F.binary_cross_entropy(similarities, labels)
    return loss

@torch.no_grad()
def add_potential_edges(model, data, similarity_theshold = 0.9, number_to_test=100000):
    embedding = model(train_data.x_dict, train_data.edge_index_dict)
    for edge_type in edge_types:
        # Get negative examples
        num_nodes = data[edge_type[0]]['x'].size()[0], data[edge_type[2]]['x'].size()[0]
        edges = negative_sampling(
            data.edge_index_dict[edge_type],
            num_nodes=num_nodes,
            force_undirected=True,
            num_neg_samples=number_to_test
        )
        similarities = test_edges_similarity(edges, embedding, edge_type[0], edge_type[2])
        new_edges = edges[:,similarities > similarity_theshold]
        data[edge_type].edge_index = torch.cat((data[edge_type].edge_index, new_edges), 1)
        data[edge_type[::-1]].edge_index = torch.cat((data[edge_type[::-1]].edge_index, new_edges[[1,0],:]), 1)
        print(f'Added {new_edges.size()[1]} edges')

In [13]:
train_data[('author', 'to', 'paper')]

{'edge_index': tensor([[ 1039,  1005,  1545,  ...,   274,  3885,  1151],
        [ 1447,  6888,  9418,  ...,   123, 14243, 11934]], device='cuda:0')}

In [4]:
#todo save best model
for epoch in tqdm(range(1, 201)):
    train_loss = train_predictor(train_data)
    test_loss = test_predictor(val_data)

100%|██████████| 200/200 [00:11<00:00, 17.67it/s]


## Baseline

In [6]:
graph_learning.set_seed()
dataset_copy = dataset.copy()
data_copy = dataset_copy[0]
data_copy.to(device)
model, optimizer = graph_learning.init_parameters(data_copy)
train_accs, val_accs, test_accs = [],[],[]
for epoch in range(1, 100):
    loss = graph_learning.train_epoch(data=data_copy, model=model, optimizer=optimizer)
    train_acc, val_acc, test_acc = graph_learning.test_epoch(data = data_copy, model=model)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    test_accs.append(test_acc)
best_epoch = max(enumerate(val_accs),key=lambda x: x[1])[0]
train_acc, val_acc, test_acc = train_accs[best_epoch], val_accs[best_epoch], test_accs[best_epoch]
print(f'Baseline, Train: {train_acc:.4f}, '
        f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Baseline, Train: 1.0000, Val: 0.8025, Test: 0.8090


## 50% Sparse Edges
todo more percentages

In [7]:
graph_learning.set_seed()
dataset_copy = dataset.copy()
data_copy = dataset_copy[0]
data_copy.to(device)
data_copy = graph_polluters.remove_edges(data_copy, 0.5)
model, optimizer = graph_learning.init_parameters(data_copy)
train_accs, val_accs, test_accs = [],[],[]
for epoch in range(1, 100):
    loss = graph_learning.train_epoch(data=data_copy, model=model, optimizer=optimizer)
    train_acc, val_acc, test_acc = graph_learning.test_epoch(data = data_copy, model=model)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    test_accs.append(test_acc)
best_epoch = max(enumerate(val_accs),key=lambda x: x[1])[0]
train_acc, val_acc, test_acc = train_accs[best_epoch], val_accs[best_epoch], test_accs[best_epoch]
print(f'Baseline, Train: {train_acc:.4f}, '
        f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Baseline, Train: 1.0000, Val: 0.7400, Test: 0.7691


## 50% Sparse Edges + Pseudoedges

In [9]:
graph_learning.set_seed()

#copy the dataset
dataset_copy = dataset.copy()
data_copy = dataset_copy[0]
data_copy.to(device)

# remove edges
data_copy = graph_polluters.remove_edges(data_copy, 0.5)

# add potential edges
add_potential_edges(edge_predictor, data_copy, 0.9)

model, optimizer = graph_learning.init_parameters(data_copy)
train_accs, val_accs, test_accs = [],[],[]
for epoch in range(1, 100):
    loss = graph_learning.train_epoch(data=data_copy, model=model, optimizer=optimizer)
    train_acc, val_acc, test_acc = graph_learning.test_epoch(data = data_copy, model=model)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    test_accs.append(test_acc)
best_epoch = max(enumerate(val_accs),key=lambda x: x[1])[0]
train_acc, val_acc, test_acc = train_accs[best_epoch], val_accs[best_epoch], test_accs[best_epoch]
print(f'Baseline, Train: {train_acc:.4f}, '
        f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Added 4176 edges
Added 2622 edges
Added 5445 edges
Baseline, Train: 1.0000, Val: 0.7300, Test: 0.7749
