In [12]:
import torch
from torch_geometric.datasets import DBLP
from torch_geometric.utils import negative_sampling, to_dense_adj
from edge_predictor import EdgePredictor

import torch_geometric.transforms as T
import torch.nn.functional as F
import graph_polluters
import graph_learning
import matplotlib.pyplot as plt
from copy import deepcopy

# Root directory handed to the DBLP dataset loader.
path = './data/dblp'
# Conference nodes get a single constant (all-ones) feature column via T.Constant.
dataset = DBLP(root=path, transform=T.Constant(node_types='conference'))
# The dataset holds one heterogeneous graph; take it.
data = dataset[0]
# Prefer the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Get the conferences at which each author has published papers

In [3]:
# Node counts are read from the graph instead of being hard-coded.
num_authors = data['author'].num_nodes
num_papers = data['paper'].num_nodes
num_conferences = data['conference'].num_nodes

# to_dense_adj treats source and target indices as one shared index space and
# returns a square [1, N, N] matrix. Fixing N via max_num_nodes makes the shape
# independent of which indices happen to occur in the edge list; the bipartite
# block we need is then cut out by slicing. Indexing the batch dimension with 0
# (rather than squeeze()) keeps the result 2-D even if a node type has one node.
# clone() copies the slice so the large square matrix is not kept alive by a view.
author_to_paper = to_dense_adj(
    data['author', 'to', 'paper'].edge_index,
    max_num_nodes=max(num_authors, num_papers),
)[0, :num_authors, :num_papers].detach().clone()
paper_to_conference = to_dense_adj(
    data['paper', 'to', 'conference'].edge_index,
    max_num_nodes=max(num_papers, num_conferences),
)[0, :num_papers, :num_conferences].detach().clone()

In [4]:
# Chain the two relations: entry (a, c) sums over all papers p the product
# author_to_paper[a, p] * paper_to_conference[p, c], i.e. it counts the
# author -> paper -> conference paths between author a and conference c.
author_to_conference = torch.matmul(author_to_paper, paper_to_conference)

## Use cosine similarity to decide whether to add a new edge

In [5]:
def make_similar_edges(adjacency_matrix, min_similarity, target_device=None):
    """Propose edges between rows whose cosine similarity exceeds a threshold.

    Every unordered pair of rows (i, j) with i < j is compared exactly once,
    so each selected pair yields a single edge i -> j; the reverse edge
    j -> i is not emitted.

    Args:
        adjacency_matrix: 2-D tensor with one row per node; the rows are the
            vectors that get compared.
        min_similarity: Exclusive lower bound. A pair is kept only when its
            cosine similarity is strictly greater than this value.
        target_device: Device on which the comparison runs and on which the
            result is returned. When ``None`` (the default) the notebook-level
            ``device`` is used, which matches the previous behaviour.

    Returns:
        Tensor of shape ``[2, num_selected_pairs]`` holding the row indices of
        the selected pairs (first row: i, second row: j).
    """
    if target_device is None:
        # Fall back to the notebook-wide default device.
        target_device = device
    adjacency_matrix = adjacency_matrix.to(target_device)

    # All index pairs (i, j) with i < j, in lexicographic order.
    indices = torch.combinations(torch.arange(adjacency_matrix.shape[0])).to(target_device)
    # One gathered row per pair: memory grows quadratically with the number of rows.
    first_embeddings = adjacency_matrix[indices[:, 0]]
    second_embeddings = adjacency_matrix[indices[:, 1]]
    similarities = torch.nn.functional.cosine_similarity(first_embeddings, second_embeddings, dim=1)
    # Transpose from [num_pairs, 2] to the [2, num_pairs] edge_index layout.
    return indices[similarities > min_similarity].T

In [6]:
# Link every author pair whose conference profiles have a cosine similarity above 0.99.
new_edges = make_similar_edges(adjacency_matrix=author_to_conference, min_similarity=0.99)
# Show how many edges were proposed ([2, num_edges]).
new_edges.size()

torch.Size([2, 273191])

In [7]:
# Register the similarity edges as a new (author, to, author) relation on the graph.
# This mutates `data` in place; later cells work on deep copies of it.
data[('author', 'to', 'author')].edge_index = new_edges

In [8]:
# Display the graph summary to confirm the (author, to, author) edge type is now present.
data

HeteroData(
  author={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057],
  },
  paper={ x=[14328, 4231] },
  term={ x=[7723, 50] },
  conference={
    num_nodes=20,
    x=[20, 1],
  },
  (author, to, paper)={ edge_index=[2, 19645] },
  (paper, to, author)={ edge_index=[2, 19645] },
  (paper, to, term)={ edge_index=[2, 85810] },
  (paper, to, conference)={ edge_index=[2, 14328] },
  (term, to, paper)={ edge_index=[2, 85810] },
  (conference, to, paper)={ edge_index=[2, 14328] },
  (author, to, author)={ edge_index=[2, 273191] }
)

In [9]:
# Baseline run on the graph that includes the similarity-based (author, to, author) edges.
graph_learning.set_seed()
# Work on a device-resident deep copy so the notebook-level `data` is left unchanged.
data_copy = deepcopy(data).to(device)
model, optimizer = graph_learning.init_parameters(data_copy, device)

# Per-epoch accuracy histories; list index 0 belongs to loop epoch 1.
train_accs, val_accs, test_accs = [], [], []
for epoch in range(1, 100):
    loss = graph_learning.train_epoch(data=data_copy, model=model, optimizer=optimizer)
    train_acc, val_acc, test_acc = graph_learning.test_epoch(data=data_copy, model=model)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    test_accs.append(test_acc)

# Model selection: first list index with the highest validation accuracy.
# NOTE(review): best_epoch is a 0-based list index, whereas the loop counts epochs from 1.
best_epoch = max(range(len(val_accs)), key=val_accs.__getitem__)
train_acc = train_accs[best_epoch]
val_acc = val_accs[best_epoch]
test_acc = test_accs[best_epoch]
print(
    f'Enhanced Baseline: Best Epoch: {best_epoch} Train: {train_acc:.4f}, '
    f'Val: {val_acc:.4f}, Test: {test_acc:.4f}'
)

Enhanced Baseline: Best Epoch: 16 Train: 1.0000, Val: 0.9025, Test: 0.8978


In [10]:
# Same experiment as the enhanced baseline, but on a graph passed through remove_edges first.
graph_learning.set_seed()
# Work on a device-resident deep copy so the notebook-level `data` is left unchanged.
data_copy = deepcopy(data).to(device)
# presumably 0.5 is the fraction of edges to drop; verify against graph_polluters.remove_edges
data_copy = graph_polluters.remove_edges(data_copy, 0.5)
model, optimizer = graph_learning.init_parameters(data_copy, device)

# Per-epoch accuracy histories; list index 0 belongs to loop epoch 1.
train_accs, val_accs, test_accs = [], [], []
for epoch in range(1, 100):
    loss = graph_learning.train_epoch(data=data_copy, model=model, optimizer=optimizer)
    train_acc, val_acc, test_acc = graph_learning.test_epoch(data=data_copy, model=model)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    test_accs.append(test_acc)

# Model selection: first list index with the highest validation accuracy.
# NOTE(review): best_epoch is a 0-based list index, whereas the loop counts epochs from 1.
best_epoch = max(range(len(val_accs)), key=val_accs.__getitem__)
train_acc = train_accs[best_epoch]
val_acc = val_accs[best_epoch]
test_acc = test_accs[best_epoch]
print(
    f'Enhanced Sparsified: Best Epoch: {best_epoch} Train: {train_acc:.4f}, '
    f'Val: {val_acc:.4f}, Test: {test_acc:.4f}'
)

Enhanced Sparsified: Best Epoch: 10 Train: 0.9925, Val: 0.9000, Test: 0.9021
