In [1]:
import numpy as np
import os

import torch
import torch_geometric
from torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Choose the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cpu


In [6]:
# Root folder of the project data. The default is the original cluster location;
# set the MSC_PROJECT_DATA_HOME environment variable to run the notebook on another
# machine without editing this cell.
data_home = os.environ.get("MSC_PROJECT_DATA_HOME", "/lyceum/jhk1c21/msc_project/data")

# Sub-folder of data_home named 'graph'; shown as the cell output.
graph_dir = os.path.join(data_home, 'graph')
graph_dir

'/lyceum/jhk1c21/msc_project/data/graph'

In [4]:
def _graph_file(filename):
    """Return the path of `filename` inside the 'graph' folder under `data_home`."""
    return os.path.join(data_home, 'graph', filename)


# load embeddings (one array per text field)
title_embeddings = np.load(_graph_file('title.npy'))
keywords_embeddings = np.load(_graph_file('keywords.npy'))
abstract_embeddings = np.load(_graph_file('abstract.npy'))

# load files: paper ids, edge list and per-paper domains
filtered_id = np.load(_graph_file('id.npy'))
edges = np.load(_graph_file('edges.npy'))
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so this file
# must come from a trusted source.
with open(_graph_file('fos.npy'), 'rb') as fos_file:
    domains = np.load(fos_file, allow_pickle=True)

In [5]:
# Lookup tables from paper id to that paper's embedding row. Row i of each embedding
# array is paired with filtered_id[i]; on a repeated id the last row wins.
title_dict = dict(zip(filtered_id, title_embeddings))
keywords_dict = dict(zip(filtered_id, keywords_embeddings))
abstract_dict = dict(zip(filtered_id, abstract_embeddings))

In [6]:
# Row i of the embedding arrays is paired with filtered_id[i] (the same pairing the
# *_dict lookups use), and the node-feature matrix is built directly from those
# arrays. A node's integer index therefore has to be its position in filtered_id.
# Numbering nodes by iterating over a set would make every edge point at an
# unrelated feature row.
node_to_int = {node_id: i for i, node_id in enumerate(filtered_id)}

# Ids that have embeddings AND take part in at least one edge, in filtered_id order.
edge_endpoints = set(edge[0] for edge in edges) | set(edge[1] for edge in edges)
unique_node_ids = [node_id for node_id in node_to_int if node_id in edge_endpoints]

# Convert edges with string node IDs to integer node IDs
edges_int = []
for src, tgt in edges:
    src_id = node_to_int.get(src)
    tgt_id = node_to_int.get(tgt)

    # Drop edges that touch a paper without embeddings (id not in filtered_id).
    if src_id is None or tgt_id is None:
        continue
    edges_int.append((src_id, tgt_id))


# htable_idx_to_id = { i: item for i, item in enumerate(filtered_id) }
# htable_id_to_idx = { item: i for i, item in enumerate(filtered_id) }

In [10]:
# Combine embeddings: one feature row per paper, with the title, keywords and
# abstract embeddings placed side by side.
node_features = np.concatenate([title_embeddings, keywords_embeddings, abstract_embeddings], axis=1)

# Convert edge information to a torch tensor.
# reshape(-1, 2) keeps the (num_edges, 2) layout even when edges_int is empty, so the
# transposed tensor always has shape (2, num_edges) instead of collapsing to (0,).
edge_index = torch.tensor(edges_int, dtype=torch.long).reshape(-1, 2).t().contiguous().to(device)

# Convert node features to torch tensor
x = torch.tensor(node_features, dtype=torch.float32).to(device)

# Every edge endpoint must address an existing feature row; fail early with a clear
# message rather than with an indexing error inside the convolution layers.
if edge_index.numel() > 0 and int(edge_index.max()) >= x.size(0):
    raise ValueError(
        f"edge_index refers to node {int(edge_index.max())} but x has only {x.size(0)} rows"
    )

data = Data(x=x, edge_index=edge_index)

RuntimeError: No CUDA GPUs are available

In [None]:
from torch_geometric.nn import GCNConv

class GNN(torch.nn.Module):
    """Two stacked ``GCNConv`` layers, each followed by a ReLU."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two convolutions: in -> hidden and hidden -> out."""
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return the node embeddings for a graph object exposing `x` and `edge_index`."""
        hidden = torch.relu(self.conv1(data.x, data.edge_index))
        # The ReLU is also applied to the last layer, so the outputs are non-negative.
        return torch.relu(self.conv2(hidden, data.edge_index))

In [None]:
# Create unique domain to integer mapping: every distinct domain gets a column index.
all_domains = set(name for paper_domains in domains for name in paper_domains)
domain_to_int = {name: column for column, name in enumerate(all_domains)}

num_domains = len(all_domains)
num_papers = len(domains)

# Multi-hot matrix: entry [paper, column] is 1 when the paper lists that domain.
domain_matrix = np.zeros((num_papers, num_domains), dtype=np.float16)
for paper_idx, paper_domains in enumerate(domains):
    columns = [domain_to_int[domain] for domain in paper_domains]
    domain_matrix[paper_idx, columns] = 1

domain_matrix = torch.tensor(domain_matrix).to(device)

In [None]:
def domain_loss(embeddings, domains=None):
    """Penalize embedding similarity between papers that share no domain.

    Args:
        embeddings: 2-D float tensor with one row per paper.
        domains: optional 2-D multi-hot tensor (papers x domains) whose rows line up
            with the rows of `embeddings`. Defaults to the notebook-level
            `domain_matrix`.

    Returns:
        Scalar tensor: the sum of cosine similarities over all ordered pairs of
        distinct papers that have no domain in common.
    """
    if domains is None:
        domains = domain_matrix
    # The notebook stores domain_matrix as float16; do the matrix product in the
    # embeddings' dtype and on their device so the operands match.
    domains = domains.to(device=embeddings.device, dtype=embeddings.dtype)

    # Cosine similarity: rows are scaled to unit length so each entry is bounded and
    # the loss cannot be driven down simply by inflating the embedding norms.
    unit = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    similarity = torch.mm(unit, unit.t())

    # mm(D, D^T) counts shared domains (0, 1, 2, ...). Turn it into a 0/1 mask;
    # `1 - count` would go negative for pairs that share two or more domains.
    shared_domains = torch.mm(domains, domains.t())
    different_domains = (shared_domains == 0).to(embeddings.dtype)
    # A paper is never "in a different domain" from itself, even if it lists none.
    different_domains.fill_diagonal_(0)

    # Penalize similarity for different domains
    loss = (similarity * different_domains).sum()
    return loss

In [None]:
# Initialize the GNN model. The output width is set equal to the input feature width,
# with a 64-unit hidden layer in between.
in_channels = data.x.size(1)
model = GNN(in_channels, 64, in_channels).to(device)

# Adam over all model parameters with a learning rate of 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 100 full-graph optimisation steps on the domain loss.
for epoch in range(100):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    out = model(data)
    loss = domain_loss(out)
    loss.backward()
    optimizer.step()
