In [15]:
import os

In [2]:
# Select the PyTorch backend for DGL. This assignment is deliberately placed
# ahead of `import dgl` below: DGL picks its backend from this variable at import.
os.environ["DGLBACKEND"] = "pytorch"

import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [5]:
# Root directory of the project data. The default is the original cluster
# location; set the MSC_PROJECT_DATA environment variable to point the notebook
# at a copy of the data on another machine without editing this cell.
data_home = os.environ.get("MSC_PROJECT_DATA", "/lyceum/jhk1c21/msc_project/data")
# Folder holding the filtered graph files read by the cells below.
filtered_path = os.path.join(data_home, "graph", "filtered")

In [6]:
# Read the filtered graph files: the node table (CSV), the array of original
# node ids, and the edge array whose two columns are later used as src / des.
nodes = pd.read_csv(os.path.join(filtered_path, "filtered_nodes.csv"))
ids, edges = (
    np.load(os.path.join(filtered_path, file_name))
    for file_name in ("id.npy", "eleminated_edges.npy")
)

In [7]:
# Edge list as a two-column frame: 'src' is edge column 0, 'des' is edge column 1.
df = pd.DataFrame({'src': edges[:, 0], 'des': edges[:, 1]})

In [8]:
# convert id from str to numbers
# id_to_int: original id -> position of that id in `ids`
# int_to_id: the inverse lookup, position -> original id
id_to_int = {original_id: i for i, original_id in enumerate(ids)}
int_to_id = {i: original_id for original_id, i in id_to_int.items()}

# Map straight from the raw `edges` array instead of from the df columns.
# Re-mapping df['src'] / df['des'] in place is not re-runnable: on a second run
# the columns already hold integers, which are not keys of id_to_int.
df['src'] = [id_to_int[original_id] for original_id in edges[:, 0]]
df['des'] = [id_to_int[original_id] for original_id in edges[:, 1]]

In [11]:
# Load the four per-node embedding arrays from the filtered folder.
title, keyword, abstract, domain = (
    np.load(os.path.join(filtered_path, file_name))
    for file_name in ("title.npy", "keywords.npy", "abstract.npy", "domain_embedding.npy")
)

In [12]:
# Number of rows in the domain embedding array
domain.shape[0]

148039

In [17]:
# Build the citation graph with dgl.graph (constructing via dgl.DGLGraph(data)
# is deprecated in DGL). Endpoints are passed as integer tensors.
src_nodes = torch.tensor(df['src'].to_numpy())
dst_nodes = torch.tensor(df['des'].to_numpy())

# Fix the node count to the number of feature rows instead of letting DGL infer
# it from the largest edge index, so nodes without any edge are still present
# and the ndata assignments below always match the graph size.
citation_network = dgl.graph((src_nodes, dst_nodes), num_nodes=len(title))

# Attach the per-node embeddings as node features.
citation_network.ndata['title'] = torch.tensor(title)
citation_network.ndata['keyword'] = torch.tensor(keyword)
citation_network.ndata['abstract'] = torch.tensor(abstract)
citation_network.ndata['domain'] = torch.tensor(domain)

In [18]:
citation_network

Graph(num_nodes=148039, num_edges=1412315,
      ndata_schemes={'title': Scheme(shape=(300,), dtype=torch.float32), 'keyword': Scheme(shape=(300,), dtype=torch.float32), 'abstract': Scheme(shape=(300,), dtype=torch.float32), 'domain': Scheme(shape=(300,), dtype=torch.float32)}
      edata_schemes={})

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GATConv

class GATLayer(nn.Module):
    """Single-head graph attention layer.

    Wraps ``GATConv(in_dim, out_dim, num_heads=1)`` and squeezes dimension 1 of
    its output, which is the head axis in DGL's GATConv output layout.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.gatconv = GATConv(in_dim, out_dim, num_heads=1)

    def forward(self, g, h):
        """Apply the attention convolution to features ``h`` on graph ``g``."""
        attended = self.gatconv(g, h)
        return torch.squeeze(attended, dim=1)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity for titles, abstracts, keywords, and domains
def compute_similarity(node1, node2):
    """Score how alike two nodes are from their embedding vectors.

    Both arguments are mappings with the keys 'title', 'abstract', 'keywords'
    and 'domain', each holding one vector. The score is the sum of the title,
    abstract and keyword cosine similarities minus the domain dissimilarity
    (1 - domain cosine similarity).
    """
    def _cosine(field):
        # cosine_similarity works on 2-D inputs, so wrap each vector in a list
        # and read the single entry of the resulting 1x1 matrix.
        return cosine_similarity([node1[field]], [node2[field]])[0][0]

    title_similarity = _cosine('title')
    abstract_similarity = _cosine('abstract')
    keyword_similarity = _cosine('keywords')
    domain_dissimilarity = 1 - _cosine('domain')

    # Combine these based on your specific needs
    return title_similarity + abstract_similarity + keyword_similarity - domain_dissimilarity

# Per-node embedding arrays, keyed the way compute_similarity expects them.
# (The loaded arrays are named title / abstract / keyword / domain.)
node_features = {'title': title, 'abstract': abstract, 'keywords': keyword, 'domain': domain}

# Initialize lists to hold pairs and labels
pairs = []
labels = []

# Loop over edges in the graph to create pairs and labels.
# df['src'] / df['des'] hold the integer node indices; the raw `edges` array
# holds the original ids and cannot be used to index the embedding arrays.
# NOTE: compute_similarity is called once per edge, so this loop is slow on the
# full edge list.
for u, v in zip(df['src'].tolist(), df['des'].tolist()):
    node1 = {field: values[u] for field, values in node_features.items()}
    node2 = {field: values[v] for field, values in node_features.items()}

    similarity = compute_similarity(node1, node2)

    # Based on a threshold, decide if the pair is similar or dissimilar:
    # label 0 marks a similar pair, label 1 a dissimilar one.
    if similarity > 0.5:  # This is a threshold you can tune
        labels.append(0)
    else:
        labels.append(1)

    pairs.append((u, v))

# Convert pairs and labels to tensors
pairs = torch.LongTensor(pairs)
labels = torch.FloatTensor(labels)


In [None]:
# GATConv rejects graphs containing 0-in-degree nodes by default, so message
# passing runs on a copy of the citation graph with self-loops added.
g = dgl.add_self_loop(citation_network)

# Titles are used as node features
h = g.ndata['title']

# Initialize the model and loss
# NOTE(review): this call matches the three-argument GATModel(in_dim, hidden_dim,
# out_dim); the notebook also defines a GATModel that requires num_heads.
# NOTE(review): ContrastiveLoss is not defined in the visible cells; it is
# assumed to accept (output1, output2, label) — confirm.
model = GATModel(h.shape[1], 128, 64)
loss_fn = ContrastiveLoss()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Node indices of the two members of every labelled pair
first_idx, second_idx = pairs[:, 0], pairs[:, 1]

# Training loop
for epoch in range(50):
    model.train()
    # Forward pass over the whole graph: one embedding per node
    output = model(g, h)

    # Compute contrastive loss between the embeddings of each labelled pair
    output1 = output[first_idx]
    output2 = output[second_idx]
    loss = loss_fn(output1, output2, labels)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch}, Loss: {loss.item()}')


In [None]:
class GATModel(nn.Module):
    """Two stacked GATLayer blocks with a ReLU between them.

    NOTE: a later cell of this notebook defines another ``GATModel`` that also
    takes ``num_heads``; the name refers to whichever definition ran last.
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.layer1 = GATLayer(in_dim, hidden_dim)
        self.layer2 = GATLayer(hidden_dim, out_dim)

    def forward(self, g, h):
        """Return the output of the second layer for features ``h`` on graph ``g``."""
        hidden = F.relu(self.layer1(g, h))
        return self.layer2(g, hidden)


In [57]:
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F

from dgl.nn import GATConv

class GATModel(nn.Module):
    """Two-layer GAT: a multi-head first layer and a single-head output layer.

    The first layer's heads are flattened together, so the second layer takes
    ``hidden_dim * num_heads`` input features.

    NOTE: an earlier cell of this notebook defines a three-argument ``GATModel``;
    the name refers to whichever definition ran last.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, num_heads):
        super().__init__()
        self.gat1 = GATConv(in_dim, hidden_dim, num_heads)
        self.gat2 = GATConv(hidden_dim * num_heads, out_dim, 1)

    def forward(self, g, h):
        """Return the second layer's output for features ``h`` on graph ``g``."""
        # Merge everything after dimension 0 (heads x hidden_dim) into one axis
        multi_head = self.gat1(g, h).flatten(start_dim=1)
        activated = F.elu(multi_head)
        # Single output head: squeeze dimension 1 of the result
        return self.gat2(g, activated).squeeze(dim=1)

In [None]:
def custom_loss(edges, node_embeddings, domain_data):
    """Negative sum of per-edge embedding similarity, damped by domain difference.

    ``edges[0]`` and ``edges[1]`` are the citing and cited node indices. For
    every edge the cosine similarity of the two node embeddings is divided by
    ``1 + domain_difference``; the loss is minus the sum over all edges.

    NOTE(review): ``calculate_domain_difference`` is not defined in the visible
    cells; it is called with the citing and cited rows of ``domain_data``.
    """
    citing, cited = edges[0], edges[1]

    # Calculate similarity score using cosine similarity
    similarity_score = F.cosine_similarity(
        node_embeddings[citing], node_embeddings[cited], dim=1
    )

    # Calculate domain difference
    domain_difference = calculate_domain_difference(domain_data[citing], domain_data[cited])

    # Custom Loss: This is just an example; you might want to use a different form
    damped_similarity = similarity_score / (1 + domain_difference)
    return -torch.sum(damped_similarity)

In [None]:
# GATConv rejects graphs containing 0-in-degree nodes by default, so message
# passing runs on a copy of the citation graph with self-loops added.
g = dgl.add_self_loop(citation_network)

# Node features and domain embeddings come from the graph's ndata. The graph has
# no 'feature' field; the title embeddings are used here, as in the contrastive
# training cell — NOTE(review): confirm this is the intended input.
features = g.ndata['title']
domain_data = citation_network.ndata['domain']

# The loss is computed over the real citation edges only (no self-loops). A
# dedicated name keeps the `edges` array loaded from disk from being overwritten.
citation_edges = citation_network.edges()

# Initialize the model and optimizer; in_dim follows the feature width
model = GATModel(in_dim=features.shape[1], hidden_dim=64, out_dim=32, num_heads=4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(100):
    model.train()

    # Forward pass
    node_embeddings = model(g, features)

    # Calculate loss
    loss = custom_loss(citation_edges, node_embeddings, domain_data)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch}, Loss: {loss.item()}')


# [Loss function] Contrastive learning

In [None]:
from torch.nn import TripletMarginLoss

# Triplet margin loss with margin 1.0 and the p=2 norm as the distance
triplet_loss = TripletMarginLoss(margin=1.0, p=2)

# One forward pass through the GAT per feature set, in anchor / positive /
# negative order.
# NOTE(review): gat_model, graph and the three *_features tensors are not defined
# in the visible cells; they must exist before this cell can run.
anchor_embeddings, positive_embeddings, negative_embeddings = (
    gat_model(graph, features)
    for features in (anchor_features, positive_features, negative_features)
)

# Compute the loss
loss = triplet_loss(anchor_embeddings, positive_embeddings, negative_embeddings)


In [None]:
import torch
import dgl