In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import dgl
from dgl.nn import GATv2Conv

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
# Use the CUDA device when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Root of the project data. Defaults to the original cluster location; set the
# MSC_PROJECT_DATA environment variable to run the notebook on another machine
# without editing this cell.
DATA_HOME = os.environ.get("MSC_PROJECT_DATA", "/lyceum/jhk1c21/msc_project/data")
# graph/v14 holds the node table; its "filtered" sub-directory holds the
# embeddings, ids, edges and similarity edges loaded in the next cell.
V14_PATH = os.path.join(DATA_HOME, "graph", "v14")
FILTERED_PATH = os.path.join(V14_PATH, "filtered")

In [5]:
# Load the data
# Tabular inputs: the node table (indexed by its 'id' column) and the similarity edges.
nodes = pd.read_csv(
    os.path.join(V14_PATH, "nodes_v14.csv"),
    index_col='id',
)
similarity = pd.read_csv(
    os.path.join(FILTERED_PATH, "similarity_edges.csv"),
)

# Embedding matrices, loaded in the order title / abstract / keywords / domains.
titles, abstracts, keywords, domains = (
    np.load(os.path.join(FILTERED_PATH, file_name))
    for file_name in (
        'title_embedding.npy',
        'abstract_embedding.npy',
        'keywords_embedding.npy',
        'domains_embedding.npy',
    )
)

# Node ids and the edge list (two columns: source id, destination id).
ids, edges = (
    np.load(os.path.join(FILTERED_PATH, file_name))
    for file_name in ("filtered_id.npy", 'filtered_edge.npy')
)

In [6]:
# Edge list as a two-column frame: column 0 of `edges` is the source, column 1 the destination.
df = pd.DataFrame({'src': edges[:, 0], 'dst': edges[:, 1]})

# convert id from str to numbers
# Each original id is mapped to its position in `ids`; int_to_id is the reverse lookup.
id_to_int = dict(zip(ids, range(len(ids))))
int_to_id = {position: paper_id for paper_id, position in id_to_int.items()}

# __getitem__ keeps the strict behaviour: an edge endpoint missing from `ids` raises KeyError.
df['src'] = df['src'].map(id_to_int.__getitem__)
df['dst'] = df['dst'].map(id_to_int.__getitem__)

In [7]:
# One float tensor per embedding matrix.
tensor_title, tensor_abstract, tensor_keywords, tensor_domain = (
    torch.FloatTensor(embedding)
    for embedding in (titles, abstracts, keywords, domains)
)

# Column-wise concatenation in the order [title | abstract | keywords | domains];
# similarity_score later slices the columns apart again in that same order.
node_features = np.concatenate((titles, abstracts, keywords, domains), axis=1)
tensor_node_features = torch.FloatTensor(node_features)

In [8]:
# Build the graph from the integer-encoded (src, dst) edge list.
# num_nodes is passed explicitly: otherwise DGL sizes the graph from the largest
# node id that appears in an edge, so nodes at the end of the id range that have
# no edges would be missing and the feature assignment below would fail on a
# row-count mismatch.
citation_network = dgl.graph(
    (torch.as_tensor(df['src'].to_numpy()), torch.as_tensor(df['dst'].to_numpy())),
    num_nodes=tensor_node_features.shape[0],
)
# One feature row per node, in the same order as the integer node ids.
citation_network.ndata['features'] = tensor_node_features

In [12]:
def similarity_score(pair, linked_pair, features, weights=(0.25, 0.2, 0.2, 0.35, 0.1), block_dim=300):
    """Score node pairs with a weighted mix of content similarity and link presence.

    ``features`` is read as four consecutive column blocks of ``block_dim``
    columns each, in the order title, abstract, keywords, domains. For every
    pair the score is::

        w1 * cos(title) + w2 * cos(abstract) + w3 * cos(keywords)
        + w4 * (1 - cos(domains)) + w5 * linked

    where ``linked`` is 1.0 when the pair appears as a row of ``linked_pair``
    and 0.0 otherwise. Note that the domain term rewards *dissimilar* domains.

    Args:
        pair: integer tensor of shape (P, 2); each row holds two row indices
            into ``features``.
        linked_pair: tensor of shape (L, 2) listing the pairs that count as
            linked. May be empty (L == 0) and may be float or integer typed.
        features: 2-D tensor with at least ``4 * block_dim`` columns.
        weights: the five weights (w1..w5). The defaults are the original
            hard-coded values; they sum to 1.1, not 1.
        block_dim: number of columns per embedding block (originally 300).

    Returns:
        1-D float tensor with one score per row of ``pair``.
    """
    w1, w2, w3, w4, w5 = weights
    first, second = pair[:, 0], pair[:, 1]

    def block_cosine(block_index):
        # Cosine similarity between the two nodes of each pair within one column block.
        block = features[:, block_index * block_dim:(block_index + 1) * block_dim]
        return F.cosine_similarity(block[first], block[second])

    titles_similarity = block_cosine(0)
    abstracts_similarity = block_cosine(1)
    keywords_similarity = block_cosine(2)
    domains_dissimilarity = 1 - block_cosine(3)

    if linked_pair.shape[0] == 0:
        # Create the zero term on the same device as the other terms so the sum
        # below does not mix devices when the features live on a GPU.
        weighted_link_similarity = torch.zeros(
            (pair.shape[0],), dtype=torch.float32, device=titles_similarity.device
        )
    else:
        # Compare ids as integers in pair's dtype: a float32 comparison stops
        # being exact for ids above 2**24.
        linked = linked_pair.to(device=pair.device, dtype=pair.dtype)
        # (L, P) matrix: entry [l, p] is True when linked row l equals sampled row p.
        overlap_mask = (linked[:, None, :] == pair[None, :, :]).all(dim=2)
        overlap_mask_1d = overlap_mask.any(dim=0)
        weighted_link_similarity = overlap_mask_1d.float().to(titles_similarity.device)

    return (
        w1 * titles_similarity
        + w2 * abstracts_similarity
        + w3 * keywords_similarity
        + w4 * domains_dissimilarity
        + w5 * weighted_link_similarity
    )


In [10]:
def generate_positive_negative_pairs(graph, node_features, high_threshold = 0.6, low_threshold = 0.4, n_samples=10_000):
    """Sample random node pairs and split them into positives and negatives by score.

    ``n_samples`` (src, dst) pairs are drawn uniformly with replacement from the
    nodes of ``graph`` (a node may be paired with itself). Each pair is scored
    with ``similarity_score``; pairs scoring strictly above ``high_threshold``
    are returned as positives, pairs scoring strictly below ``low_threshold``
    as negatives, and everything in between is discarded.

    Args:
        graph: DGL graph whose edges define which sampled pairs count as linked.
        node_features: feature tensor passed through to ``similarity_score``.
        high_threshold: lower score bound (exclusive) for a positive pair.
        low_threshold: upper score bound (exclusive) for a negative pair.
        n_samples: number of random pairs to draw.

    Returns:
        ``(positive_pairs, negative_pairs)``, two integer tensors of shape (K, 2).
    """
    n_nodes = graph.number_of_nodes()
    random_pair = torch.randint(0, n_nodes, (n_samples, 2))

    # Ask the graph which sampled (src, dst) pairs are existing edges. This
    # replaces re-indexing the notebook-global edge DataFrame on every call and
    # makes the function depend only on its arguments.
    # .bool() because the mask's dtype is not guaranteed to be boolean here.
    is_linked = graph.has_edges_between(random_pair[:, 0], random_pair[:, 1]).bool()
    linked_random_pair = random_pair[is_linked]

    scores = similarity_score(random_pair, linked_random_pair, node_features)
    positive_pairs = random_pair[scores > high_threshold]
    negative_pairs = random_pair[scores < low_threshold]

    return positive_pairs, negative_pairs

In [11]:
class GAT(nn.Module):
    """Two stacked GATv2Conv layers producing one embedding vector per node.

    Args:
        in_dim: size of the input node features.
        hidden_dim: per-head output size of the first layer.
        out_dim: output size of the second (single-head) layer.
        num_heads: number of attention heads in the first layer.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, num_heads):
        super().__init__()
        # allow_zero_in_degree=True: nodes without incoming edges are accepted
        # instead of making the layer raise.
        self.layer1 = GATv2Conv(in_dim, hidden_dim, num_heads=num_heads, allow_zero_in_degree=True)
        # The second layer consumes the concatenated heads of the first one.
        self.layer2 = GATv2Conv(hidden_dim * num_heads, out_dim, num_heads=1, allow_zero_in_degree=True)

    def forward(self, graph, features):
        """Return the node embeddings for ``graph`` given its input ``features``."""
        # Merge the per-head outputs into one vector per node before the activation.
        multi_head = self.layer1(graph, features)
        hidden = F.elu(multi_head.flatten(1))
        # Drop the singleton head axis left by the single-head second layer.
        return self.layer2(graph, hidden).squeeze(1)

In [12]:
class ContrastiveLoss(nn.Module):
    """Margin-based contrastive loss over pairs of embeddings.

    Label convention (as used by the formula below): ``label == 0`` marks a
    pair that should be close (penalised by squared distance) and
    ``label == 1`` marks a pair that should be at least ``margin`` apart
    (penalised by ``max(margin - distance, 0) ** 2``).

    Args:
        margin: minimum distance wanted between the two members of a
            ``label == 1`` pair.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the mean contrastive loss over all pairs.

        Args:
            output1: embeddings of the first pair members, shape (N, D).
            output2: embeddings of the second pair members, same shape.
            label: one value per pair, shaped (N,) or (N, 1), or a single
                value applied to every pair.

        Returns:
            Scalar tensor with the mean loss.
        """
        euclidean_distance = F.pairwise_distance(output1, output2)
        # Bring the label onto the distance's device/dtype so a CPU-built label
        # works with GPU embeddings.
        label = torch.as_tensor(label, dtype=euclidean_distance.dtype, device=euclidean_distance.device)
        # Line the label up one-to-one with the distances. Without this, an
        # (N,) label against (N, 1) distances broadcasts to an (N, N) matrix and
        # the mean is taken over every label/distance combination.
        if label.numel() == euclidean_distance.numel():
            label = label.reshape(euclidean_distance.shape)

        pull_term = (1 - label) * torch.pow(euclidean_distance, 2)
        push_term = label * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2)
        loss_contrastive = torch.mean(pull_term + push_term)
        return loss_contrastive

In [13]:
# in_dim=1200 matches the four 300-column feature blocks that similarity_score
# slices out of the node features.
model = GAT(
    in_dim=1200,
    hidden_dim=100,
    out_dim=50,
    num_heads=4,
)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [295]:
N_EPOCHS = 5  # Adjust the number of epochs
# The criterion holds no per-epoch state, so it is built once outside the loop.
criterion = ContrastiveLoss()

for epoch in range(N_EPOCHS):
    model.train() # train starts

    optimizer.zero_grad()

    # Full-graph forward pass: one embedding per node.
    output = model(citation_network, citation_network.ndata['features'])
    positive_pairs, negative_pairs = generate_positive_negative_pairs(citation_network, tensor_node_features, n_samples=10000)

    pairs = torch.cat([positive_pairs, negative_pairs], dim=0)
    if pairs.shape[0] == 0:
        # No pair passed either threshold: there is nothing to average and no
        # graph to back-propagate through, so skip the update for this epoch.
        print(f'Epoch [{epoch+1}/{N_EPOCHS}], no pairs selected; skipping update')
        continue

    # Label 0 for positive pairs, 1 for negative pairs. Shape (N, 1) so that each
    # label is matched with exactly one pair distance.
    labels = torch.cat([
        torch.zeros(positive_pairs.shape[0], 1),
        torch.ones(negative_pairs.shape[0], 1),
    ], dim=0).to(output.device)

    # One batched call; the criterion's mean over all pairs equals the previous
    # "sum of per-pair losses divided by the number of pairs".
    loss = criterion(output[pairs[:, 0]], output[pairs[:, 1]], labels)

    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{N_EPOCHS}], Loss: {loss.item()}')


torch.Size([162207, 50])


: 

In [24]:
# Draw five independent batches of candidate pairs; each round rebinds the two
# names, so only the last batch is still available afterwards.
for i in range(5):
    positive_pairs, negative_pairs = generate_positive_negative_pairs(
        citation_network,
        tensor_node_features,
        n_samples=10000,
    )

torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000])
torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000])
torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000])
torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000])
torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000]) torch.Size([10000])


In [48]:
# Explore the score distribution over repeated random samples to help choose
# the positive/negative thresholds.
graph = citation_network
n_samples = 10000
high_threshold = 0.5
low_threshold = 0.3

# Named mins/maxs rather than min/max so the Python builtins stay usable in
# every later cell of the session.
means, mins, maxs = [], [], []
# NOTE: num_3 counts scores < 0.4 and num_25 counts scores < 0.35; the names do
# not match the cut-offs actually applied below.
num_5, num_55, num_3, num_25 = [], [], [], []

# Loop-invariant work, done once: node count and the (src, dst)-indexed edge table.
n_nodes = graph.number_of_nodes()
dfs = df.set_index(['src', 'dst'])

for _ in range(10):
    random_pair = torch.randint(0, n_nodes, (n_samples, 2))

    src = random_pair[:, 0].numpy()
    dst = random_pair[:, 1].numpy()

    # Rows of the edge table whose (src, dst) also occurs among the sampled pairs.
    linked_random_pair = dfs[dfs.index.isin(list(zip(src, dst)))].reset_index()[['src', 'dst']].to_numpy()
    linked_random_pair = torch.FloatTensor(linked_random_pair)

    scores = similarity_score(random_pair, linked_random_pair, tensor_node_features)
    positive_pairs = random_pair[scores > high_threshold]
    negative_pairs = random_pair[scores < low_threshold]

    # Per-round summary statistics; column 0 is the single score column.
    des = pd.DataFrame(scores).describe()
    means.append(des.loc['mean', 0])
    mins.append(des.loc['min', 0])
    maxs.append(des.loc['max', 0])

    num_5.append(random_pair[scores > 0.5].shape[0])
    num_55.append(random_pair[scores > 0.55].shape[0])
    num_3.append(random_pair[scores < 0.4].shape[0])
    num_25.append(random_pair[scores < 0.35].shape[0])

# Averages over the ten rounds: mean/min/max score, then the pair counts per cut-off.
print(np.mean(means), np.mean(mins), np.mean(maxs))
print(np.mean(num_5), np.mean(num_55), np.mean(num_3), np.mean(num_25))

0.41740751564502715 0.18778504878282548 0.6931466341018677
1049.5 231.7 3971.4 1540.3


In [50]:
# Summary statistics of `scores` from the most recent sampling round that built `des`.
des

Unnamed: 0,0
count,10000.0
mean,0.416557
std,0.065432
min,0.192632
25%,0.372048
50%,0.416801
75%,0.461001
max,0.689285


In [15]:
# Display the most recently selected positive and negative pairs.
positive_pairs, negative_pairs

(tensor([[ 5580, 78617]]),
 tensor([[ 80614,  75933],
         [ 66861, 116188],
         [ 72866,  53177]]))

In [66]:
# Manual check of the link-overlap logic: a hand-written list of "linked" pairs
# is matched against the current `random_pair` sample. A row only registers as
# an overlap when the sample happens to contain that exact (src, dst) pair.
linked_random_pair = torch.FloatTensor([[71616, 131606], [85942, 158828], [1,2], [2,3], [2,3]])

# Kept under its own name: rebinding `domains` here would replace the NumPy
# embedding array loaded at the top of the notebook with a tensor slice, and
# re-running the earlier feature-building cell would then pick up the wrong object.
domain_block = tensor_node_features[:, 900:1200]
domains_dissimilarity = 1 - F.cosine_similarity(domain_block[random_pair[:,0]], domain_block[random_pair[:,1]])

# (n_linked, n_sampled) match matrix, reduced to one flag per sampled pair.
overlap_mask = (linked_random_pair[:, None, :] == random_pair[None, :, :]).all(dim=2)
overlap_mask_1d = overlap_mask.any(dim=0)
weighted_link_similarity = overlap_mask_1d.float()

print(overlap_mask.shape)
print(overlap_mask_1d)
print(weighted_link_similarity.shape)

# Linked pairs should be exactly 1.0 higher in the first printout than in the second.
print(domains_dissimilarity + weighted_link_similarity)
print(domains_dissimilarity)

torch.Size([5, 10000])
tensor([ True,  True, False,  ..., False, False, False])
torch.Size([10000])
tensor([1.2752, 1.2189, 0.4533,  ..., 0.2656, 0.5886, 0.3431])
tensor([0.2752, 0.2189, 0.4533,  ..., 0.2656, 0.5886, 0.3431])


In [62]:
# Rebuild the (linked x sampled) match matrix, then show for each sampled pair
# whether any linked pair is equal to it.
overlap_mask = (linked_random_pair.unsqueeze(1) == random_pair.unsqueeze(0)).all(dim=-1)
overlap_mask.any(dim=0)

tensor([False, False, False,  ..., False, False, False])

In [55]:
# Shape check only. Written as a bare expression so the notebook displays the
# result and `overlap_mask` keeps holding the boolean mask instead of being
# rebound to a torch.Size.
(linked_random_pair[:, None, :] == random_pair[None, :, :]).all(dim=-1).shape

torch.Size([3, 10000])

In [64]:
# Inspect the currently sampled candidate pairs (one [src, dst] row per pair).
random_pair

tensor([[ 71616, 131606],
        [ 85942, 158828],
        [ 94244, 121809],
        ...,
        [ 87891, 153233],
        [ 30173, 111150],
        [131268, 154469]])

In [12]:
# Shape of the edge array loaded from filtered_edge.npy: one row per edge, two columns (src, dst).
edges.shape

(1273175, 2)