In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import dgl
from dgl.nn import GATConv

In [4]:
import os
import pandas as pd
import numpy as np

In [5]:
import os
import pandas as pd
import numpy as np
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [6]:
# Dataset locations: the v14 graph directory and its "filtered" subdirectory
# are both derived from DATA_HOME.
DATA_HOME = "/lyceum/jhk1c21/msc_project/data"
_GRAPH_VERSION_PARTS = ("graph", "v14")
V14_PATH = os.path.join(DATA_HOME, *_GRAPH_VERSION_PARTS)
FILTERED_PATH = os.path.join(V14_PATH, "filtered")

In [7]:
# Load the data
def _filtered_file(name):
    """Return the full path of a file inside the filtered-data directory."""
    return os.path.join(FILTERED_PATH, name)

# Node table (indexed by its 'id' column) and the per-edge similarity table.
nodes = pd.read_csv(os.path.join(V14_PATH, "nodes_v14.csv"), index_col='id')
similarity = pd.read_csv(_filtered_file("similarity_edges.csv"))

# One embedding array per text field.
titles = np.load(_filtered_file('title_embedding.npy'))
abstracts = np.load(_filtered_file('abstract_embedding.npy'))
keywords = np.load(_filtered_file('keywords_embedding.npy'))
domains = np.load(_filtered_file('domains_embedding.npy'))

# Original node ids and the edge list expressed in those ids.
ids = np.load(_filtered_file("filtered_id.npy"))
edges = np.load(_filtered_file('filtered_edge.npy'))

In [8]:
# convert id from str to numbers: each original id maps to its position in `ids`
id_to_int = dict(zip(ids, range(len(ids))))
int_to_id = {index: original_id for original_id, index in id_to_int.items()}

# Edge list in integer node indices (edge column 0 -> 'src', column 1 -> 'dst').
# An id missing from `ids` raises KeyError.
df = pd.DataFrame({
    'src': [id_to_int[node_id] for node_id in edges[:, 0]],
    'dst': [id_to_int[node_id] for node_id in edges[:, 1]],
})

In [64]:
# Compute similarity for titles, abstracts, keywords, and domains
def compute_each_similarity(node1, node2):
    """Return the four per-field cosine scores for a pair of nodes.

    Args:
        node1, node2: mappings with 'title', 'abstract', 'keywords' and
            'domain' entries; each entry is expected to be a single
            embedding vector (it is wrapped in a list to form a one-row
            matrix for ``cosine_similarity``).

    Returns:
        Tuple ``(title_similarity, abstract_similarity, keyword_similarity,
        domain_dissimilarity)``. The first three are cosine similarities;
        the last is ``1 - cosine similarity`` of the domain vectors.
    """
    def _field_cosine(field):
        # cosine_similarity returns a 1x1 matrix for two one-row inputs.
        return cosine_similarity([node1[field]], [node2[field]])[0][0]

    title_similarity = _field_cosine('title')
    abstract_similarity = _field_cosine('abstract')
    keyword_similarity = _field_cosine('keywords')
    domain_dissimilarity = 1 - _field_cosine('domain')

    return title_similarity, abstract_similarity, keyword_similarity, domain_dissimilarity

def compute_similarity(node1, node2):
    """Return the unweighted sum of the four per-field scores of a node pair.

    The scores come from ``compute_each_similarity``: title, abstract and
    keyword similarity plus domain dissimilarity.
    """
    scores = compute_each_similarity(node1, node2)
    title_similarity, abstract_similarity, keyword_similarity, domain_dissimilarity = scores

    total = title_similarity + abstract_similarity
    total = total + keyword_similarity
    total = total + domain_dissimilarity
    return total

# Compute similarity for titles, abstracts, keywords, and domains
def compute_weighted_similarity(node1, node2):
    """Return the weighted sum of the four per-field scores of a node pair.

    Fixed weights: title 0.25, abstract 0.15, keyword 0.2 and domain
    dissimilarity 0.4. The scores come from ``compute_each_similarity``.
    """
    weights = (0.25, 0.15, 0.2, 0.4)
    w_title, w_abstract, w_keyword, w_domain = weights

    scores = compute_each_similarity(node1, node2)
    title_similarity, abstract_similarity, keyword_similarity, domain_dissimilarity = scores

    return (w_title * title_similarity
            + w_abstract * abstract_similarity
            + w_keyword * keyword_similarity
            + w_domain * domain_dissimilarity)

# Compute similarity for titles, abstracts, keywords, and domains
def compute_df_similarity(sim_df, weight=None):
    """Combine the per-field similarity columns of ``sim_df`` into one score.

    Args:
        sim_df: table with 'title', 'abstract', 'keyword' and 'domain'
            columns holding per-pair similarities.
        weight: optional 4-sequence ``(title, abstract, keyword, domain)``
            of weights; when None, every field is weighted 0.25.

    Returns:
        ``w1*title + w2*abstract + w3*keyword + w4*(1 - domain)``; the domain
        column enters as a dissimilarity.
    """
    equal_weights = (0.25, 0.25, 0.25, 0.25)
    w_title, w_abstract, w_keyword, w_domain = equal_weights if weight is None else weight

    domain_dissimilarity = 1 - sim_df['domain']
    return (w_title * sim_df['title']
            + w_abstract * sim_df['abstract']
            + w_keyword * sim_df['keyword']
            + w_domain * domain_dissimilarity)

In [65]:
def create_pairs(node1, node2):
    """Return the four per-field cosine scores for a pair of nodes, via torch.

    ``F.cosine_similarity`` only accepts tensors, so each field is converted
    to a float32 one-row tensor before the comparison (passing Python lists,
    as the previous version did, raises a TypeError).

    Args:
        node1, node2: mappings with 'title', 'abstract', 'keywords' and
            'domain' entries, each a single embedding vector (list, numpy
            array or tensor).

    Returns:
        Tuple of Python floats ``(title_similarity, abstract_similarity,
        keyword_similarity, domain_dissimilarity)``; the last value is
        ``1 - cosine similarity`` of the domain vectors.
    """
    def _field_cosine(field):
        left = torch.as_tensor(node1[field], dtype=torch.float32).reshape(1, -1)
        right = torch.as_tensor(node2[field], dtype=torch.float32).reshape(1, -1)
        # One row on each side -> a single similarity value.
        return F.cosine_similarity(left, right, dim=1).item()

    title_similarity = _field_cosine('title')
    abstract_similarity = _field_cosine('abstract')
    keyword_similarity = _field_cosine('keywords')
    domain_dissimilarity = 1 - _field_cosine('domain')

    return title_similarity, abstract_similarity, keyword_similarity, domain_dissimilarity

In [66]:
# Weighted score: title 0.25, abstract 0.15, keyword 0.2, domain 0.4.
weighted_field_weights = (0.25, 0.15, 0.2, 0.4)
similarity['weighted_similarity'] = compute_df_similarity(similarity, weight=weighted_field_weights)
# No weights passed: compute_df_similarity falls back to its defaults.
similarity['similarity'] = compute_df_similarity(similarity, weight=None)

# Plain Python list of the unweighted scores, in row order.
similarity_list = [*similarity['similarity']]

In [67]:
similarity

Unnamed: 0,src,dst,title,abstract,keyword,domain,weighted_similarity,similarity
0,53e99beab7602d9702497a80,53e9a4c0b7602d9702ddf482,0.470632,0.124602,0.634944,0.445128,0.485286,0.446262
1,53e9a1d5b7602d9702ad2aa6,558aec6284ae84d265c0707c,0.363818,0.738057,0.825033,0.803241,0.445373,0.530917
2,53e9abc9b7602d970357a86b,557d23366feeaa8086da70ff,0.549633,0.203248,0.594433,0.879736,0.334888,0.366895
3,53e9b708b7602d970429d764,53e9b5d4b7602d97041251f3,0.600002,0.239508,0.842274,0.688702,0.478901,0.498270
4,53e9bc1bb7602d9704883a9c,53e9ba39b7602d9704648483,0.568734,0.682352,0.832233,0.852411,0.470019,0.557727
...,...,...,...,...,...,...,...,...
1273170,53e9ad47b7602d970372c2bd,53e9ab6fb7602d970350e269,0.684507,0.129349,0.513802,0.881588,0.340654,0.361518
1273171,53e9abf1b7602d97035afe55,53e9aa79b7602d97033ef136,0.524307,0.402008,0.199441,0.722187,0.342391,0.350892
1273172,5a260c2e17c44a4ba8a24152,53e99b31b7602d97023ce813,0.499875,0.309166,0.822491,0.854233,0.394148,0.444325
1273173,599c77fa601a182cd2590dbc,53e9b903b7602d97044e594e,0.525179,0.415119,0.722028,0.705364,0.455823,0.489240


In [9]:
# Float32 tensors built from each embedding array.
tensor_title, tensor_abstract, tensor_keywords, tensor_domain = (
    torch.FloatTensor(embedding) for embedding in (titles, abstracts, keywords, domains)
)

# Join the four fields side by side along axis 1 (result is a numpy array).
per_field_tensors = [tensor_title, tensor_abstract, tensor_keywords, tensor_domain]
node_features = np.concatenate(per_field_tensors, axis=1)

In [10]:
print(tensor_domain.view(tensor_domain.shape[0], -1))
print(tensor_domain.shape)

tensor([[-0.0068,  0.0102,  0.0072,  ...,  0.0114,  0.0097,  0.0002],
        [-0.0193,  0.0117, -0.0003,  ...,  0.0161,  0.0130, -0.0064],
        [-0.0158,  0.0258,  0.0063,  ...,  0.0371, -0.0089, -0.0235],
        ...,
        [-0.0039,  0.0172,  0.0035,  ...,  0.0067,  0.0091,  0.0054],
        [-0.0116,  0.0055,  0.0058,  ...,  0.0183,  0.0149, -0.0040],
        [-0.0041,  0.0014,  0.0039,  ...,  0.0138,  0.0155, -0.0062]])
torch.Size([162207, 300])


In [11]:
# Create a DGL graph
# Pass num_nodes explicitly: without it dgl.graph sizes the graph from the
# largest node id that appears in an edge, so trailing nodes without any edge
# would be dropped and the feature assignment below would fail on a row-count
# mismatch. (Assumes node_features has one row per node id — it is built from
# the per-field embedding arrays.)
citation_network = dgl.graph((df['src'], df['dst']), num_nodes=node_features.shape[0])

# Concatenated title/abstract/keywords/domain features, one row per node.
citation_network.ndata['features'] = torch.FloatTensor(node_features)
# citation_network.ndata['title'] = torch.FloatTensor(titles)
# citation_network.ndata['abstract'] = torch.FloatTensor(abstracts)
# citation_network.ndata['keywords'] = torch.FloatTensor(keywords)
# citation_network.ndata['domain'] = torch.FloatTensor(domains)

In [None]:
# Store the values of similarity['similarity'] (via similarity_list) as the
# per-edge 'weight' feature.
# NOTE(review): assumes the rows of `similarity` are in the same order as the
# edges used to build the graph — confirm.
citation_network.edata['weight'] = torch.FloatTensor(similarity_list)

In [None]:
citation_network

Graph(num_nodes=162207, num_edges=1273175,
      ndata_schemes={'title': Scheme(shape=(300,), dtype=torch.float32), 'abstract': Scheme(shape=(300,), dtype=torch.float32), 'keywords': Scheme(shape=(300,), dtype=torch.float32), 'domain': Scheme(shape=(300,), dtype=torch.float32)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float32)})

In [None]:
# MODEL STARTS
# GAT Layer
class GATLayer(nn.Module):
    """Single GATConv layer that returns one embedding vector per node.

    Args:
        in_dim: size of the input node features.
        out_dim: size of each attention head's output.
        num_heads: number of attention heads (default 1).

    Nodes without incoming edges are allowed (``allow_zero_in_degree=True``).
    """
    def __init__(self, in_dim, out_dim, num_heads=1):
        super(GATLayer, self).__init__()
        self.gatconv = GATConv(in_dim, out_dim, num_heads, allow_zero_in_degree=True)

    def forward(self, g, h):
        """Apply the attention layer to graph ``g`` with node features ``h``.

        Returns a tensor with the head axis removed: (num_nodes, out_dim).
        """
        # GATConv output carries a head axis: (num_nodes, num_heads, out_dim).
        h = self.gatconv(g, h)
        # Average over the heads. For a single head this equals the previous
        # squeeze(1); for several heads squeeze(1) was a no-op and leaked the
        # head axis into the next layer.
        return h.mean(dim=1)

# GAT Model
class GATModel(nn.Module):
    """Two-layer graph attention network producing node embeddings.

    Args:
        in_dim: size of the input node features.
        hidden_dim: output size of the first layer.
        out_dim: size of the final node embeddings.
        num_heads: attention heads of the first layer. Defaults to 1 so the
            three-argument call ``GATModel(in_dim, hidden_dim, out_dim)``
            used in this notebook is valid; the second layer always uses
            GATLayer's default head count.
    """
    def __init__(self, in_dim, hidden_dim, out_dim, num_heads=1):
        super(GATModel, self).__init__()
        self.layer1 = GATLayer(in_dim, hidden_dim, num_heads)
        self.layer2 = GATLayer(hidden_dim, out_dim)

    def forward(self, g, h):
        """Return the embeddings of all nodes of graph ``g`` given features ``h``."""
        # ReLU only between the layers; the final embeddings stay unbounded.
        h = F.relu(self.layer1(g, h))
        h = self.layer2(g, h)
        return h
    
# Contrastive Loss
class ContrastiveLoss(nn.Module):
    """Margin-based contrastive loss over pairs of embeddings.

    ``label`` is 0 for pairs that should be close and 1 for pairs that should
    be at least ``margin`` apart.
    """
    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the mean contrastive loss of the given embedding pairs."""
        distance = F.pairwise_distance(output1, output2)

        # label == 0: penalise the squared distance (pull the pair together).
        pull_term = (1 - label) * distance.pow(2)
        # label == 1: penalise only pairs closer than the margin (push apart).
        margin_gap = torch.clamp(self.margin - distance, min=0.0)
        push_term = label * margin_gap.pow(2)

        return (pull_term + push_term).mean()

In [None]:
# One label per row of `similarity`: 1 where the combined score is at most
# 0.5, 0 everywhere else.
labels = np.zeros(similarity.shape[0])
low_similarity_rows = similarity.index[similarity['similarity'] <= 0.5]
labels[list(low_similarity_rows)] = 1

In [None]:
# One (src, dst) tuple per row of the edge frame `df`.
pairs = list(zip(df['src'], df['dst']))

In [None]:
# Convert the pair list and the label array to tensors.
# NOTE(review): LongTensor needs integer pairs, so `df` must hold the
# integer-mapped node indices rather than the original string ids — confirm
# the cell order if this raises.
pairs = torch.LongTensor(pairs)
labels = torch.FloatTensor(labels)

ValueError: too many dimensions 'str'

In [None]:
# Label each edge: 1 when its combined similarity is <= 0.5, else 0.
# ContrastiveLoss applies its margin (push-apart) term to label 1.
# The comparison is positional, so it does not rely on `similarity` having a
# default 0..n-1 index (the previous version used index labels as positions).
labels = (similarity['similarity'] <= 0.5).to_numpy().astype(np.float64)
pairs = list(zip(df['src'], df['dst']))

# Convert pairs and labels to tensors
pairs = torch.LongTensor(pairs)
labels = torch.FloatTensor(labels)

# torch.save does not create missing directories, so make sure it exists.
result_dir = os.path.join(V14_PATH, "result")
os.makedirs(result_dir, exist_ok=True)
torch.save(pairs, os.path.join(result_dir, "pairs.pt"))
torch.save(labels, os.path.join(result_dir, "labels.pt"))

In [None]:
# pairs = torch.load(os.path.join(V14_PATH, "result", "pairs.pt"))
# labels = torch.load(os.path.join(V14_PATH, "result", "labels.pt"))

# Node input features: the title embeddings. They are taken from `titles`
# directly because the graph only stores the concatenated 'features' field
# (ndata['title'] is never set), and built once because they do not change
# between epochs.
h = torch.FloatTensor(titles)

# Initialize the model and loss
# INPUT: (Feature Dim, Hidden Dim, Output Dim, Attention Heads)
# The feature dimension follows the input tensor; num_heads is a required
# GATModel argument and is passed explicitly.
model = GATModel(h.shape[1], 128, 64, num_heads=1)
loss_fn = ContrastiveLoss()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop (full-graph: every epoch embeds all nodes and scores all pairs)
for epoch in range(50):
    model.train()

    # Forward pass
    output = model(citation_network, h)

    # Create output1 and output2 based on pairs
    output1 = output[pairs[:, 0]]
    output2 = output[pairs[:, 1]]

    # Compute contrastive loss
    loss = loss_fn(output1, output2, labels)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 0.4656655192375183
Epoch 1, Loss: 0.4526820778846741
Epoch 2, Loss: 0.43969765305519104
Epoch 3, Loss: 0.4270704686641693
Epoch 4, Loss: 0.41504502296447754
Epoch 5, Loss: 0.4038435220718384
Epoch 6, Loss: 0.39373674988746643
Epoch 7, Loss: 0.3849829137325287
Epoch 8, Loss: 0.37778109312057495
Epoch 9, Loss: 0.37221482396125793
Epoch 10, Loss: 0.36811766028404236
Epoch 11, Loss: 0.36504703760147095
Epoch 12, Loss: 0.36251315474510193
Epoch 13, Loss: 0.360110878944397
Epoch 14, Loss: 0.3575690984725952
Epoch 15, Loss: 0.3547358214855194
Epoch 16, Loss: 0.35156479477882385
Epoch 17, Loss: 0.3480900228023529
Epoch 18, Loss: 0.3443974256515503
Epoch 19, Loss: 0.34060609340667725
Epoch 20, Loss: 0.3368392884731293
Epoch 21, Loss: 0.3332182466983795
Epoch 22, Loss: 0.32984036207199097
Epoch 23, Loss: 0.3267745077610016
Epoch 24, Loss: 0.32406291365623474
Epoch 25, Loss: 0.3217199444770813
Epoch 26, Loss: 0.3197348415851593
Epoch 27, Loss: 0.31809258460998535
Epoch 28, Loss: 0.

KeyboardInterrupt: 

In [None]:
print("TEST START")

model.eval()


In [None]:
similarity[['weighted_similarity', 'similarity']].describe()

Unnamed: 0,weighted_similarity,similarity
count,1273175.0,1273175.0
mean,0.4407042,0.4759605
std,0.07233094,0.08558922
min,0.1165398,0.1015561
25%,0.3916829,0.417896
50%,0.4395954,0.4736374
75%,0.4883887,0.5318742
max,0.7972142,0.8458895


In [12]:
# Per-field scores for a sample of pairs: each of the first 100 items against
# items i..999 (95,050 pairs). The per-pair print was removed because it
# flooded the cell output with one line per pair.
tt, aa, kk, dd = [], [], [], []
for i in range(100):
    # node1 depends only on i, so build it once per outer iteration.
    node1 = {'title': titles[i], 'abstract': abstracts[i], 'keywords': keywords[i] ,'domain': domains[i]}
    for j in range(i, 1000):
        node2 = {'title': titles[j], 'abstract': abstracts[j], 'keywords': keywords[j] ,'domain': domains[j]}
        t, a, k, d = compute_each_similarity(node1, node2)
        tt.append(t)
        aa.append(a)
        kk.append(k)
        dd.append(d)


# NOTE: this rebinds `df`, which earlier in the notebook holds the src/dst
# edge list. The 'domain' column holds the domain *dissimilarity* returned by
# compute_each_similarity.
df = pd.DataFrame({'title': tt, 'abstract': aa, 'keyword': kk, 'domain': dd})

df

NameError: name 'compute_each_similarity' is not defined

In [13]:
similarity

Unnamed: 0,src,dst,title,abstract,keyword,domain
0,53e99beab7602d9702497a80,53e9a4c0b7602d9702ddf482,0.470632,0.124602,0.634944,0.445128
1,53e9a1d5b7602d9702ad2aa6,558aec6284ae84d265c0707c,0.363818,0.738057,0.825033,0.803241
2,53e9abc9b7602d970357a86b,557d23366feeaa8086da70ff,0.549633,0.203248,0.594433,0.879736
3,53e9b708b7602d970429d764,53e9b5d4b7602d97041251f3,0.600002,0.239508,0.842274,0.688702
4,53e9bc1bb7602d9704883a9c,53e9ba39b7602d9704648483,0.568734,0.682352,0.832233,0.852411
...,...,...,...,...,...,...
1273170,53e9ad47b7602d970372c2bd,53e9ab6fb7602d970350e269,0.684507,0.129349,0.513802,0.881588
1273171,53e9abf1b7602d97035afe55,53e9aa79b7602d97033ef136,0.524307,0.402008,0.199441,0.722187
1273172,5a260c2e17c44a4ba8a24152,53e99b31b7602d97023ce813,0.499875,0.309166,0.822491,0.854233
1273173,599c77fa601a182cd2590dbc,53e9b903b7602d97044e594e,0.525179,0.415119,0.722028,0.705364


In [None]:
# Memo of already computed pairs: (i, j) -> 4-tuple of per-field scores.
cache = {}
def compute_similarities():
    """Lazily yield ``((i, j), scores)`` for every pair 0 <= i <= j < 1000.

    ``scores`` is the 4-tuple returned by ``compute_each_similarity``. Results
    are stored in the module-level ``cache`` so a pair is only computed once,
    even if the generator is created again.
    """
    for i in range(1000):
        for j in range(i, 1000):
            key = (i, j)
            if key not in cache:
                node1 = {'title': titles[i], 'abstract': abstracts[i], 'keywords': keywords[i], 'domain': domains[i]}
                node2 = {'title': titles[j], 'abstract': abstracts[j], 'keywords': keywords[j], 'domain': domains[j]}
                scores = compute_each_similarity(node1, node2)
                cache[key] = tuple(scores)
            yield key, cache[key]
# compute_similarities() is a generator, so nothing is computed here yet.
similarities = compute_similarities()
# Building the DataFrame consumes the generator, i.e. this line performs all
# the pairwise computations. It also rebinds `df`.
df = pd.DataFrame(similarities)
df

KeyboardInterrupt: 

In [None]:
# Similarity of the first item to every item, one column per field.
# Only row 0 is used, so a one-row slice is compared against the full matrix
# instead of materialising the complete N x N similarity matrix per field.
# NOTE: this rebinds `df`.
df = pd.DataFrame({
    'titles': cosine_similarity(titles[:1], titles)[0],
    'domains': cosine_similarity(domains[:1], domains)[0],
    'keywords': cosine_similarity(keywords[:1], keywords)[0],
    'abstracts': cosine_similarity(abstracts[:1], abstracts)[0],
})
# `desciption` was a typo for `describe` and raised AttributeError.
df.describe()

In [None]:
df.describe()

NameError: name 'df' is not defined

In [None]:
# Float32 tensors of all title embeddings and of the first 100 abstract
# embeddings. `at` is not used in this cell.
tt = torch.FloatTensor(titles)
at = torch.FloatTensor(abstracts[:100])

# Row-wise cosine similarity with the same tensor on both sides, i.e. every
# row is compared with itself (one value per row, not a pairwise matrix).
res = F.cosine_similarity(tt, tt, dim=1)

In [11]:
tt = torch.FloatTensor(titles[:10000])
# Pairwise cosine similarity of the first 10000 titles: res[i, j] compares
# row i with row j. Rows are scaled to unit length and multiplied as a matrix;
# the previous broadcast form (unsqueeze(0) against unsqueeze(1)) needs an
# N x N x D intermediate, which does not fit in memory at this size.
tt_unit = F.normalize(tt, p=2, dim=1, eps=1e-8)
res = tt_unit @ tt_unit.T

: 

: 

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

res = cosine_similarity(titles, titles)
res

In [None]:
res.shape

(100, 100)

In [1]:
# Draw 100000 random (src, dst) node-index pairs.
n_nodes = citation_network.number_of_nodes()
random_pair = torch.randint(0, n_nodes, (100000, 2))

src, dst = (random_pair[:, column].numpy() for column in (0, 1))

# Keep the rows of `df` whose (src, dst) key is among the sampled pairs.
dfs = df.set_index(['src', 'dst'])
sampled_keys = list(zip(src, dst))
is_sampled = dfs.index.isin(sampled_keys)
random_pair_one = dfs[is_sampled].reset_index().to_numpy()

NameError: name 'citation_network' is not defined

In [84]:
def similarity_score(random_pair):
    """Return the per-field cosine similarity of each given node pair.

    The previous version passed both index columns to sklearn's
    ``cosine_similarity``, which compares every left row with every right row
    and returns an n x n matrix; only its diagonal is the per-pair score and
    the matrix is far too large for the pair counts used here. Similarities
    are now computed row by row.

    Args:
        random_pair: integer array or CPU tensor of shape (n_pairs, 2); both
            columns index rows of the module-level ``titles``, ``abstracts``,
            ``keywords`` and ``domains`` arrays.

    Returns:
        Tuple ``(titles_similarity, abstracts_similarity, keywords_similarity,
        domains_similarity)`` of 1-D arrays of length n_pairs; entry i is the
        cosine similarity of pair i (0 when either vector is all zeros).
    """
    pair_index = np.asarray(random_pair)
    left, right = pair_index[:, 0], pair_index[:, 1]

    def _rowwise_cosine(embedding):
        a, b = embedding[left], embedding[right]
        norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
        # A zero vector has a zero dot product, so dividing by 1 yields 0
        # instead of a division-by-zero NaN.
        safe_norms = np.where(norms == 0, 1.0, norms)
        return np.einsum('ij,ij->i', a, b) / safe_norms

    titles_similarity = _rowwise_cosine(titles)
    abstracts_similarity = _rowwise_cosine(abstracts)
    keywords_similarity = _rowwise_cosine(keywords)
    domains_similarity = _rowwise_cosine(domains)

    return titles_similarity, abstracts_similarity, keywords_similarity, domains_similarity

In [85]:
res = similarity_score(random_pair)

In [None]:
res[0]