In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import dgl
from dgl.nn import GATConv

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# if device == "cpu":
#     print("device CPU")
#     exit(0)

In [4]:
DATA_HOME = "/lyceum/jhk1c21/msc_project/data"
V14_PATH = os.path.join(DATA_HOME, "graph", "v14")
FILTERED_PATH = os.path.join(V14_PATH, "filtered")

In [6]:
nodes = pd.read_csv(os.path.join(V14_PATH, "nodes_v14.csv"), index_col='id')

In [7]:
# Load the data
titles = np.load(os.path.join(FILTERED_PATH, 'title_embedding.npy'))
abstracts = np.load(os.path.join(FILTERED_PATH, 'abstract_embedding.npy'))
keywords = np.load(os.path.join(FILTERED_PATH, 'keywords_embedding.npy'))
domains = np.load(os.path.join(FILTERED_PATH, 'domains_embedding.npy'))

ids = np.load(os.path.join(FILTERED_PATH, "filtered_id.npy"))
edges = np.load(os.path.join(FILTERED_PATH, 'filtered_edge.npy'))

In [8]:
df = pd.DataFrame()
df['src'] = edges[:, 0]
df['dst'] = edges[:, 1]

# convert id from str to numbers
id_to_int = {original_id: i for i, original_id in enumerate(ids)}
int_to_id = {i: original_id for original_id, i in id_to_int.items()}

df['src'] = df['src'].apply(lambda x: id_to_int[x])
df['dst'] = df['dst'].apply(lambda x: id_to_int[x])

In [28]:
# Compute similarity for titles, abstracts, keywords, and domains
def compute_similarity(node1, node2):
    title_similarity = cosine_similarity([node1['title']], [node2['title']])[0][0]
    abstract_similarity = cosine_similarity([node1['abstract']], [node2['abstract']])[0][0]
    keyword_similarity = cosine_similarity([node1['keywords']], [node2['keywords']])[0][0]
    domain_dissimilarity = 1 - cosine_similarity([node1['domain']], [node2['domain']])[0][0]

    return title_similarity, abstract_similarity, keyword_similarity, domain_dissimilarity


# Compute similarity for titles, abstracts, keywords, and domains
def compute_hetro_similarity(node1, node2):
    title_similarity, abstract_similarity, keyword_similarity, domain_dissimilarity = compute_similarity(node1, node2)
    w1, w2, w3, w4 = 0.25, 0.15, 0.2, 0.4

    # return title_similarity + abstract_similarity + keyword_similarity - domain_dissimilarity
    return w1*title_similarity + w2*abstract_similarity + w3*keyword_similarity - w4*domain_dissimilarity


# Compute similarity for titles, abstracts, keywords
def compute_homo_similarity(node1, node2):
    title_similarity, abstract_similarity, keyword_similarity, domain_dissimilarity = compute_similarity(node1, node2)
    w1, w2, w3 = 0.4, 0.3, 0.35

    return w1*title_similarity + w2*abstract_similarity + w3*keyword_similarity

def normalise(similarity, mean, std):
    normalised_title_similarity = (similarity['title'] - mean.loc['title']['mean']) / std.loc['title']['std']
    normalised_abstract_similarity = (similarity['abstract'] - mean.loc['abstract']['mean']) / std.loc['abstract']['std']
    normalised_keyword_similarity = (similarity['keyword'] - mean.loc['keyword']['mean']) / std.loc['keyword']['std']
    normalised_domain_similarity = (1-similarity['domain'] - mean.loc['domain']['mean']) / std.loc['domain']['std']
    
    return normalised_title_similarity, normalised_abstract_similarity, normalised_keyword_similarity, normalised_domain_similarity

In [36]:
# Initialize lists to hold pairs and labels
pairs = []
labels = []
weights = []

# Loop over edges in the graph to create pairs and labels
src_list, dst_list = list(df['src']), list(df['dst'])
for idx, (src, dst) in enumerate(zip(src_list, dst_list)):
    node1 = {'title': titles[src], 'abstract': abstracts[src], 'keywords': keywords[src], 'domain': domains[src]}
    node2 = {'title': titles[dst], 'abstract': abstracts[dst], 'keywords': keywords[dst], 'domain': domains[dst]}

    title_similarity, abstract_similarity, keyword_similarity, domain_dissimilarity = compute_similarity(node1, node2)
    hetro_similarity = compute_hetro_similarity(node1, node2)
    homo_similarity = compute_homo_similarity(node1, node2)

    # if similarity > 0.5:
    #     labels.append(0)
    # else:
    #     labels.append(1)

    # pairs.append((src, dst))
    # weights.append(similarity)
    # print(f"{nodes.loc[src]['title']}-{nodes.loc[dst]['title']}: {similarity}")
    
    if idx % 10000 == 0:
        print(idx)
    
    if domain_dissimilarity > 0.2:
        print(f"HETRO: [{hetro_similarity}]")
        print(f"HOMO: [{homo_similarity}]")
        print(f"[{title_similarity}, {abstract_similarity}, {keyword_similarity}, {1-domain_dissimilarity}]")
        print(f"SRC: {nodes.loc[int_to_id[src]]['fos']}\nDST: {nodes.loc[int_to_id[dst]]['fos']}\n")
        print(f"SRC: {nodes.loc[int_to_id[src]]['title']}\nDST: {nodes.loc[int_to_id[dst]]['title']}\n")

# Convert pairs and labels to tensors
pairs = torch.LongTensor(pairs)
labels = torch.FloatTensor(labels)
weights = torch.FloatTensor(weights)

0


10000
20000
30000


KeyboardInterrupt: 

Unnamed: 0,std
,std
title,0.13983675837516785
abstract,0.19557268917560577
keyword,0.17731449007987976
domain,0.0038095359192745616


In [29]:
mean_for_normalise = pd.read_csv(os.path.join(FILTERED_PATH, "mean.csv"), index_col=0)
std_for_normalise = pd.read_csv(os.path.join(FILTERED_PATH, "std.csv"), index_col=0)

# normalise only in terms of edges
src_list, dst_list = list(df['src']), list(df['dst'])
for idx, (src, dst) in enumerate(zip(src_list, dst_list)):
    node1 = {'title': titles[src], 'abstract': abstracts[src], 'keywords': keywords[src], 'domain': domains[src]}
    node2 = {'title': titles[dst], 'abstract': abstracts[dst], 'keywords': keywords[dst], 'domain': domains[dst]}
    similarity = {}
    similarity['title'], similarity['abstract'], similarity['keyword'], similarity['domain'] = compute_similarity(node1, node2)
    # domain_similarity = 1- domain_dissimilarity
    
    if idx % 10000 == 0:
        print(idx)

    
    normalised_title_similarity, normalised_abstract_similarity, normalised_keyword_similarity, normalised_domain_similarity = normalise(similarity, mean_for_normalise, std_for_normalise)
    
    
    print(f"[title] {normalised_title_similarity}")
    print(f"[abstract] {normalised_abstract_similarity}")
    print(f"[keyword] {normalised_keyword_similarity}")
    print(f"[domain] {normalised_domain_similarity}")

0
[title] 0.27802861080747127
[abstract] -1.1899947739602574
[keyword] -1.5645928937102074
[domain] 0.6681886071528401
[title] 0.8244791345873307
[abstract] -1.5190685687043657
[keyword] -2.618643972363589
[domain] -0.852462608238101
[title] -0.6346332581935379
[abstract] -1.0074776026596044
[keyword] 0.031240298859419
[domain] 0.5237275239991765
[title] -0.9079128662532241
[abstract] 0.30495172408643556
[keyword] 0.8951875955827595
[domain] 0.9234871551673888
[title] 0.318337697721489
[abstract] -0.6608990510307279
[keyword] -0.44569470629295305
[domain] -0.6067082236291339
[title] -0.1521454056280894
[abstract] -0.7379277834299875
[keyword] 0.8880248658458842
[domain] -0.6699343946487326
[title] -0.6224861433251291
[abstract] 0.8206340719962218
[keyword] 0.9327418400329297
[domain] 0.20440485031964456
[title] -0.9210418352552345
[abstract] 1.5502648030299608
[keyword] 1.1088116063920677
[domain] 1.2398996413166943
[title] -0.6131347659267846
[abstract] 0.6396498926715138
[keyword] 0.

KeyboardInterrupt: 

In [36]:
src = 6727
dst_list = list(map(int, citation_network.out_edges(src)[1]))
# dst_list = [ 10012, 12323, 3888]

print(f"from node 0: [{dst_list}]")

for dst in dst_list:
    node1 = {'title': titles[src], 'abstract': abstracts[src], 'keywords': keywords[src], 'domain': domains[src]}
    node2 = {'title': titles[dst], 'abstract': abstracts[dst], 'keywords': keywords[dst], 'domain': domains[dst]}

    similarity = {}
    similarity['title'], similarity['abstract'], similarity['keyword'], similarity['domain'] = compute_similarity(node1, node2)
    hetro_similarity = compute_hetro_similarity(node1, node2)
    homo_similarity = compute_homo_similarity(node1, node2)
    
    normalised_title_similarity, normalised_abstract_similarity, normalised_keyword_similarity, normalised_domain_similarity = normalise(similarity, mean_for_normalise, std_for_normalise)
    
    print(f"HOMO: [{homo_similarity}]")
    print(f"HETRO: [{hetro_similarity}]")
    print(f"[{similarity['title']}, {similarity['abstract']}, {similarity['keyword']}, {similarity['domain']}]")
    print(f"[{normalised_title_similarity}, {normalised_abstract_similarity}, {normalised_keyword_similarity}, {-normalised_domain_similarity}]")
    print(f"SRC: {nodes.loc[int_to_id[src]]['title']}\nDST: {nodes.loc[int_to_id[dst]]['title']}")
    print(f"SRC: {nodes.loc[int_to_id[src]]['keywords']}\nDST: {nodes.loc[int_to_id[dst]]['keywords']}")
    print(f"SRC: {nodes.loc[int_to_id[src]]['fos']}\nDST: {nodes.loc[int_to_id[dst]]['fos']}\n")


from node 0: [[131321, 17561, 92871, 62313, 16087, 57746, 129680, 142120, 131815, 142170]]
HOMO: [0.7681584894657135]
HETRO: [0.4387723237276077]
[0.676799476146698, 0.5793207883834839, 0.9246927499771118, 0.005660533905029297]
[0.7434858656268578, 0.27657491496351805, 1.4307861909374398, 0.06546028151555407]
SRC: Graph partitioning models for parallel computing
DST: Multilevel Algorithms for Multi-Constraint Graph Partitioning
SRC: ['parallel computing', 'hypergraph partitioning', 'graph partitioning', 'parallel computer', 'parallel processing', 'mathematical model']
DST: ['Graph partitioning', 'numerical simulations', 'parallel processing', 'Graph partitioning', 'numerical simulations', 'parallel processing']
SRC: ['Discrete mathematics', 'Strength of a graph', 'Graph power', 'Computer science', 'Graph labeling', 'Quartic graph', 'Parallel computing', 'Theoretical computer science', 'Null graph', 'Graph partition', 'Graph (abstract data type)', 'Voltage graph']
DST: ['Space partition

In [33]:
# Create a DGL graph
citation_network = dgl.graph( (df['src'], df['dst']) )

citation_network.ndata['title'] = torch.FloatTensor(titles)
citation_network.ndata['abstract'] = torch.FloatTensor(abstracts)
citation_network.ndata['keyword'] = torch.FloatTensor(keywords)
citation_network.ndata['domain'] = torch.FloatTensor(domains)

In [None]:
citation_network.edata['weight'] = torch.FloatTensor(weight)

In [49]:
citation_network.edges()
# src_list, dst_list = citation_network.edges()
# for src, dst in zip(src_list, dst_list):
#     print(src, dst)
    # break

(tensor([ 72456,  81263,  42961,  ...,  30699, 111024,  88782]),
 tensor([ 84909, 139661,  60423,  ..., 137791,  57453, 108899]))

In [62]:
# GAT Layer
class GATLayer(nn.Module):
    def __init__(self, in_dim, out_dim, num_heads=1):
        super(GATLayer, self).__init__()
        self.gatconv = GATConv(in_dim, out_dim, num_heads, allow_zero_in_degree=True)
        
    def forward(self, g, h):
        h = self.gatconv(g, h)
        return h.squeeze(1)

# GAT Model
class GATModel(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(GATModel, self).__init__()
        self.layer1 = GATLayer(in_dim, hidden_dim)
        self.layer2 = GATLayer(hidden_dim, out_dim)
        
    def forward(self, g, h):
        h = F.relu(self.layer1(g, h))
        h = self.layer2(g, h)
        return h

In [56]:
# Contrastive Loss
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

In [59]:
# Initialize lists to hold pairs and labels
pairs = []
labels = []

# Loop over edges in the graph to create pairs and labels
for u, v in zip(list(df['src']), list(df['des'])):
    node1 = {'title': titles[u], 'abstract': abstracts[u], 'keywords': keywords[u], 'domain': domains[u]}
    node2 = {'title': titles[v], 'abstract': abstracts[v], 'keywords': keywords[v], 'domain': domains[v]}
    
    similarity = compute_similarity(node1, node2)
    
    if similarity > 0.5:
        labels.append(0)
    else:
        labels.append(1)
        
    pairs.append((u, v))

# Convert pairs and labels to tensors
pairs = torch.LongTensor(pairs)
labels = torch.FloatTensor(labels)

In [68]:
torch.save(pairs, os.path.join(FILTERED_PATH, "gat", "pairs.pt"))
torch.save(labels, os.path.join(FILTERED_PATH, "gat", "labels.pt"))

In [63]:
# Initialize the model and loss
# INPUT: (Feature Dim, Hidden Dim, Output Dim)
model = GATModel(300, 128, 64)
loss_fn = ContrastiveLoss()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(50):
    model.train()
    
    # Forward pass
    h = torch.FloatTensor(citation_network.ndata['title'])
    output = model(citation_network, h)
    
    # Create output1 and output2 based on pairs
    output1 = output[pairs[:, 0]]
    output2 = output[pairs[:, 1]]
    
    # Compute contrastive loss
    loss = loss_fn(output1, output2, labels)
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 0.7327643036842346
Epoch 1, Loss: 0.7099007964134216
Epoch 2, Loss: 0.68633633852005
Epoch 3, Loss: 0.6625358462333679
Epoch 4, Loss: 0.6387461423873901
Epoch 5, Loss: 0.6151835918426514


KeyboardInterrupt: 

In [17]:
df = pd.read_csv(os.path.join(FILTERED_PATH, "filtered_nodes.csv"))

0         False
1         False
2         False
3         False
4         False
          ...  
148056    False
148057    False
148058    False
148059    False
148060    False
Name: fos, Length: 148061, dtype: bool

In [21]:
df[df['fos'].isin(['computer science'])]

Unnamed: 0,_id,title,keywords,fos,abstract


In [26]:
fos_list = list(map(eval, df['fos']))

In [38]:
eco = [ idx for idx, fos in enumerate(fos_list) if 'Economics' in fos ]
eco

[45,
 59,
 137,
 163,
 174,
 253,
 282,
 311,
 478,
 493,
 669,
 725,
 759,
 860,
 946,
 948,
 1166,
 1284,
 1294,
 1299,
 1326,
 1493,
 1622,
 1800,
 1869,
 1934,
 2211,
 2274,
 2316,
 2415,
 2439,
 2742,
 2932,
 3071,
 3198,
 3475,
 3783,
 3891,
 3905,
 3942,
 3952,
 4079,
 4192,
 4315,
 4389,
 4487,
 4539,
 4544,
 4620,
 4653,
 4685,
 4747,
 4820,
 4870,
 4909,
 4968,
 5098,
 5172,
 5489,
 5729,
 5733,
 5946,
 6090,
 6278,
 6302,
 6477,
 6585,
 6612,
 6673,
 6818,
 6862,
 7091,
 7258,
 7272,
 7379,
 7380,
 7395,
 7742,
 7791,
 7974,
 7980,
 8097,
 8107,
 8308,
 8366,
 8462,
 8602,
 8606,
 8619,
 8667,
 8911,
 9261,
 9333,
 9384,
 9403,
 9430,
 9526,
 9658,
 9738,
 9973,
 10182,
 10493,
 10877,
 10963,
 10974,
 11318,
 11365,
 11376,
 12010,
 12118,
 12238,
 12555,
 12570,
 12604,
 12611,
 12625,
 12632,
 12966,
 13085,
 13113,
 13258,
 13844,
 13878,
 14035,
 14260,
 14292,
 14384,
 14414,
 14479,
 14765,
 14795,
 15216,
 15369,
 15399,
 15587,
 15683,
 15745,
 15825,
 16029,
 16109

In [44]:
eco_df = df.iloc[eco, :]
eco_df

Unnamed: 0,_id,title,keywords,fos,abstract
45,53e997d1b7602d9701fc3662,Computerized loan origination systems: an indu...,"['electronic market', 'electronic coordination...","['Financial intermediary', 'Incomplete contrac...",Much has been written in recent years about th...
59,53e997d7b7602d9701fcbff2,An EPQ-based inventory model for exponentially...,"['partial trade credit', 'optimal replenishmen...","['Minimization problem', 'Economics', 'Economi...",The main purpose of this paper is to investiga...
137,53e997e3b7602d9701fd8bdf,A jump to default extended CEV model: an appli...,"['equity derivatives', 'default', 'implied vol...","['Econometrics', 'Credit derivative', 'Economi...",We consider the problem of developing a ßexibl...
163,53e997e4b7602d9701fda4d3,Production trade-offs and weight restrictions ...,['data envelope analysis'],"['Information system', 'Economics', 'Mathemati...",In this paper we suggest two equivalent ways i...
174,53e997e4b7602d9701fdb822,DEA Malmquist productivity measure: New insigh...,"['Data envelopment analysis', 'Efficiency', 'M...","['Isoquant', 'Efficiency', 'Econometrics', 'Ec...",Data envelopment analysis (DEA) measures the r...
...,...,...,...,...,...
147576,5c88eb284895d9cbc6a0bb16,"Privacy, economics, and price discrimination o...","['increasing ability', 'public agendum', 'priv...","['Internet privacy', 'Economics', 'Reservation...",The rapid erosion of privacy poses numerous pu...
147675,5c8b3aaa4895d9cbc6780378,Deep learning with long short-term memory netw...,"['Finance', 'Statistical arbitrage', 'LSTM', '...","['Statistical arbitrage', 'Econometrics', 'Eco...",•Application of long short-term memory network...
147877,5cc6fdb36558b90bfa00923c,Horizontal cooperation among freight carriers:...,"['logistics', 'communications technology', 'ma...","['Profit sharing', 'Economics', 'Benefice', 'O...","In modern transportation systems, the potentia..."
147911,5ccd92896558b90bfa49fd11,LIBOR and swap market models and measures,"['self-financing trading strategies', 'stochas...","['Econometrics', 'Economics', 'Financial econo...",. A self-contained theory is presented for p...


In [45]:
eco_df.to_csv('eco_papers.csv')