In [1]:
import json

# Read the paper nodes (a mapping keyed by paper id) from the JSON export.
paper_nodes_path = '/data/jx4237data/TKG/new_TKG/TKG_new_BAI/paper_nodes.json'
with open(paper_nodes_path, 'r') as paper_nodes_file:
    paper_nodes = json.load(paper_nodes_file)

# Number of paper nodes that were loaded.
len(paper_nodes)

1792034

In [2]:
# Keep only the papers whose 'is_core' feature equals 1.
core_paper_nodes = {
    paper_id: paper_node
    for paper_id, paper_node in paper_nodes.items()
    if paper_node['features']['is_core'] == 1
}

In [3]:
# Ids of the core papers, as a set for fast membership tests.
pmids = set(core_paper_nodes)

In [4]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this file if it comes from a trusted source.
s2_results_path = '/data/jx4237data/TKG/new_TKG/Bridge2AI_s2_paper_results.pkl'
# Use a context manager so the file handle is closed once loading finishes.
with open(s2_results_path, 'rb') as s2_results_file:
    papers = pickle.load(s2_results_file)

In [None]:
# Map each core paper's PubMed id to its 'fieldsOfStudy' entry.
corepaper_fields = {}
for paper in papers:
    external_ids = paper['externalIds']
    if 'PubMed' not in external_ids:
        continue
    pubmed_id = external_ids['PubMed']
    if pubmed_id in pmids:
        corepaper_fields[pubmed_id] = paper['fieldsOfStudy']
# Order in which the candidate categories are tried; the first one present wins.
preference_order = ['Others', 'Computer Science', 'Biology', 'Medicine']

# Function to assign a single label based on the preference order
def assign_single_label(labels):
    """Collapse a paper's fields of study into exactly one category.

    Every label outside 'Computer Science', 'Biology' and 'Medicine' is
    treated as 'Others'; the result is the first entry of the module-level
    ``preference_order`` that occurs among the converted labels.

    Args:
        labels: iterable of field-of-study names, or None / empty when the
            paper has no recorded fields.

    Returns:
        One of 'Others', 'Computer Science', 'Biology', 'Medicine'.
        Papers with no fields at all are labelled 'Others' so that every
        paper receives a usable label.
    """
    # A missing (None) or empty field list would otherwise raise TypeError or
    # yield None, which has no entry in the downstream label lookup.
    if not labels:
        return 'Others'
    # Define the categories
    categories = ('Computer Science', 'Biology', 'Medicine')
    # Convert labels to the 4 categories
    converted_labels = {label if label in categories else 'Others' for label in labels}
    # Apply the preference order
    for pref in preference_order:
        if pref in converted_labels:
            return pref
    # Only reachable if preference_order was edited to omit a category.
    return 'Others'

# Assign single labels to each paper
single_labels = {}
for paper_id, labels in corepaper_fields.items():
    single_labels[paper_id] = assign_single_label(labels)
single_labels

In [6]:
# Re-index the labelled papers with consecutive integers (0, 1, 2, ...) in the
# iteration order of single_labels, keeping lookup tables in both directions.
new_acm_paper_id_to_old_map = dict(enumerate(single_labels))
old_paper_id_to_new_acm_map = {
    old_paper_id: new_paper_id
    for new_paper_id, old_paper_id in new_acm_paper_id_to_old_map.items()
}

# Core paper nodes re-keyed by their new integer id.
new_acm_core_paper_nodes = {}
for old_paper_id, new_paper_id in old_paper_id_to_new_acm_map.items():
    new_acm_core_paper_nodes[new_paper_id] = core_paper_nodes[old_paper_id]

In [9]:
# Collect every author id that appears on at least one selected paper.
# dict.fromkeys de-duplicates while keeping first-seen order. Iterating a set
# of strings instead gives an order that can change between kernel restarts
# (string hash randomisation), which would make the integer author ids
# assigned from this list differ from run to run.
new_author_list = list(dict.fromkeys(
    old_author
    for paper_node in new_acm_core_paper_nodes.values()
    for old_author in paper_node['neighbors']['author']
))
# Kept for any later code that expects the set form.
new_author_set = set(new_author_list)

In [10]:
# Two-way lookup between original author ids and consecutive integer ids,
# numbered by position in new_author_list.
new_acm_author_id_to_old_map = dict(enumerate(new_author_list))
old_author_id_to_new_acm_map = {
    old_author_id: new_author_id
    for new_author_id, old_author_id in new_acm_author_id_to_old_map.items()
}

In [11]:
# Parallel lists describing author<->paper edges: entry i links
# author_write_paper_author[i] with author_write_paper_paper[i].
author_write_paper_author = []
author_write_paper_paper = []
for new_paper_id, paper_node in new_acm_core_paper_nodes.items():
    mapped_authors = [
        old_author_id_to_new_acm_map[old_author_id]
        for old_author_id in paper_node['neighbors']['author'].keys()
    ]
    author_write_paper_paper.extend([new_paper_id] * len(mapped_authors))
    author_write_paper_author.extend(mapped_authors)

In [15]:
# Distinct venues of the selected papers, in first-seen order.
# dict.fromkeys is used instead of list(set(...)) so that the integer venue ids
# below come out the same on every run; set iteration order for strings can
# change between kernel restarts.
venue_list = list(dict.fromkeys(
    paper_node['features']['Venue']
    for paper_node in new_acm_core_paper_nodes.values()
))
# Kept for any later code that expects the set form.
venue_set = set(venue_list)

# Two-way lookup between the original venue value and its new integer id.
new_acm_venue_id_to_old_map = dict(enumerate(venue_list))
old_venue_id_to_new_acm_map = {
    old_venue_id: new_venue_id
    for new_venue_id, old_venue_id in new_acm_venue_id_to_old_map.items()
}

# Parallel lists describing paper<->venue edges (one venue per paper).
paper_venue_paper_paper = []
paper_venue_paper_venue = []
for new_paper_id, paper_node in new_acm_core_paper_nodes.items():
    paper_venue_paper_paper.append(new_paper_id)
    paper_venue_paper_venue.append(old_venue_id_to_new_acm_map[paper_node['features']['Venue']])

In [19]:
import torch

# Edge endpoints as tensors; each relation is stored in both directions.
# Venues are exposed under the node type name 'subject'.
author_endpoints = torch.tensor(author_write_paper_author)
authored_paper_endpoints = torch.tensor(author_write_paper_paper)
venue_paper_endpoints = torch.tensor(paper_venue_paper_paper)
venue_endpoints = torch.tensor(paper_venue_paper_venue)

data_dict = {
    ('author', 'author-paper', 'paper'): (author_endpoints, authored_paper_endpoints),
    ('paper', 'paper-author', 'author'): (authored_paper_endpoints, author_endpoints),
    ('paper', 'paper-subject', 'subject'): (venue_paper_endpoints, venue_endpoints),
    ('subject', 'subject-paper', 'paper'): (venue_endpoints, venue_paper_endpoints),
}

In [21]:
import dgl

# Build the heterogeneous graph from the (source type, edge type, destination type)
# -> (source ids, destination ids) mapping assembled above.
g = dgl.heterograph(data_dict=data_dict)

  from .autonotebook import tqdm as notebook_tqdm


Graph(num_nodes={'author': 35750, 'paper': 8107, 'subject': 1703},
      num_edges={('author', 'author-paper', 'paper'): 79558, ('paper', 'paper-author', 'author'): 79558, ('paper', 'paper-subject', 'subject'): 8107, ('subject', 'subject-paper', 'paper'): 8107},
      metagraph=[('author', 'paper', 'author-paper'), ('paper', 'author', 'paper-author'), ('paper', 'subject', 'paper-subject'), ('subject', 'paper', 'subject-paper')])

In [23]:
import torch
import dgl
from torch.optim import SparseAdam
from torch.utils.data import DataLoader
from dgl.nn.pytorch import MetaPath2Vec

# Edge types followed by each walk: author -> paper -> subject -> paper -> author.
author_subject_metapath = ['author-paper', 'paper-subject', 'subject-paper', 'paper-author']
model = MetaPath2Vec(
    g,
    metapath=author_subject_metapath,
    window_size=1,
    emb_dim=128,
)

100%|██████████| 35750/35750 [00:12<00:00, 2784.56it/s]


In [24]:
# Batches of 128 shuffled author node ids; model.sample collates each batch
# into the inputs consumed by the model in the training loop.
author_node_ids = torch.arange(g.num_nodes('author'))
dataloader = DataLoader(
    author_node_ids,
    batch_size=128,
    shuffle=True,
    collate_fn=model.sample,
)
optimizer = SparseAdam(model.parameters(), lr=0.025)

In [25]:
from tqdm import tqdm

# One pass over all author batches: clear old gradients, compute the loss for
# the sampled positive/negative pairs, back-propagate, and update the parameters.
for pos_u, pos_v, neg_v in tqdm(dataloader):
    optimizer.zero_grad()
    loss = model(pos_u, pos_v, neg_v)
    loss.backward()
    optimizer.step()

100%|██████████| 280/280 [00:03<00:00, 70.95it/s] 


In [33]:
# Positions of the author nodes inside the model's embedding table.
author_nids = torch.LongTensor(model.local_to_global_nid['author'])
# Look the embeddings up without recording gradients: the result is stored as a
# plain node feature, so it should not stay attached to the autograd graph of
# the trainable embedding table.
with torch.no_grad():
    g.nodes['author'].data['h'] = model.node_embed(author_nids)

In [41]:
# Show the name, shape and dtype of every feature stored on 'paper' nodes.
g.node_attr_schemes('paper')

{'h': Scheme(shape=(128,), dtype=torch.float32),
 'label': Scheme(shape=(), dtype=torch.int64)}

In [38]:
# Integer class id for each of the four paper categories.
field2label = {'Biology':0, 'Computer Science':1, 'Medicine':2, 'Others':3}

# Write each label at the row given by the paper's new integer id, instead of
# relying on single_labels iterating in the same order the ids were assigned.
paper_labels = torch.full((len(single_labels),), -1, dtype=torch.long)
for old_paper_id, field in single_labels.items():
    # A paper with no usable field (None) falls into the 'Others' class.
    label_name = field if field is not None else 'Others'
    paper_labels[old_paper_id_to_new_acm_map[old_paper_id]] = field2label[label_name]

# Every row must have been filled; a leftover -1 means the id map is not a
# one-to-one numbering of the labelled papers.
assert bool((paper_labels >= 0).all()), 'some paper ids received no label'
g.nodes['paper'].data['label'] = paper_labels

In [42]:
# Display the integer class labels stored on the 'paper' nodes.
g.nodes['paper'].data['label']

tensor([0, 0, 1,  ..., 1, 0, 3])

In [43]:
# Display the graph summary (node and edge counts per type).
g

Graph(num_nodes={'author': 35750, 'paper': 8107, 'subject': 1703},
      num_edges={('author', 'author-paper', 'paper'): 79558, ('paper', 'paper-author', 'author'): 79558, ('paper', 'paper-subject', 'subject'): 8107, ('subject', 'subject-paper', 'paper'): 8107},
      metagraph=[('author', 'paper', 'author-paper'), ('paper', 'author', 'paper-author'), ('paper', 'subject', 'paper-subject'), ('subject', 'paper', 'subject-paper')])

In [47]:
from dgl.data.utils import save_graphs

# Write the heterograph to the graph.bin file inside OpenHGNN's acm4NSHE
# dataset directory.
acm4nshe_graph_path = '/data/jx4237data/TKG/new_TKG/TKG_new_BAI/630deliverable/OpenHGNN/openhgnn/dataset/acm4NSHE/graph.bin'
save_graphs(acm4nshe_graph_path, g)

In [45]:
from dgl.data.utils import load_graphs

# Read graph.bin back; load_graphs yields the list of graphs plus a second
# value that is discarded here.
# NOTE(review): the recorded output of the following cell shows node counts
# that differ from the graph saved above, and the execution counts
# (In [45] before In [47]) suggest this cell last ran before the save -
# re-run to confirm the file now holds the new graph.
acm4nshe_graph_path = '/data/jx4237data/TKG/new_TKG/TKG_new_BAI/630deliverable/OpenHGNN/openhgnn/dataset/acm4NSHE/graph.bin'
gns, _ = load_graphs(acm4nshe_graph_path)

In [46]:
# Display the list of graphs read from graph.bin.
gns

[Graph(num_nodes={'author': 7167, 'paper': 4019, 'subject': 60},
       num_edges={('author', 'author-paper', 'paper'): 13407, ('paper', 'paper-author', 'author'): 13407, ('paper', 'paper-subject', 'subject'): 4019, ('subject', 'subject-paper', 'paper'): 4019},
       metagraph=[('author', 'paper', 'author-paper'), ('paper', 'author', 'paper-author'), ('paper', 'subject', 'paper-subject'), ('subject', 'paper', 'subject-paper')])]

In [49]:
# Display the original-author-id -> new-integer-id map.
# NOTE(review): this prints the whole map; consider showing only a few entries.
old_author_id_to_new_acm_map

{'5244129': 0,
 '6863808': 1,
 '13273237': 2,
 '7982041': 3,
 '11777553': 4,
 '4202204': 5,
 '1464745': 6,
 '6144340': 7,
 '7409249': 8,
 '4768779': 9,
 '2103144': 10,
 '2109877': 11,
 '12610835': 12,
 '6268892': 13,
 '4847238': 14,
 '4992677': 15,
 '8344857': 16,
 '13472018': 17,
 '9943736': 18,
 '3236271': 19,
 '10205646': 20,
 '6295437': 21,
 '639434': 22,
 '9755292': 23,
 '10301711': 24,
 '2981244': 25,
 '9689012': 26,
 '635129': 27,
 '6184213': 28,
 '6450774': 29,
 '6596196': 30,
 '13312502': 31,
 '8145625': 32,
 '9116555': 33,
 '100000044': 34,
 '12448270': 35,
 '9492817': 36,
 '1771099': 37,
 '13003808': 38,
 '9077147': 39,
 '3447011': 40,
 '6525733': 41,
 '3069515': 42,
 '10942885': 43,
 '2542118': 44,
 '2994936': 45,
 '586895': 46,
 '5103936': 47,
 '3799230': 48,
 '9392617': 49,
 '5708095': 50,
 '12474113': 51,
 '6644829': 52,
 '4704568': 53,
 '8987230': 54,
 '9327336': 55,
 '1129782': 56,
 '3460102': 57,
 '4838556': 58,
 '13079892': 59,
 '3456054': 60,
 '8945593': 61,
 '23979

In [50]:
import json

# Read the author nodes from the JSON export.
author_nodes_path = '/data/jx4237data/TKG/new_TKG/TKG_new_BAI/author_nodes.json'
with open(author_nodes_path, 'r') as author_nodes_file:
    author_nodes = json.load(author_nodes_file)

# Number the authors 0, 1, 2, ... in the iteration order of author_nodes and
# keep lookup tables in both directions.
new_author_id_to_old_map = dict(enumerate(author_nodes))
old_author_id_to_new_map = {
    old_author_id: new_author_id
    for new_author_id, old_author_id in new_author_id_to_old_map.items()
}

In [76]:
# Index given to authors that have no id in old_author_id_to_new_acm_map:
# one past the largest id handed out there (ids run 0..len-1). Derived from
# the map rather than hard-coded so it stays correct if the author set changes.
missing_author_index = len(old_author_id_to_new_acm_map)

# For every author position in author_nodes (k), record the matching id in
# old_author_id_to_new_acm_map, looked up through the original author id (v).
old_tensor2new_tensor = {}
for k, v in new_author_id_to_old_map.items():
    if v in old_author_id_to_new_acm_map:
        old_tensor2new_tensor[k] = old_author_id_to_new_acm_map[v]
    else:
        # Report authors without a counterpart and park them on the extra index.
        print(k, v)
        old_tensor2new_tensor[k] = missing_author_index

10171 13993952


In [None]:
# Invert old_tensor2new_tensor, visiting entries in ascending order of their
# value so the resulting dict is ordered by its keys. If several entries share
# a value, the one visited last is kept.
new_tensor2oldtensor = {
    new_index: old_index
    for old_index, new_index in sorted(
        old_tensor2new_tensor.items(), key=lambda pair: pair[1]
    )
}
new_tensor2oldtensor

In [80]:
import pickle

# Persist the index map. The context manager closes the file, which guarantees
# the pickled bytes are flushed to disk; a bare open() handle is never closed.
index_map_path = '/data/jx4237data/TKG/new_TKG/TKG_new_BAI/630deliverable/new_tensor2oldtensor.pkl'
with open(index_map_path, 'wb') as index_map_file:
    pickle.dump(new_tensor2oldtensor, index_map_file)

# SPECTER AGG: aggregate paper embeddings into author embeddings

In [82]:
from dgl.data.utils import load_graphs
# NOTE: g is rebound here to the raw return value of load_graphs, not to a graph.
# The cells below reach the first graph through g[0][0].
g = load_graphs('tkg_dgl.bin')

In [83]:
import json
import torch

# Read the serialised paper embeddings (paper id -> list of floats).
with open('paper_ebd.json', 'r') as json_file:
    paper_ebd_serializable = json.load(json_file)

# Convert lists back to tensors
paper_ebd = {}
for key, value in paper_ebd_serializable.items():
    paper_ebd[int(key)] = torch.tensor(value)

# Order the embeddings by ascending paper id, then split into an id tensor and
# one stacked embedding matrix whose rows follow that order.
papers_by_id = sorted(paper_ebd.items(), key=lambda item: item[0])
ids, embeddings = zip(*papers_by_id)
ids = torch.tensor(ids)
embeddings = torch.stack(embeddings)

In [84]:
# Display (number of papers, embedding width) of the stacked matrix.
embeddings.shape

torch.Size([12065, 768])

In [86]:
# Store the stacked embeddings as the 'abstract' feature of the paper nodes.
# NOTE(review): this assumes row i of `embeddings` (papers sorted by id) belongs
# to paper node i of the graph - confirm against how tkg_dgl.bin was numbered.
g[0][0].nodes['paper'].data['abstract'] = embeddings

In [89]:
# Display the summary of the first loaded graph.
g[0][0]

Graph(num_nodes={'author': 35751, 'paper': 12065, 'venue': 2244},
      num_edges={('author', 'author-paper', 'paper'): 95996, ('paper', 'cite', 'paper'): 28561, ('paper', 'paper-author', 'author'): 95996, ('paper', 'paper-venue', 'venue'): 12065, ('venue', 'venue-paper', 'paper'): 12065},
      metagraph=[('author', 'paper', 'author-paper'), ('paper', 'paper', 'cite'), ('paper', 'author', 'paper-author'), ('paper', 'venue', 'paper-venue'), ('venue', 'paper', 'venue-paper')])

In [90]:
# Show the name, shape and dtype of every feature stored on 'paper' nodes.
g[0][0].node_attr_schemes('paper')

{'reference': Scheme(shape=(768,), dtype=torch.float32),
 'venue': Scheme(shape=(768,), dtype=torch.float32),
 'author': Scheme(shape=(768,), dtype=torch.float32),
 'abstract': Scheme(shape=(768,), dtype=torch.float32),
 'dw_embedding': Scheme(shape=(768,), dtype=torch.float32)}

In [91]:
from tqdm import tqdm

# g still holds the raw load_graphs result the first time this cell runs.
# Unwrap it only in that case, so re-running the cell does not try to index
# into the graph itself.
if isinstance(g, (tuple, list)):
    g = g[0][0]
paper_embeddings = g.nodes['paper'].data['abstract']

# Initialize an empty tensor to store the aggregated embeddings for authors
author_embeddings = torch.zeros((g.num_nodes('author'), paper_embeddings.shape[1]))

# Perform the aggregation
authors_without_papers = []
for author in tqdm(range(g.num_nodes('author'))):
    # Get all papers written by this author
    paper_ids = g.successors(author, etype='author-paper')
    if len(paper_ids) > 0:
        # Aggregate paper embeddings (e.g., mean)
        author_embeddings[author] = paper_embeddings[paper_ids].mean(dim=0)
    else:
        # Not expected to happen; remember the author so it can be reported once.
        authors_without_papers.append(author)

# Authors without papers keep the all-zero row; report them in a single line
# instead of one message per author.
if authors_without_papers:
    print(
        f'{len(authors_without_papers)} author(s) have no papers and keep a zero embedding; '
        f'first ids: {authors_without_papers[:10]}'
    )

# Store the aggregated embeddings as an attribute for the author nodes
g.nodes['author'].data['agg'] = author_embeddings

100%|██████████| 35751/35751 [00:05<00:00, 6858.45it/s]


In [98]:
# Show the name, shape and dtype of every feature stored on 'author' nodes.
g.node_attr_schemes('author')

{'abstract': Scheme(shape=(2304,), dtype=torch.float32),
 'dw_embedding': Scheme(shape=(768,), dtype=torch.float32),
 'agg': Scheme(shape=(768,), dtype=torch.float32)}

In [None]:
# Remove the 'dw_embedding' feature from the author nodes; the None default
# keeps this from failing when the feature is already gone.
g.nodes['author'].data.pop('dw_embedding', None)

In [102]:
# Pickle the aggregated author embedding matrix to disk.
author_embeddings_path = '/data/jx4237data/TKG/new_TKG/TKG_new_BAI/630deliverable/test/author_embeddings.pkl'
with open(author_embeddings_path, 'wb') as author_embeddings_file:
    pickle.dump(author_embeddings, author_embeddings_file)