In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch

import dgl
from ogb.nodeproppred import DglNodePropPredDataset

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [42]:
# Instantiate the OGB 'ogbn-arxiv' node-property-prediction dataset through its DGL wrapper
dataset = DglNodePropPredDataset(name='ogbn-arxiv')

# Ask the dataset for its train/valid/test partition of the nodes
split_idx = dataset.get_idx_split()

# Pull the three index sets out of the split mapping in one go
train_nodeidx, valid_nodeidx, test_nodeidx = (
    split_idx['train'],
    split_idx['valid'],
    split_idx['test'],
)

# Show each index set next to its caption
for caption, node_ids in [
    ("Training node indices:", train_nodeidx),
    ("Validation node indices:", valid_nodeidx),
    ("Testing node indices:", test_nodeidx),
]:
    print(caption, node_ids)

Training node indices: tensor([     0,      1,      2,  ..., 169145, 169148, 169251])
Validation node indices: tensor([   349,    357,    366,  ..., 169185, 169261, 169296])
Testing node indices: tensor([   346,    398,    451,  ..., 169340, 169341, 169342])


In [73]:
# From the dataset folder load the node index to paper id table.
# header=0 combined with names= replaces the file's first line with the column names given here.
node_idx = pd.read_csv('data/nodeidx2paperid.csv.gz', sep=',', compression='gzip', header=0, names=['node_idx', 'paper_id'])
node_idx['paper_id'] = node_idx['paper_id'].astype(np.int64)

# From https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv download the mapping from MAG paper IDs into the raw texts of titles and abstracts
# https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz
#
# Only two column names are supplied; the merge below relies on pandas turning the remaining
# leading column (presumably the MAG paper id - verify against the file) into the index.
# NOTE(review): header=0 discards the first line of the file - confirm it really is a header row.
idx_title = pd.read_csv('data/titleabs.tsv.gz', sep='\t', compression='gzip', header=0, names=['title', 'abstract'], dtype={'title': str, 'abstract': str})

# reset_index() exposes the paper id as a regular column named 'index' so it can be used as the join key
df = node_idx.merge(idx_title.reset_index(), left_on='paper_id', right_on='index', how='inner')

# An inner merge silently drops nodes without a matching paper id and duplicates nodes whose
# paper id occurs more than once. Everything derived from `df` is positional (row i is
# presumably treated as node i downstream), so fail loudly if the alignment is broken.
assert len(df) == len(node_idx), (
    f"merge produced {len(df)} rows for {len(node_idx)} nodes; "
    "titles are missing or duplicated for some paper ids"
)
assert np.array_equal(df['node_idx'].to_numpy(), node_idx['node_idx'].to_numpy()), (
    "merged rows are not in the original node order"
)

Unnamed: 0.1,Unnamed: 0,paper_id,title
0,0,9657784,evasion attacks against machine learning at te...
1,1,39886162,how hard is computing parity with noisy commun...
2,2,116214155,on the absence of the rip in real world applic...
3,3,121432379,a promise theory perspective on data networks
4,4,231147053,analysis of asymptotically optimal sampling ba...


In [74]:
# Plain Python list of the titles, in the row order of `df`, ready to be handed to the encoder
input_texts = df['title'].tolist()

In [76]:
# Encode every title with the multilingual E5-small sentence-embedding model.
# A Hugging Face model id has the form "<namespace>/<name>", so the "intfloat" namespace
# must appear exactly once.
# NOTE(review): the E5 model card asks for a "query: " / "passage: " prefix on every input
# text; the titles are still passed through unprefixed here - confirm whether that is intended.
model = SentenceTransformer('intfloat/multilingual-e5-small')
batch_size = 64  # Adjust based on available memory
embeddings = []  # filled with one normalised embedding per title, in input order

for i in tqdm(range(0, len(input_texts), batch_size), desc="Encoding Progress"):
    batch = input_texts[i:i + batch_size]
    # encode() applies its own batch_size (library default) unless told otherwise; forward ours
    # so each slice is processed as a single batch and the knob above really controls memory use.
    # The inner progress bar is switched off because the outer tqdm loop already reports progress.
    batch_embeddings = model.encode(
        batch,
        batch_size=batch_size,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    embeddings.extend(batch_embeddings)


Encoding Progress: 100%|██████████| 2646/2646 [1:05:07<00:00,  1.48s/it]


In [80]:
# Convert to PyTorch tensor.
# Build one NumPy array first: calling torch.tensor() directly on a Python list of NumPy
# arrays is a slow path that PyTorch warns about. np.asarray() also accepts an input that
# is already a 2-D array, so this works whether `embeddings` is a list of vectors or a matrix.
tensor = torch.tensor(np.asarray(embeddings))

print(tensor)

tensor([[ 0.0495, -0.0107, -0.0149,  ...,  0.0617, -0.0043,  0.0846],
        [ 0.0454, -0.0288, -0.0727,  ...,  0.0269, -0.0055,  0.0495],
        [ 0.0330, -0.0021, -0.0371,  ...,  0.0565, -0.0254,  0.0652],
        ...,
        [ 0.0843, -0.0149, -0.0347,  ...,  0.0852, -0.0027,  0.0334],
        [ 0.0477, -0.0211, -0.0180,  ...,  0.0446,  0.0080,  0.0627],
        [ 0.0439, -0.0191, -0.0517,  ...,  0.0535,  0.0260,  0.0957]])


  tensor = torch.tensor(embeddings)


In [81]:
# Display the dimensions of the embedding tensor
tensor.size()

torch.Size([169343, 384])

In [82]:
import pickle

# Persist the embedding tensor to disk as a pickle file in the working directory.
# NOTE(review): the file name spells "arvix" (not "arxiv"); it is kept unchanged here so that
# anything already reading this path keeps working.
with open("arvix_e5_small_embedding.pkl", "wb") as out_file:
    pickle.dump(tensor, out_file)
