In [23]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sentence_transformers import util
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from langchain.embeddings import HuggingFaceEmbeddings
from datasets import Dataset
#import torch

# Maximum number of CSV rows (papers) to load.
PAPERNUM = 1_000_000

## Data

In [None]:
# Load data
# Read at most PAPERNUM rows of the paper table from the local CSV. Later cells
# read its 'id', 'title' and 'references' columns.
# NOTE(review): 'id' is cast to str in later cells; if the CSV ids are numeric,
# reading with dtype={'id': str} here may be cleaner — confirm against the data.
df = pd.read_csv('withRef.csv', nrows=PAPERNUM)
# Preview the first three rows as the cell output.
df.head(3)

In [None]:
# (rows, columns) of the loaded frame; the row count is capped by PAPERNUM.
df.shape

## Graph

In [None]:
# Initialize graph and add nodes
# Every paper becomes a node keyed by its id as a string.
G = nx.DiGraph()
G.add_nodes_from(df['id'].astype(str).tolist())


def _parse_references(raw):
    """Turn one raw 'references' cell into a clean list of reference-id strings.

    Args:
        raw: a ';'-separated string, a missing value (None / NaN), or an
            already-parsed list (which makes re-running this cell a no-op
            instead of stringifying the lists).

    Returns:
        List of non-empty, whitespace-stripped reference ids. Missing values
        give an empty list rather than the literal token 'nan'.
    """
    if isinstance(raw, (list, tuple)):
        tokens = [str(item) for item in raw]
    elif raw is None or (isinstance(raw, float) and raw != raw):
        # NaN is the only float that is not equal to itself.
        return []
    else:
        tokens = str(raw).split(';')
    # Strip each id individually ("a; b" -> ["a", "b"]) and drop empty pieces.
    return [tok.strip() for tok in tokens if tok.strip()]


df['references'] = df['references'].apply(_parse_references)

In [None]:
# Add edges
# One directed edge paper -> cited paper, kept only when the cited paper is itself a node.
# The source id is converted to str because the nodes were added as strings:
# add_edge() with a non-string id would create a second, distinct node for the
# same paper. Zipping the two columns also avoids building a Series per row.
for src_id, refs in tqdm(zip(df['id'].astype(str), df['references']), total=len(df), desc="Building graph"):
    for ref in refs:
        if ref in G:
            G.add_edge(src_id, ref)

In [None]:
# Graph shape
# Node count, then edge count (edges exist only where the cited paper is itself a node).
print(len(G.nodes))
print(len(G.edges))

## Link Prediction

In [29]:
def train_test_split_graph(G, test_ratio=0.3, seed=52):
    """Hold out a random share of the edges of `G` for testing.

    Args:
        G: graph whose edges are split; it is copied, not modified.
        test_ratio: fraction of edges to hold out (rounded down).
        seed: value passed to `random.seed`, which reseeds the module-level
            generator (so later `random` calls are affected too).

    Returns:
        (train_graph, test_edges): a copy of `G` without the held-out edges,
        and the list of held-out edges.
    """
    random.seed(seed)
    all_edges = list(G.edges())
    held_out = random.sample(all_edges, int(len(all_edges) * test_ratio))

    pruned = G.copy()
    pruned.remove_edges_from(held_out)
    return pruned, held_out

# Hold out edges with the function defaults (test_ratio=0.3, seed=52); the held-out
# edges serve as the positive examples in the evaluation cells.
G_train, test_edges = train_test_split_graph(G)


In [30]:
# Pick random node pairs that are not connected to evaluate false positives
def generate_negative_edges(G, num_edges, excluded_edges, max_attempts=None):
    """Sample distinct node pairs that are not linked in either direction.

    Args:
        G: graph providing `nodes()` and `has_edge(u, v)`.
        num_edges: number of negative pairs to return.
        excluded_edges: container of (u, v) pairs that must not be returned;
            a pair is rejected if it appears there in either orientation.
        max_attempts: upper bound on random draws. Defaults to
            max(1000, 100 * num_edges) so the loop cannot spin forever when
            the graph has too few unconnected pairs.

    Returns:
        List of (u, v) tuples in the order they were drawn.

    Raises:
        ValueError: fewer than two nodes are available.
        RuntimeError: `max_attempts` draws did not yield enough pairs.
    """
    if num_edges <= 0:
        return []

    nodes = list(G.nodes())
    if len(nodes) < 2:
        raise ValueError("Need at least two nodes to sample negative edges")
    if max_attempts is None:
        max_attempts = max(1000, 100 * num_edges)

    seen = set()
    neg_edges = []  # a list keeps the draw order stable across runs
    for _ in range(max_attempts):
        u, v = random.sample(nodes, 2)
        if (u, v) in seen:
            continue
        # The pairs are scored on an undirected graph, so an edge in either
        # direction means the pair is connected and cannot be a negative.
        if G.has_edge(u, v) or G.has_edge(v, u):
            continue
        if (u, v) in excluded_edges or (v, u) in excluded_edges:
            continue
        seen.add((u, v))
        neg_edges.append((u, v))
        if len(neg_edges) == num_edges:
            return neg_edges

    raise RuntimeError(
        f"Only found {len(neg_edges)} of {num_edges} negative edges "
        f"after {max_attempts} attempts"
    )

# Draw as many negative pairs as there are held-out test edges, so the evaluation
# set has equally many positives and negatives.
negative_edges = generate_negative_edges(G, len(test_edges), set(test_edges))

In [31]:
# Convert to undirected graphs
# The neighbour-based scores in the following cells are computed on G_train_undirected.
# NOTE(review): G_undirected is not referenced again in the visible cells — confirm it is still needed.
G_undirected = G.to_undirected()
G_train_undirected = G_train.to_undirected()

In [None]:
# Common Neighbors
# For every candidate pair (held-out positives followed by sampled negatives), count
# the neighbours shared in the undirected training graph and keep only pairs with at
# least one. The inner generator computes each count exactly once; the outer
# comprehension filters on that value instead of recomputing it.
pred_common = [
    (u, v, shared)
    for u, v, shared in (
        (u, v, len(list(nx.common_neighbors(G_train_undirected, u, v))))
        for u, v in test_edges + negative_edges
        if u in G_train_undirected and v in G_train_undirected
    )
    if shared > 0
]
print(f"Number of Common Neighbors: {len(pred_common)}")

In [None]:
# Jaccard Coefficient
# Precompute each node's neighbour set once so pair scoring is plain set arithmetic.
neighbors = {
    node: set(G_train_undirected.neighbors(node))
    for node in G_train_undirected
}

def fast_jaccard(u, v):
    """Jaccard coefficient of the neighbour sets of `u` and `v`.

    Reads the module-level `neighbors` mapping (node -> set of neighbours).

    Returns:
        (u, v, score) where score is |N(u) & N(v)| / |N(u) | N(v)|, or 0.0 when
        either node is missing from `neighbors` or both neighbour sets are empty.
    """
    if u not in neighbors or v not in neighbors:
        return (u, v, 0.0)

    nbrs_u = neighbors[u]
    nbrs_v = neighbors[v]
    combined = nbrs_u | nbrs_v
    if not combined:
        return (u, v, 0.0)

    shared = nbrs_u & nbrs_v
    return (u, v, len(shared) / len(combined))

# Score every candidate pair: held-out positives first, then the sampled negatives.
jaccard_pairs = test_edges + negative_edges
pred_jaccard = [
    fast_jaccard(u, v)
    for u, v in tqdm(jaccard_pairs, desc="Calculating Jaccard Coefficient", total=len(jaccard_pairs))
]

In [34]:
# Adamic-Adar Index
# Scores the same candidate pairs (held-out positives followed by sampled negatives) on
# the undirected training graph. The result is materialised as a list because
# evaluate_predictions unpacks each entry as (u, v, score).
pred_adamic = list(nx.adamic_adar_index(G_train_undirected, test_edges + negative_edges))

In [None]:
def evaluate_predictions(preds, true_edges_set):
    """ROC-AUC of scored node pairs against a set of true edges.

    Args:
        preds: sequence of (u, v, score) triples.
        true_edges_set: container of (u, v) pairs counted as positives.

    Returns:
        The value returned by `roc_auc_score` for the derived labels and scores.
    """
    labels = []
    scores = []
    for src, dst, score in preds:
        labels.append((src, dst) in true_edges_set)
        scores.append(score)
    return roc_auc_score(labels, scores)

# Positives are exactly the held-out edges.
true_set = set(test_edges)
auc_jaccard, auc_adamic = (
    evaluate_predictions(preds, true_set)
    for preds in (pred_jaccard, pred_adamic)
)

# AUC scores
print(f"AUC - Jaccard: {auc_jaccard:.4f}")
print(f"AUC - Adamic-Adar: {auc_adamic:.4f}")

## Link Prediction With LLM Embeddings (SBERT)

In [None]:
# Ids are stored as strings before the frame is handed to Hugging Face `datasets`.
df['id'] = df['id'].astype(str)
hf_dataset = Dataset.from_pandas(df.filter(items=['id', 'title', 'references'], axis=1))

# Remove unnecessary index column (automatically added by from_pandas)
if "__index_level_0__" in hf_dataset.column_names:
    hf_dataset = hf_dataset.remove_columns(["__index_level_0__"])

# Verify dataset structure
print(hf_dataset)

In [37]:
# SBERT embedding model
# Created once at module level; generate_all_embeddings reads this global.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

def generate_all_embeddings(batch):
    """Embed the titles of one batch with the module-level `embedding_model`.

    Args:
        batch: mapping with a "title" entry holding a sequence of titles.

    Returns:
        {"embedding": vectors} with one vector per title, in input order.

    Missing titles (None / NaN) are embedded as the empty string and any other
    non-string value is converted with str(). One bad row therefore cannot
    abort the whole run, and the output stays aligned with the input rows.
    """
    texts = [
        "" if (title is None or title != title) else str(title)  # NaN != NaN
        for title in batch["title"]
    ]
    embeddings = embedding_model.embed_documents(texts)
    return {"embedding": embeddings}

In [None]:
# Parameters
BATCH_SIZE = 256
total_rows = len(hf_dataset)
num_batches = -(-total_rows // BATCH_SIZE)  # ceiling division: a partial last batch counts

# Batched processing with tqdm
batched_embeddings = []

for batch_idx in tqdm(range(num_batches), desc="Embedding batches"):
    lo = batch_idx * BATCH_SIZE
    hi = min(lo + BATCH_SIZE, total_rows)
    # Slicing the dataset gives a dict of columns; only "title" is read downstream.
    batched_embeddings.extend(generate_all_embeddings(hf_dataset[lo:hi])["embedding"])


# Add embeddings to the dataset
#hf_dataset = hf_dataset.add_column("embedding", batched_embeddings)

In [None]:
# Stack the per-paper vectors into one float32 matrix; row i belongs to hf_dataset row i
# because the batches were appended in dataset order.
batched_embeddings = np.array(batched_embeddings, dtype=np.float32)
assert len(batched_embeddings) == len(hf_dataset), "expected one embedding per dataset row"

# Read the id column once. Indexing hf_dataset[i] inside the loop would build a full
# row (every column) on each iteration just to read its id.
paper_ids = hf_dataset["id"]

# Map string paper id -> embedding vector for O(1) lookup during scoring.
node_embeddings = {
    str(paper_id): np.array(embedding, dtype=np.float32)
    for paper_id, embedding in tqdm(zip(paper_ids, batched_embeddings), total=len(batched_embeddings), desc="Building node_embeddings")
}

In [None]:
# Compute similarity from embeddings
def cached_score(u, v):
    """Cosine similarity between the stored embeddings of nodes `u` and `v`.

    `node_embeddings` is keyed by string ids, so both endpoints are converted
    with str() before the lookup. A node id of another type (e.g. an integer)
    then still finds its embedding instead of silently taking the fallback.

    Returns:
        The cosine similarity as a Python float, or 0.5 when either node has
        no stored embedding.
    """
    key_u, key_v = str(u), str(v)
    if key_u in node_embeddings and key_v in node_embeddings:
        return util.cos_sim(node_embeddings[key_u], node_embeddings[key_v]).item()
    return 0.5  # fallback for missing nodes

# Generate predictions
# Same candidate pairs as the graph heuristics: held-out positives, then sampled negatives.
embedding_pairs = test_edges + negative_edges
llm_preds = [
    (u, v, cached_score(u, v))
    for u, v in tqdm(embedding_pairs, desc="Calculating LLM scores", total=len(embedding_pairs))
]

In [None]:
# AUC scores
# Same ROC-AUC evaluation as the graph heuristics, applied to the embedding-similarity
# scores; the positives are the held-out test edges.
auc_sbert = evaluate_predictions(llm_preds, set(test_edges))
print(f"AUC - SBERT: {auc_sbert:.4f}")