# Homework 1

## Part 1

In [None]:
import torch
import torch_geometric
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
from torch_geometric.data import HeteroData
from sklearn.preprocessing import normalize
import pandas as pd


In [None]:
#import nodes and edges
node_df = pd.read_csv('data/nodes.csv')
#directed graph
edge_df = pd.read_csv('data/train_edges.csv')

#drop properties column because won't be used
node_df = node_df.drop('properties', axis=1)
#remove quotes and brackets so labels are simpler
node_df['label'] = node_df['label'].str.replace("['", "")
node_df['label'] = node_df['label'].str.replace("']", "")
print(node_df.columns)
print(node_df.head())
print()
print(node_df['label'].value_counts())

print()
print(edge_df['relationship_type'].value_counts())


Index(['id', 'label'], dtype='object')
   id    label
0   0  Dataset
1   1  Dataset
2   2  Dataset
3   3  Dataset
4   4  Dataset

label
Publication       2584
ScienceKeyword    1609
Dataset           1300
Platform           142
Instrument          83
Project             44
DataCenter           1
Name: count, dtype: int64

relationship_type
HAS_SCIENCEKEYWORD    4015
USES_DATASET          3623
SUBCATEGORY_OF        1823
HAS_PLATFORM          1519
OF_PROJECT            1325
HAS_DATASET           1300
HAS_INSTRUMENT         215
Name: count, dtype: int64


In [4]:
data = HeteroData()

# Define node sets by label
for label in node_df['label'].unique():
    node_ids = node_df[node_df['label'] == label]['id'].values #get all nodes for each type of node
    data[label].num_nodes = len(node_ids)
    data[label].x = torch.ones((len(node_ids), 1))  # dummy node features for the given node type

#for each edge relationship type
for relationship_type, relationship_edges in edge_df.groupby('relationship_type'):
    src_nodes_of_type = relationship_edges['source'].values
    dst_nodes_of_type = relationship_edges['target'].values
    src_ids = torch.tensor(src_nodes_of_type, dtype=torch.long)
    dst_ids = torch.tensor(dst_nodes_of_type, dtype=torch.long)

    # Determine node types dynamically
    #print(relationship_edges['source'].iloc[0])
    source_id = relationship_edges['source'].iloc[0]
    src_type = node_df[node_df['id'] == source_id]['label'].values[0]

    target_id = relationship_edges['target'].iloc[0]
    dst_type = node_df[node_df['id'] == target_id]['label'].values[0]

    #creates edge_index tensors for each edge relationship_type
    # .edge_index tensor is of dimensions (2, NumEdges) matrix that PyG needs
    # 2 rows where first row is Src ids and second row is Dst ids for edges
    data[(src_type, relationship_type, dst_type)].edge_index = torch.stack([src_ids, dst_ids], dim=0)

In [5]:
#Summary of Dataset Structure and Key Statistics for Part 1
print("Number of Nodes: ", data.num_nodes)
print("Number of Node features: ", data.num_node_features)
print("Node Types: ", data.node_types)
print()
print("Number of Edges: ", data.num_edges)
print("Edge Types (Src node type, relationship_type, Dst node type): ",  data.edge_types)
print()
print("Has Isolated Nodes: " , data.has_isolated_nodes())
print("Has Self Loops: ", data.has_self_loops())

Number of Nodes:  5763
Number of Node features:  {'Dataset': 1, 'DataCenter': 1, 'Project': 1, 'Platform': 1, 'Instrument': 1, 'Publication': 1, 'ScienceKeyword': 1}
Node Types:  ['Dataset', 'DataCenter', 'Project', 'Platform', 'Instrument', 'Publication', 'ScienceKeyword']

Number of Edges:  13820
Edge Types (Src node type, relationship_type, Dst node type):  [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Platform', 'HAS_INSTRUMENT', 'Instrument'), ('Dataset', 'HAS_PLATFORM', 'Platform'), ('Dataset', 'HAS_SCIENCEKEYWORD', 'ScienceKeyword'), ('Dataset', 'OF_PROJECT', 'Project'), ('ScienceKeyword', 'SUBCATEGORY_OF', 'ScienceKeyword'), ('Publication', 'USES_DATASET', 'Dataset')]

Has Isolated Nodes:  False
Has Self Loops:  False


## Part 2: Link Prediction
### Method #1: Embedding-Based Approach

Task: Apply an embedding-based method for link prediction. ○ Description: Train a model that generates node embeddings, then use those embeddings to predict links. Print relevant metrics.

"The primary goal of the NASA Knowledge Graph is to bridge scientific publications with the datasets they reference, facilitating deeper insights and research opportunities within NASA's scientific and data ecosystem. By organizing these interconnections within a graph structure, this dataset enables advanced analyses, such as discovering influential datasets, understanding research trends, and exploring scientific collaborations."

In [6]:
import networkx as nx
from torch_geometric.data import HeteroData

def build_schema_graph(data: HeteroData):
    """
    Build a directed schema graph (node types as nodes, relation types as edges)
    from a PyG HeteroData object.
    """
    schema = nx.DiGraph()
    
    for src_type, rel_type, dst_type in data.edge_types:
        schema.add_edge(src_type, dst_type, relation=rel_type)
    
    return schema


def find_metapaths(schema, start_type, max_length=5):
    """
    Find all meta-paths in the schema graph that start and end
    at the same node type.
    """
    paths = []

    def dfs(current, path):
        if len(path) > max_length:
            return
        # if we've returned to the start type (and it's not the first node)
        if len(path) > 1 and current == start_type:
            paths.append(list(path))
        for neighbor in schema.neighbors(current):
            path.append(neighbor)
            dfs(neighbor, path)
            path.pop()

    dfs(start_type, [start_type])
    return paths


# ---- Example usage ----
# Assuming you already have your HeteroData object called `data`

schema = build_schema_graph(data)


metapaths = find_metapaths(schema, start_type='Publication', max_length=5)
print(f"Found {len(metapaths)} meta-paths starting and ending at 'Publication':")
for mp in metapaths:
    print(" → ".join(mp))

metapaths = find_metapaths(schema, start_type='ScienceKeyword', max_length=5)
print(f"Found {len(metapaths)} meta-paths starting and ending at 'ScienceKeyword':")
for mp in metapaths:
    print(" → ".join(mp))

Found 0 meta-paths starting and ending at 'Publication':
Found 4 meta-paths starting and ending at 'ScienceKeyword':
ScienceKeyword → ScienceKeyword
ScienceKeyword → ScienceKeyword → ScienceKeyword
ScienceKeyword → ScienceKeyword → ScienceKeyword → ScienceKeyword
ScienceKeyword → ScienceKeyword → ScienceKeyword → ScienceKeyword → ScienceKeyword


Idea: If there were metapaths that started at Publication and ended at Publication, we could use these manually defined paths to look at Publications with similar embeddings and infer that the Publications were in the same topic area. If we can predict links between Publications and Projects that we think would be linked, then we can connect author's of the Publications with new Projects that should already be linked due to similar fields/common interests. <br>

https://pytorch-geometric.readthedocs.io/en/2.6.0/tutorial/shallow_node_embeddings.
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.MetaPath2Vec.html

Since there are no metapaths that start and end at the same node type besides the ScienceKeyword node type, I am going to use regular Node2Vec.  

In [27]:
#Generating Embeddings using Node2Vec package from PyG

from torch_geometric.nn import Node2Vec

homog_data = data.to_homogeneous()
print("Num Nodes: ", homog_data.num_nodes)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = Node2Vec(homog_data.edge_index, embedding_dim=128, walk_length=10, context_size=5, walks_per_node=10, sparse=True).to(device)

Num Nodes:  5763


In [28]:
# Assuming your edges are in homo_data.edge_index
edge_index = homog_data.edge_index

# Step 1. Get unique node IDs actually used in the graph
unique_nodes = torch.unique(edge_index)

# Step 2. Build a mapping from old → new ID
id_map = {old.item(): new for new, old in enumerate(unique_nodes)}

# Step 3. Remap edge_index
mapped_edges = torch.tensor([
    [id_map[src.item()] for src in edge_index[0]],
    [id_map[dst.item()] for dst in edge_index[1]]
], dtype=torch.long)

print(f"Old num nodes: {edge_index.max().item() + 1}")
print(f"New num nodes: {len(unique_nodes)}")

model = Node2Vec(mapped_edges, embedding_dim=128, walk_length=20, context_size=10, walks_per_node=10)
print(model)

loader = model.loader(batch_size=128, shuffle=True)
optimizer = torch.optim.Adam(list(model.parameters()), lr=0.01)

Old num nodes: 29178
New num nodes: 5763
Node2Vec(5763, 128)


In [29]:
def train():
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(1, 25):
    loss = train()
    #acc = test()
    #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 5.3660
Epoch: 002, Loss: 4.0846
Epoch: 003, Loss: 3.1239
Epoch: 004, Loss: 2.4048
Epoch: 005, Loss: 1.9087
Epoch: 006, Loss: 1.5544
Epoch: 007, Loss: 1.3038
Epoch: 008, Loss: 1.1337
Epoch: 009, Loss: 1.0144
Epoch: 010, Loss: 0.9336
Epoch: 011, Loss: 0.8804
Epoch: 012, Loss: 0.8434
Epoch: 013, Loss: 0.8164
Epoch: 014, Loss: 0.7996
Epoch: 015, Loss: 0.7839
Epoch: 016, Loss: 0.7740
Epoch: 017, Loss: 0.7658
Epoch: 018, Loss: 0.7596
Epoch: 019, Loss: 0.7550
Epoch: 020, Loss: 0.7506
Epoch: 021, Loss: 0.7472
Epoch: 022, Loss: 0.7448
Epoch: 023, Loss: 0.7421
Epoch: 024, Loss: 0.7405


In [31]:
import torch.nn.functional as F

embeddings = model()
print(model)

#import validation and test edges
val_links = pd.read_csv('data/val_links.csv')
val_links['source'] = val_links['source'].map(id_map)
val_links['target'] = val_links['target'].map(id_map)
test_links = pd.read_csv('data/test_links.csv')
test_links['source'] = test_links['source'].map(id_map)
test_links['target'] = test_links['target'].map(id_map)

#convert validation and test CSVs to tensors
val_src = torch.tensor(val_links['source'].values, dtype=torch.long)
val_dst = torch.tensor(val_links['target'].values, dtype=torch.long)

test_src = torch.tensor(test_links['source'].values, dtype=torch.long)
test_dst = torch.tensor(test_links['target'].values, dtype=torch.long)

missing_src = set(val_links['source']) - set(id_map.keys())
missing_dst = set(val_links['target']) - set(id_map.keys())
print("Missing source nodes:", missing_src)
print("Missing target nodes:", missing_dst)

#compute dot products between src and dst node embeddings for similarity score
val_scores = (embeddings[val_src] * embeddings[val_dst]).sum(dim=1)
test_scores = (embeddings[test_src] * embeddings[test_dst]).sum(dim=1)

#wrap in sigmoid to get probabilities for each src and dst pair
val_probs = torch.sigmoid(val_scores)
test_probs = torch.sigmoid(test_scores)

Node2Vec(5763, 128)
Missing source nodes: set()
Missing target nodes: {nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 4120.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 4142.0, nan, nan, nan, 4149.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 

IndexError: index -9223372036854775808 is out of bounds for dimension 0 with size 5763

### Method 2: Alternative Approach 
Task: Choose and implement another link prediction method. ○ Description: This method should not use embeddings. You can use any approach of your choice. Compare the performance of this method with the embedding-based method.