# Homework 1

In [1]:
# Homework 1 -- sanity check that the kernel runs: emit the greeting twice,
# once from a literal and once from the variable holding the same text.
x = "hello world"
print("hello world", x, sep="\n")


hello world
hello world


## Part 1

In [66]:
import torch
import torch_geometric
import matplotlib.pyplot as plt
import networkx as nx
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
from torch_geometric.data import HeteroData
from sklearn.preprocessing import normalize
import pandas as pd
import json
import ast

In [67]:
# Load the node table and the directed training edge list.
node_df = pd.read_csv('data/nodes.csv')
edge_df = pd.read_csv('data/train_edges.csv')

# The 'properties' column is not used anywhere below, so drop it.
node_df = node_df.drop('properties', axis=1)

# Strip the list-style wrapper (e.g. "['Dataset']" -> "Dataset") so labels are simpler.
# regex=False is passed explicitly: "['" is not a valid regular expression, and
# pandas versions before 2.0 treat the pattern as a regex by default.
node_df['label'] = (
    node_df['label']
    .str.replace("['", "", regex=False)
    .str.replace("']", "", regex=False)
)

print(node_df.columns)
print(node_df.head())
print()
# How many nodes of each type there are.
print(node_df['label'].value_counts())

print()
# How many edges of each relationship type there are.
print(edge_df['relationship_type'].value_counts())


Index(['id', 'label'], dtype='object')
   id    label
0   0  Dataset
1   1  Dataset
2   2  Dataset
3   3  Dataset
4   4  Dataset

label
Publication       2584
ScienceKeyword    1609
Dataset           1300
Platform           142
Instrument          83
Project             44
DataCenter           1
Name: count, dtype: int64

relationship_type
HAS_SCIENCEKEYWORD    4015
USES_DATASET          3623
SUBCATEGORY_OF        1823
HAS_PLATFORM          1519
OF_PROJECT            1325
HAS_DATASET           1300
HAS_INSTRUMENT         215
Name: count, dtype: int64


In [68]:
data = HeteroData()

# The edge CSV refers to nodes by their global id, but a HeteroData graph indexes
# the nodes of every type separately (0 .. num_nodes_of_type - 1). Build two
# lookups keyed by global id: the node's type, and its position within that type.
if not node_df['id'].is_unique:
    raise ValueError("node ids must be unique to build per-type node indices")
global_ids = node_df['id'].to_numpy()
type_of_node = pd.Series(node_df['label'].to_numpy(), index=global_ids)
local_index_of_node = pd.Series(
    node_df.groupby('label').cumcount().to_numpy(), index=global_ids
)

# Define node sets by label (node types are registered in order of first appearance).
for label, nodes_of_type in node_df.groupby('label', sort=False):
    data[label].num_nodes = len(nodes_of_type)
    data[label].x = torch.ones((len(nodes_of_type), 1))  # dummy node features for the given node type

# Annotate every edge with the type and per-type index of both endpoints.
typed_edges = edge_df.assign(
    src_type=edge_df['source'].map(type_of_node),
    dst_type=edge_df['target'].map(type_of_node),
    src_local=edge_df['source'].map(local_index_of_node),
    dst_local=edge_df['target'].map(local_index_of_node),
)
dangling = typed_edges['src_type'].isna() | typed_edges['dst_type'].isna()
if dangling.any():
    raise ValueError(
        f"{int(dangling.sum())} edge(s) reference node ids that are missing from node_df"
    )

# One edge_index per (src type, relationship, dst type) triple. Grouping on the
# full triple (instead of reading the types off the first edge of a relationship)
# keeps edges correct even if one relationship connects several type pairs.
for (relationship_type, src_type, dst_type), relationship_edges in typed_edges.groupby(
        ['relationship_type', 'src_type', 'dst_type']):
    src_ids = torch.tensor(relationship_edges['src_local'].to_numpy(dtype='int64'), dtype=torch.long)
    dst_ids = torch.tensor(relationship_edges['dst_local'].to_numpy(dtype='int64'), dtype=torch.long)

    # .edge_index is the (2, NumEdges) matrix that PyG needs:
    # first row holds the source indices, second row the destination indices.
    data[(src_type, relationship_type, dst_type)].edge_index = torch.stack([src_ids, dst_ids], dim=0)

In [69]:
# Part 1 report: structure and key statistics of the heterogeneous graph.

# -- nodes --
node_total = data.num_nodes
print("Number of Nodes: ", node_total)
feature_counts = data.num_node_features
print("Number of Node features: ", feature_counts)
print("Node Types: ", data.node_types, end="\n\n")

# -- edges --
edge_total = data.num_edges
print("Number of Edges: ", edge_total)
print("Edge Types (Src node type, relationship_type, Dst node type): ",  data.edge_types, end="\n\n")

# -- structural checks --
isolated_flag = data.has_isolated_nodes()
print("Has Isolated Nodes: " , isolated_flag)
self_loop_flag = data.has_self_loops()
print("Has Self Loops: ", self_loop_flag)

Number of Nodes:  5763
Number of Node features:  {'Dataset': 1, 'DataCenter': 1, 'Project': 1, 'Platform': 1, 'Instrument': 1, 'Publication': 1, 'ScienceKeyword': 1}
Node Types:  ['Dataset', 'DataCenter', 'Project', 'Platform', 'Instrument', 'Publication', 'ScienceKeyword']

Number of Edges:  13820
Edge Types (Src node type, relationship_type, Dst node type):  [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Platform', 'HAS_INSTRUMENT', 'Instrument'), ('Dataset', 'HAS_PLATFORM', 'Platform'), ('Dataset', 'HAS_SCIENCEKEYWORD', 'ScienceKeyword'), ('Dataset', 'OF_PROJECT', 'Project'), ('ScienceKeyword', 'SUBCATEGORY_OF', 'ScienceKeyword'), ('Publication', 'USES_DATASET', 'Dataset')]

Has Isolated Nodes:  False
Has Self Loops:  False


## Part 2: Link Prediction
### Method #1: Embedding-Based Approach

**Task:** Apply an embedding-based method for link prediction.

**Description:** Train a model that generates node embeddings, then use those embeddings to predict links. Print relevant metrics.

"The primary goal of the NASA Knowledge Graph is to bridge scientific publications with the datasets they reference, facilitating deeper insights and research opportunities within NASA's scientific and data ecosystem. By organizing these interconnections within a graph structure, this dataset enables advanced analyses, such as discovering influential datasets, understanding research trends, and exploring scientific collaborations."

In [70]:
from collections import defaultdict
def get_metapaths_from_heterodata(data, max_depth=3):
    """
    Enumerate the metapaths reachable from every node type of a heterogeneous graph.

    The graph schema is read from ``data.edge_types`` and explored depth-first;
    every prefix encountered along the way is itself reported as a metapath,
    in the order the search visits it.

    Args:
        data (HeteroData): Graph object exposing ``edge_types`` (triples of
            source type, relation, destination type) and ``node_types``.
        max_depth (int): Upper bound on the number of edges in a metapath.

    Returns:
        dict[str, list[list[tuple[str, str, str]]]]:
            For each starting node type, the list of metapaths found, each
            metapath being a list of (src_type, relation, dst_type) triples.
    """
    # Schema adjacency: node type -> outgoing (relation, destination type) pairs.
    schema = {}
    for origin, relation, destination in data.edge_types:
        schema.setdefault(origin, []).append((relation, destination))

    def walk(node_type, prefix):
        """Yield each extension of ``prefix`` from ``node_type``, depth-first."""
        if len(prefix) >= max_depth:
            return
        # Destination types already on this path; stepping onto one again is skipped.
        visited = tuple(dst for _, _, dst in prefix)
        for relation, following in schema.get(node_type, ()):
            if following in visited:
                continue
            extended = [*prefix, (node_type, relation, following)]
            yield extended
            yield from walk(following, extended)

    return {start: list(walk(start, [])) for start in data.node_types}


# Enumerate the schema-level metapaths of up to five edges for every node type.
max_metapath_edges = 5
metapaths = get_metapaths_from_heterodata(data, max_depth=max_metapath_edges)



In [76]:
# Show which node types were used as starting points, then the metapaths found for each.
print(metapaths.keys())
print()
for start_type, paths_from_type in metapaths.items():
    print(f"metapaths[{start_type}] : {paths_from_type}")
          

dict_keys(['Dataset', 'DataCenter', 'Project', 'Platform', 'Instrument', 'Publication', 'ScienceKeyword'])

metapaths[Dataset] : [[('Dataset', 'HAS_PLATFORM', 'Platform')], [('Dataset', 'HAS_PLATFORM', 'Platform'), ('Platform', 'HAS_INSTRUMENT', 'Instrument')], [('Dataset', 'HAS_SCIENCEKEYWORD', 'ScienceKeyword')], [('Dataset', 'OF_PROJECT', 'Project')]]
metapaths[DataCenter] : [[('DataCenter', 'HAS_DATASET', 'Dataset')], [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Dataset', 'HAS_PLATFORM', 'Platform')], [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Dataset', 'HAS_PLATFORM', 'Platform'), ('Platform', 'HAS_INSTRUMENT', 'Instrument')], [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Dataset', 'HAS_SCIENCEKEYWORD', 'ScienceKeyword')], [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Dataset', 'OF_PROJECT', 'Project')]]
metapaths[Project] : []
metapaths[Platform] : [[('Platform', 'HAS_INSTRUMENT', 'Instrument')]]
metapaths[Instrument] : []
metapaths[Publication] : [[('Publication', 'USES_DATASET'

Idea: The metapath links Publications to the Datasets they use, and those Datasets to the Projects they belong to. If we can predict links between Publications and the Projects they are likely related to, then we can connect the authors of those publications with Projects they should already be linked to because of similar fields or common interests. <br>
Publication USES_DATASET Dataset, Dataset OF_PROJECT Project <br>
Could also try ('Publication', 'USES_DATASET', 'Dataset'), ('Dataset', 'HAS_SCIENCEKEYWORD', 'ScienceKeyword') ? 

https://pytorch-geometric.readthedocs.io/en/2.6.0/tutorial/shallow_node_embeddings.html <br>
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.MetaPath2Vec.html

In [79]:
# Embedding model: MetaPath2Vec from PyG, plus its random-walk loader and optimizer.

from torch_geometric.nn import MetaPath2Vec

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Schema the random walks follow: Publication -> Dataset -> Platform -> Instrument.
metapath = [
    ('Publication', 'USES_DATASET', 'Dataset'),
    ('Dataset', 'HAS_PLATFORM', 'Platform'),
    ('Platform', 'HAS_INSTRUMENT', 'Instrument'),
]

model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=128,
    metapath=metapath,
    walk_length=3,
    context_size=2,
    walks_per_node=5,
    num_negative_samples=5,
    sparse=True,
)
model = model.to(device)

# Batches of positive / negative random walks consumed by the training loop.
loader = model.loader(batch_size=128, shuffle=True, num_workers=6)
# The model is built with sparse=True, so it is paired with the sparse Adam optimizer.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

In [None]:
def train(epoch, log_steps=100, eval_steps=2000):
    """Run one epoch of MetaPath2Vec training over the module-level `loader`.

    Args:
        epoch (int): Epoch number, used only in the progress messages.
        log_steps (int): Print the running mean loss every `log_steps` batches.
        eval_steps (int): Run `test()` and print its accuracy every `eval_steps` batches.

    Returns:
        float: Mean loss over all batches of the epoch (0.0 if the loader is empty).
            Returned so the caller can report progress even when an epoch has
            fewer than `log_steps` batches and the periodic log never fires.
    """
    model.train()

    total_loss = 0      # loss accumulated since the last periodic log line
    epoch_loss = 0.0    # loss accumulated over the whole epoch
    num_steps = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        epoch_loss += batch_loss
        num_steps += 1
        if (i + 1) % log_steps == 0:
            print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                  f'Loss: {total_loss / log_steps:.4f}')
            total_loss = 0

        if (i + 1) % eval_steps == 0:
            acc = test()
            print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                  f'Acc: {acc:.4f}')
            # test() switches the model to eval mode; resume training mode.
            model.train()

    return epoch_loss / max(num_steps, 1)

@torch.no_grad()
def test(train_ratio=0.1, edge_type=('Publication', 'USES_DATASET', 'Dataset')):
    """Score the learned embeddings on a link-prediction task for one edge type.

    Every edge of `edge_type` stored in `data` is a positive example; for each one
    a negative example is made by pairing the same source node with a randomly
    drawn destination node. An edge is represented by the element-wise product of
    its two endpoint embeddings, and `model.test` fits a classifier on a
    `train_ratio` share of the examples and reports accuracy on the rest.

    NOTE(review): the positives are the same edges the random walks were sampled
    from, and a random negative can coincide with a real edge, so treat the
    number as a sanity check rather than a held-out estimate.

    Args:
        train_ratio (float): Fraction of the examples used to fit the classifier.
        edge_type (tuple[str, str, str]): (src type, relation, dst type) to
            evaluate. Both node types must appear in the model's metapath,
            since embeddings are only requested for those types.

    Returns:
        float: Classification accuracy returned by `model.test`.
    """
    model.eval()

    src_type, _, dst_type = edge_type
    z_src = model(src_type)
    z_dst = model(dst_type)

    pos_src, pos_dst = data[edge_type].edge_index.to(device)
    neg_dst = torch.randint(0, z_dst.size(0), pos_dst.shape, device=pos_dst.device)

    # Edge features: positives first, then the corrupted (negative) pairs.
    z = torch.cat([z_src[pos_src] * z_dst[pos_dst],
                   z_src[pos_src] * z_dst[neg_dst]], dim=0)
    y = torch.cat([torch.ones(pos_src.size(0), dtype=torch.long),
                   torch.zeros(pos_src.size(0), dtype=torch.long)]).to(z.device)

    perm = torch.randperm(z.size(0), device=z.device)
    train_perm = perm[:int(z.size(0) * train_ratio)]
    test_perm = perm[int(z.size(0) * train_ratio):]

    return model.test(z[train_perm], y[train_perm], z[test_perm], y[test_perm],
                      max_iter=150)


for epoch in range(1, 6):
    mean_loss = train(epoch)
    print(f'Epoch: {epoch}, Loss: {mean_loss:.4f}')
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

### Method 2: Alternative Approach 
**Task:** Choose and implement another link prediction method.

**Description:** This method should not use embeddings. You can use any approach of your choice. Compare the performance of this method with the embedding-based method.