# Homework 1

## Part 1

In [59]:
import torch
from torch_geometric.data import Data
import pandas as pd


In [61]:
#import nodes and edges
node_df = pd.read_csv('data/nodes.csv')
edge_df = pd.read_csv('data/train_edges.csv')
val_links = pd.read_csv('data/val_links.csv')
test_links = pd.read_csv('data/test_links.csv')

#print(node_df.head())
#print(node_df['label'].unique())
#print(edge_df.head())
#print(edge_df['relationship_type'].unique())

#node_df["properties"] = node_df["properties"].apply(ast.literal_eval)
#print(type(node_df['properties'].iloc[0]))

#drop properties column because won't be used
node_df = node_df.drop('properties', axis=1)
#remove quotes and brackets so labels are simpler
node_df['label'] = node_df['label'].str.replace("['", "")
node_df['label'] = node_df['label'].str.replace("']", "")
print(node_df.columns)
print(node_df.head())
#Since Validation and Test sets only have Dataset and ScienceKeyword nodes and HAS_SCIENCEKEYWORD relationship_types, filter nodes.csv and train_edges.csv to only these node 'label's and this 'relationship_type'
node_df = node_df[(node_df['label'] == 'Dataset') | (node_df['label'] == 'ScienceKeyword')]
edge_df = edge_df[edge_df['relationship_type'] == 'HAS_SCIENCEKEYWORD']
print()
print(node_df['label'].value_counts())

print()
print(edge_df['relationship_type'].value_counts())

print()
print("Number of training examples: ", len(edge_df))
print("Number of validation examples: ", len(val_links))
print("Number of test examples: ", len(test_links))


Index(['id', 'label'], dtype='object')
   id    label
0   0  Dataset
1   1  Dataset
2   2  Dataset
3   3  Dataset
4   4  Dataset

label
ScienceKeyword    1609
Dataset           1300
Name: count, dtype: int64

relationship_type
HAS_SCIENCEKEYWORD    4015
Name: count, dtype: int64

Number of training examples:  4015
Number of validation examples:  860
Number of test examples:  861


**Important: - `val_links.csv`: Contains `HAS_SCIENCEKEYWORD` edges for validation.`test_links.csv`: Contains `HAS_SCIENCEKEYWORD` edges for testing.** <br>
So, the validation and test sets are only looking at relationship type Dataset -> ScienceKeyword. 

In [62]:
# Map node IDs to contiguous indices for PyG
node_df['idx'] = range(len(node_df))
id_to_idx = dict(zip(node_df['id'], node_df['idx']))

ones = torch.ones((len(node_df), 1))  # dummy node features for each node
# Map node IDs to indices
edge_df['source'] = edge_df['source'].map(id_to_idx)
edge_df['target'] = edge_df['target'].map(id_to_idx)

used_nodes = pd.unique(edge_df[['source','target']].values.ravel())
node_df = node_df[node_df['idx'].isin(used_nodes)].reset_index(drop=True)

# Remap node IDs to contiguous indices
node_df['idx'] = range(len(node_df))
id_to_idx = dict(zip(node_df['id'], node_df['idx']))

# Remap edges
edge_df['source'] = edge_df['source'].map(id_to_idx)
edge_df['target'] = edge_df['target'].map(id_to_idx)


val_links['source'] = val_links['source'].map(id_to_idx)
val_links['target'] = val_links['target'].map(id_to_idx)

test_links['source'] = test_links['source'].map(id_to_idx)
test_links['target'] = test_links['target'].map(id_to_idx)


# Create edge_index tensor
edge_index = torch.tensor(edge_df[['source', 'target']].values, dtype=torch.long).t().contiguous()

data = Data(x=ones, edge_index=edge_index)
print(data)

Data(x=[2909, 1], edge_index=[2, 4015])


In [63]:
#Summary of Dataset Structure and Key Statistics for Part 1
print("Number of Nodes: ", data.num_nodes)
print("Number of Node features: ", data.num_node_features)
print()
print("Number of Edges: ", data.num_edges)
print()
print("Has Isolated Nodes: " , data.has_isolated_nodes())
print("Has Self Loops: ", data.has_self_loops())

Number of Nodes:  2909
Number of Node features:  1

Number of Edges:  4015

Has Isolated Nodes:  True
Has Self Loops:  False


## Part 2: Link Prediction
### Method #1: Embedding-Based Approach

Task: Apply an embedding-based method for link prediction. ○ Description: Train a model that generates node embeddings, then use those embeddings to predict links. Print relevant metrics.

"The primary goal of the NASA Knowledge Graph is to bridge scientific publications with the datasets they reference, facilitating deeper insights and research opportunities within NASA's scientific and data ecosystem. By organizing these interconnections within a graph structure, this dataset enables advanced analyses, such as discovering influential datasets, understanding research trends, and exploring scientific collaborations."

https://pytorch-geometric.readthedocs.io/en/2.6.0/tutorial/shallow_node_embeddings.
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.MetaPath2Vec.html

In [64]:
from torch_geometric.nn import Node2Vec

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = Node2Vec(data.edge_index, embedding_dim=128, walk_length=10, context_size=5, walks_per_node=10, num_nodes=edge_index.max().item() + 1).to(device)

print(model)

loader = model.loader(batch_size=32, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

Node2Vec(1225, 128)


In [None]:
for epoch in range(25):
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw, neg_rw)  # negative samples are neg_rw
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
    


IndexError: index out of range in self

: 

In [43]:
#Generate Negative Edges

import numpy as np

num_nodes = data.num_nodes
print(num_nodes)

def gen_neg_edges(negative_edges_goal, edge_index, num_nodes):
    existing_edges = set([tuple(e) for e in edge_index.t().tolist()])
    neg_edges = set()
    while len(neg_edges) < negative_edges_goal:
        u = np.random.randint(0, num_nodes)
        v = np.random.randint(0, num_nodes)
        if u == v: 
            continue  # skip self-loops
        if (u,v) in existing_edges or (v,u) in existing_edges:
            continue
        neg_edges.add((u,v))
    return np.array(list(neg_edges))

val_neg = gen_neg_edges(len(val_links), data.edge_index, num_nodes)
test_neg = gen_neg_edges(len(test_links), data.edge_index, num_nodes)

print(val_neg.size)
print(test_neg.size)
print()
# Combine positive and negative edges
val_pos = val_links[['source','target']].values
val_edges = np.vstack([val_pos, val_neg])
val_labels = np.hstack([np.ones(len(val_pos)), np.zeros(len(val_neg))])

test_pos = test_links[['source','target']].values
test_edges = np.vstack([test_pos, test_neg])
test_labels = np.hstack([np.ones(len(test_pos)), np.zeros(len(test_neg))])

2909
1720
1722



In [44]:
from sklearn.metrics import roc_auc_score

embeddings = model.embedding.weight.data

def edge_score(u, v, emb):
    return (emb[u] * emb[v]).sum().item()  # dot product

print(val_edges.size)
print(test_edges.size)

print("num_nodes:", num_nodes)
print("max edge index:", edge_index.max().item())

val_scores = [edge_score(u, v, embeddings) for u,v in val_edges]
test_scores = [edge_score(u, v, embeddings) for u,v in test_edges]

#Evaluate with AUC
val_auc = roc_auc_score(val_labels, val_scores)
test_auc = roc_auc_score(test_labels, test_scores)

print("Validation AUC:", val_auc)
print("Test AUC:", test_auc)

3440
3444
num_nodes: 2909
max edge index: 2872


IndexError: index 2890 is out of bounds for dimension 0 with size 2873

### Method 2: Alternative Approach 
Task: Choose and implement another link prediction method. ○ Description: This method should not use embeddings. You can use any approach of your choice. Compare the performance of this method with the embedding-based method.

In [None]:
#Non-Embedding Method for link prediction
