# Homework 1

## Part 1

In [29]:
import torch
from torch_geometric.data import Data
import pandas as pd


In [39]:
#import nodes and edges
node_df = pd.read_csv('data/nodes.csv')
edge_df = pd.read_csv('data/train_edges.csv')
val_links = pd.read_csv('data/val_links.csv')
test_links = pd.read_csv('data/test_links.csv')

#print(node_df.head())
#print(node_df['label'].unique())
#print(edge_df.head())
#print(edge_df['relationship_type'].unique())

#node_df["properties"] = node_df["properties"].apply(ast.literal_eval)
#print(type(node_df['properties'].iloc[0]))

#drop properties column because won't be used
node_df = node_df.drop('properties', axis=1)
#remove quotes and brackets so labels are simpler
node_df['label'] = node_df['label'].str.replace("['", "")
node_df['label'] = node_df['label'].str.replace("']", "")
print(node_df.columns)
print(node_df.head())
print()
print(node_df['label'].value_counts())

print()
print(edge_df['relationship_type'].value_counts())

print()
print("Number of training examples: ", len(edge_df))
print("Number of validation examples: ", len(val_links))
print("Number of test examples: ", len(test_links))


Index(['id', 'label'], dtype='object')
   id    label
0   0  Dataset
1   1  Dataset
2   2  Dataset
3   3  Dataset
4   4  Dataset

label
Publication       2584
ScienceKeyword    1609
Dataset           1300
Platform           142
Instrument          83
Project             44
DataCenter           1
Name: count, dtype: int64

relationship_type
HAS_SCIENCEKEYWORD    4015
USES_DATASET          3623
SUBCATEGORY_OF        1823
HAS_PLATFORM          1519
OF_PROJECT            1325
HAS_DATASET           1300
HAS_INSTRUMENT         215
Name: count, dtype: int64

Number of training examples:  13820
Number of validation examples:  860
Number of test examples:  861


**Important: - `val_links.csv`: Contains `HAS_SCIENCEKEYWORD` edges for validation.`test_links.csv`: Contains `HAS_SCIENCEKEYWORD` edges for testing.** <br>
So, the validation and test sets are only looking at relationship type Dataset -> ScienceKeyword. 

In [40]:
# Map node IDs to contiguous indices for PyG
node_df['idx'] = range(len(node_df))
id_to_idx = dict(zip(node_df['id'], node_df['idx']))

ones = torch.ones((len(node_df), 1))  # dummy node features for each node
# Map node IDs to indices
edge_df['source'] = edge_df['source'].map(id_to_idx)
edge_df['target'] = edge_df['target'].map(id_to_idx)

val_links['source'] = val_links['source'].map(id_to_idx)
val_links['target'] = val_links['target'].map(id_to_idx)

test_links['source'] = test_links['source'].map(id_to_idx)
test_links['target'] = test_links['target'].map(id_to_idx)


# Create edge_index tensor
edge_index = torch.tensor(edge_df[['source', 'target']].values, dtype=torch.long).t().contiguous()

val_index = torch.tensor(val_links[['source', 'target']].values, dtype=torch.long).t().contiguous()
test_index = torch.tensor(test_links[['source', 'target']].values, dtype=torch.long).t().contiguous()
#shape must be (2 , num_edges)
print(val_index.size())
print(test_index.size())


data = Data(x=ones, edge_index=edge_index)
print(data)

torch.Size([2, 860])
torch.Size([2, 861])
Data(x=[5763, 1], edge_index=[2, 13820])


In [41]:
#Summary of Dataset Structure and Key Statistics for Part 1
print("Number of Nodes: ", data.num_nodes)
print("Number of Node features: ", data.num_node_features)
print()
print("Number of Edges: ", data.num_edges)
print()
print("Has Isolated Nodes: " , data.has_isolated_nodes())
print("Has Self Loops: ", data.has_self_loops())

Number of Nodes:  5763
Number of Node features:  1

Number of Edges:  13820

Has Isolated Nodes:  False
Has Self Loops:  False


## Part 2: Link Prediction
### Method #1: Embedding-Based Approach

Task: Apply an embedding-based method for link prediction. ○ Description: Train a model that generates node embeddings, then use those embeddings to predict links. Print relevant metrics.

"The primary goal of the NASA Knowledge Graph is to bridge scientific publications with the datasets they reference, facilitating deeper insights and research opportunities within NASA's scientific and data ecosystem. By organizing these interconnections within a graph structure, this dataset enables advanced analyses, such as discovering influential datasets, understanding research trends, and exploring scientific collaborations."

https://pytorch-geometric.readthedocs.io/en/2.6.0/tutorial/shallow_node_embeddings.
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.MetaPath2Vec.html

In [42]:
from torch_geometric.nn import Node2Vec

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = Node2Vec(data.edge_index, embedding_dim=128, walk_length=10, context_size=5, walks_per_node=10).to(device)

print(model)

loader = model.loader(batch_size=32, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

Node2Vec(5763, 128)


In [43]:
for epoch in range(50):
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw, neg_rw)  # negative samples are neg_rw
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
    


Epoch 1, Loss: 1007.7121
Epoch 2, Loss: 734.0953
Epoch 3, Loss: 536.2362
Epoch 4, Loss: 399.0610
Epoch 5, Loss: 308.6581
Epoch 6, Loss: 247.1568
Epoch 7, Loss: 206.3326
Epoch 8, Loss: 180.2015
Epoch 9, Loss: 163.8000
Epoch 10, Loss: 154.0092
Epoch 11, Loss: 147.8088
Epoch 12, Loss: 144.0666
Epoch 13, Loss: 141.4387
Epoch 14, Loss: 139.9338
Epoch 15, Loss: 138.8192
Epoch 16, Loss: 138.2774
Epoch 17, Loss: 137.7419
Epoch 18, Loss: 137.4934
Epoch 19, Loss: 137.3682
Epoch 20, Loss: 137.3345
Epoch 21, Loss: 137.2834
Epoch 22, Loss: 137.2292
Epoch 23, Loss: 137.4433
Epoch 24, Loss: 137.5447
Epoch 25, Loss: 137.5329
Epoch 26, Loss: 137.7108
Epoch 27, Loss: 137.6426
Epoch 28, Loss: 137.6790
Epoch 29, Loss: 137.8137
Epoch 30, Loss: 137.7188
Epoch 31, Loss: 137.8506
Epoch 32, Loss: 137.8430
Epoch 33, Loss: 137.8541
Epoch 34, Loss: 137.7113
Epoch 35, Loss: 137.7094
Epoch 36, Loss: 137.7095
Epoch 37, Loss: 137.6874
Epoch 38, Loss: 137.6003
Epoch 39, Loss: 137.5961
Epoch 40, Loss: 137.7175
Epoch 41

In [50]:
from torch_geometric.utils import negative_sampling

print("val_index.size(1): ", val_index.size(1))
print("test_index.size(1): ", test_index.size(1))
print("model.size(0): ", model.num_nodes)

neg_val_index = negative_sampling(
    edge_index=data.edge_index, 
    num_nodes=model.num_nodes,
    num_neg_samples=val_index.size(1)
)

neg_test_index = negative_sampling(
    edge_index=data.edge_index,
    num_nodes=model.num_nodes,
    num_neg_samples=test_index.size(1)
)

print(neg_val_index)
print("neg_val_edge_index.shape: ", neg_val_index.shape)
print(neg_test_index)
print("neg_test_edge_index.shape: ", neg_val_index.shape)


val_index.size(1):  860
test_index.size(1):  861
model.size(0):  5763
tensor([[4485, 4224, 3492,  ..., 1521, 5278, 3069],
        [4587, 1524,   11,  ..., 1206, 1551, 5461]])
neg_val_edge_index.shape:  torch.Size([2, 860])
tensor([[1019, 4078, 4482,  ..., 4131, 4606, 3181],
        [5360, 3467, 2482,  ...,  818, 5596, 5306]])
neg_test_edge_index.shape:  torch.Size([2, 860])


In [13]:
#Generate Negative Edges

import numpy as np

num_nodes = data.num_nodes

def gen_neg_edges(negative_edges_goal, edge_index, num_nodes):
    existing_edges = set([tuple(e) for e in edge_index.t().tolist()])
    neg_edges = set()
    while len(neg_edges) < negative_edges_goal:
        u = np.random.randint(0, num_nodes)
        v = np.random.randint(0, num_nodes)
        if u == v: 
            continue  # skip self-loops
        if (u,v) in existing_edges or (v,u) in existing_edges:
            continue
        neg_edges.add((u,v))
    return np.array(list(neg_edges))

val_neg = gen_neg_edges(len(val_links), data.edge_index, num_nodes)
test_neg = gen_neg_edges(len(test_links), data.edge_index, num_nodes)

# Combine positive and negative edges
val_pos = val_links[['source','target']].values
val_edges = np.vstack([val_pos, val_neg])
val_labels = np.hstack([np.ones(len(val_pos)), np.zeros(len(val_neg))])

test_pos = test_links[['source','target']].values
test_edges = np.vstack([test_pos, test_neg])
test_labels = np.hstack([np.ones(len(test_pos)), np.zeros(len(test_neg))])

In [96]:
embeddings = model()
#gets actual link predictions 
def predict_links(node_embeddings, edge_index):
    src, dst = edge_index
    score = (node_embeddings[src] * node_embeddings[dst]).sum(dim=1)  # dot product of two node embeddings
    #print(type(score))
    #print("Score: ", score)
    prob = torch.sigmoid(score) #make prob between 0 and 1
    threshold = 0.7
    pred = (prob > threshold).int()
    return pred

#gets scores from dot product of two node embeddings
def get_scores(node_embeddings, edge_index):
    src, dst = edge_index
    scores = (node_embeddings[src] * node_embeddings[dst]).sum(dim=1)  # dot product of two node embeddings
    return scores


#assemble true label tensors
val_true_labels = torch.cat([torch.ones(val_index.size(1)), torch.zeros(neg_val_index.size(1))])
test_true_labels = torch.cat([torch.ones(test_index.size(1)), torch.zeros(neg_test_index.size(1))])
print("val_true_labels.shape: " , val_true_labels.size())
print("test_true_labels.shape: ", test_true_labels.size())

#get presence of link predictions for validation and test sets
pos_val_pred = predict_links(embeddings, val_index)
neg_val_pred = predict_links(embeddings, neg_val_index)
#print(val_true_labels)
print("pos_val_pred.shape: ", pos_val_pred.shape)
print("neg_val_pred.shape: ", neg_val_pred.shape)
pos_test_pred = predict_links(embeddings, test_index)
neg_test_pred = predict_links(embeddings, neg_test_index)

#assemble predicted label tensors
val_pred_labels = torch.cat([pos_val_pred, neg_val_pred])
test_pred_labels = torch.cat([pos_test_pred, neg_test_pred])

#get score predictions for validation and test sets
pos_val_scores = get_scores(embeddings, val_index)
neg_val_scores = get_scores(embeddings, neg_val_index)

pos_test_scores = get_scores(embeddings, test_index)
neg_test_scores = get_scores(embeddings, neg_test_index)

#assemble validation and test score tensors
val_scores = torch.cat([pos_val_scores, neg_val_scores])
test_scores = torch.cat([pos_test_scores, neg_test_scores])



val_true_labels.shape:  torch.Size([1720])
test_true_labels.shape:  torch.Size([1722])
pos_val_pred.shape:  torch.Size([860])
neg_val_pred.shape:  torch.Size([860])


In [100]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
validation_accuracy = accuracy_score(val_true_labels, val_pred_labels)
test_accuracy = accuracy_score(test_true_labels, test_pred_labels)
print("Validation Accuracy: ", validation_accuracy)
print("Test Accuracy: ", test_accuracy)
print()
val_roc_auc = roc_auc_score(val_true_labels.cpu(), val_scores.detach().numpy())
test_roc_auc = roc_auc_score(test_true_labels.cpu(), test_scores.detach().numpy())
val_avg_prec = average_precision_score(val_true_labels.cpu(), val_scores.detach().numpy())
test_avg_prec = average_precision_score(test_true_labels.cpu(), test_scores.detach().numpy())
print(f"Validation ROC AUC: {val_roc_auc:.4f}, Val Avg Precision: {val_avg_prec:.4f}")
print(f"Test ROC AUC: {test_roc_auc:.4f}, Test Avg Precision: {test_avg_prec:.4f}")

Validation Accuracy:  0.702906976744186
Test Accuracy:  0.7049941927990708

Validation ROC AUC: 0.6544, Val Avg Precision: 0.7654
Test ROC AUC: 0.6642, Test Avg Precision: 0.7555


Accuracy in predicting the presence of an edge on was about 70% in the validation and test sets using the dot products of two nodee embeddings to get a score which was then passed through a sigmoid() function to get a probability between 0 and 1. Testing different thresholds, the best performance was found at a threshold of about 0.7. 

### Method 2: Alternative Approach 
Task: Choose and implement another link prediction method. ○ Description: This method should not use embeddings. You can use any approach of your choice. Compare the performance of this method with the embedding-based method.

Going to use Jaccard's Coefficient to measure local neighborhood overlap and predict links between Datasets that have ScienceKeywords that don't exist in the graph. Rather than using common neighbors which doesn't consider the size of the neighborhood set, Jaccard's coefficient normalizes for node degree; therefore, having more mutual links relative to the total number of unique links will be more influential in determining whether there is a link. <br>
For example, consider a Dataset (node A) that connects to 10 different science keywords. If you have a science keyword (node B) that is a subcategory of many other ScienceKeywords that this Dataset connects with, it is likely that the Dataset (node A) should be connected with node (node B).

https://networkx.org/documentation/stable/reference/classes/graph.html#networkx.Graph <br>
https://networkx.org/documentation/stable/reference/generated/networkx.convert_matrix.from_pandas_edgelist.html <br>
https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_prediction.jaccard_coefficient.html

In [None]:
#Non-Embedding Method for link prediction
import pandas as pd
import networkx as nx
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

#print(edge_df.head())
G = nx.from_pandas_edgelist(edge_df, source='source', target='target', edge_attr="relationship_type", create_using=nx.Graph())

#validation edges 
#print(val_links.head())

def compute_jaccard(G, edges):
    scores = []
    for u, v in edges:
        try:
            preds = list(nx.jaccard_coefficient(G, [(u, v)]))
            scores.append(preds[0][2])
        except nx.NetworkXError:
            scores.append(0.0)
    return np.array(scores)

#Use val_edges, test_edges, val_labels and test_labels from earlier negative sampling
val_scores = compute_jaccard(G, val_edges)
test_scores = compute_jaccard(G, test_edges)

print(test_scores[0])

val_auc = roc_auc_score(val_labels, val_scores)
val_precision = average_precision_score(val_labels, val_scores)
print(val_labels)
val_acc = accuracy_score(val_labels, val_scores)
test_acc = accuracy_score(test_labels, test_scores)
test_auc = roc_auc_score(test_labels, test_scores)
test_precision = average_precision_score(test_labels, test_scores)

print(f"Validation  AUC: {val_auc:.4f}, AP: {val_precision:.4f}")
print(f"Test        AUC: {test_auc:.4f}, AP: {test_precision:.4f}")

0.014705882352941176
[1. 1. 1. ... 0. 0. 0.]


ValueError: Classification metrics can't handle a mix of binary and continuous targets