# Link Prediction

In [50]:
import numpy as np
import random
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import node2vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

## Creating the Datasets

In [2]:
# Load the country network from its GraphML export (path is relative to the notebook's working directory).
country_net = nx.read_graphml("../networks/graphml/countries_network.graphml")

In [3]:
# Total number of edges in the loaded network.
len(country_net.edges())

2035

Positive Samples

In [4]:
def hide_edges_safely(G, num_to_hide):
    """Hold out up to ``num_to_hide`` edges without isolating nodes or splitting the graph.

    Edges are visited in random order. An edge is hidden only if both of its
    endpoints keep at least one other incident edge and the graph stays
    connected (weakly connected for directed graphs) once it is removed.
    Fewer than ``num_to_hide`` edges are returned when not enough edges
    satisfy these conditions.

    Args:
        G: A networkx graph (directed or undirected). It is not modified.
        num_to_hide: Maximum number of edges to hold out.

    Returns:
        A tuple ``(G_train, train_edges, hidden_edges)``: the copy of ``G``
        with the hidden edges removed, the list of edges remaining in that
        copy, and the list of hidden ``(u, v)`` edges.
    """
    candidate_edges = list(G.edges())
    random.shuffle(candidate_edges)

    # nx.is_weakly_connected is only implemented for directed graphs;
    # undirected graphs need nx.is_connected instead.
    still_connected = nx.is_weakly_connected if G.is_directed() else nx.is_connected

    G_train = G.copy()
    hidden_edges = []
    for u, v in candidate_edges:
        if len(hidden_edges) >= num_to_hide:
            break

        # Only consider edges whose endpoints keep another connection.
        if G_train.degree(u) > 1 and G_train.degree(v) > 1:
            # Remember the edge attributes so that a rolled-back removal
            # restores the edge exactly; a bare add_edge would drop them.
            # Multigraphs keep the original attribute-less restore.
            if G_train.is_multigraph():
                edge_attrs = {}
            else:
                edge_attrs = dict(G_train.edges[u, v])

            G_train.remove_edge(u, v)
            if still_connected(G_train):
                hidden_edges.append((u, v))
            else:
                # Removing this edge disconnected the graph: put it back.
                G_train.add_edge(u, v)
                if edge_attrs:
                    G_train.edges[u, v].update(edge_attrs)

    return G_train, list(G_train.edges()), hidden_edges


In [5]:
# Hold out up to 50 edges as positive test samples; every remaining edge is a positive training sample.
# NOTE(review): no random seed is set in the visible cells, so the held-out edges presumably differ between runs.
G_train, train_pos, test_pos = hide_edges_safely(country_net, 50)

Negative Samples

In [62]:
def get_directed_negative_samples(G, num_samples):
    """Sample distinct ordered pairs ``(u, v)``, ``u != v``, that are not edges of ``G``.

    Pairs are first drawn by rejection sampling. If that does not fill the
    quota within a bounded number of draws (dense graph, or a request close to
    the number of available non-edges), the remaining non-edges are enumerated
    and the shortfall is sampled from them, so the function always terminates.

    Args:
        G: Graph exposing ``nodes()`` and ``has_edge(u, v)``.
        num_samples: Number of negative pairs to return.

    Returns:
        A list of ``num_samples`` unique ``(u, v)`` tuples; empty when
        ``num_samples`` is not positive.

    Raises:
        ValueError: If ``G`` has fewer than two nodes, or fewer non-edge
            pairs than ``num_samples``.
    """
    if num_samples <= 0:
        return []

    nodes = list(G.nodes())
    if len(nodes) < 2:
        raise ValueError("At least two nodes are required to sample node pairs")

    neg_samples = set()

    # Bounded rejection sampling: cheap when non-edges are plentiful.
    max_draws = max(1000, 20 * num_samples)
    for _ in range(max_draws):
        if len(neg_samples) >= num_samples:
            break
        u, v = random.sample(nodes, 2)
        # The set already de-duplicates, so only the edge test is needed.
        if not G.has_edge(u, v):
            neg_samples.add((u, v))

    shortfall = num_samples - len(neg_samples)
    if shortfall > 0:
        # Exhaustive fallback: guarantees termination and detects requests
        # that exceed the number of non-edges in the graph.
        remaining = [
            (u, v)
            for u in nodes
            for v in nodes
            if u != v and not G.has_edge(u, v) and (u, v) not in neg_samples
        ]
        if shortfall > len(remaining):
            raise ValueError(
                f"Requested {num_samples} negative samples but only "
                f"{len(neg_samples) + len(remaining)} non-edges exist"
            )
        neg_samples.update(random.sample(remaining, shortfall))

    return list(neg_samples)

In [63]:
# Draw 2035 non-edges from the full network, i.e. as many negatives as the edge count printed above.
neg_samples = get_directed_negative_samples(country_net, 2035)

In [64]:
# Reserve 50 negatives for testing, mirroring the 50 positives requested from hide_edges_safely.
# NOTE(review): hide_edges_safely may return fewer than 50 edges; len(test_pos) would keep the test set balanced — confirm.
# NOTE(review): no random_state is passed, so the split is not reproducible across runs.
train_neg, test_neg = train_test_split(neg_samples, test_size=50)

Final Data

In [66]:
# Labelled training rows: remaining edges are positives (1), sampled non-edges are negatives (0).
train_data = [{'source': u, 'target': v, 'label': 1} for u, v in train_pos]
train_data.extend({'source': u, 'target': v, 'label': 0} for u, v in train_neg)

In [67]:
# Shuffle in place so positives and negatives are interleaved, then tabulate the rows.
random.shuffle(train_data)
train_df = pd.DataFrame(train_data, columns=["source", "target", "label"])

In [68]:
# Labelled test rows: held-out edges are positives (1), reserved non-edges are negatives (0).
test_data = [{'source': u, 'target': v, 'label': 1} for u, v in test_pos]
test_data.extend({'source': u, 'target': v, 'label': 0} for u, v in test_neg)

In [69]:
# Shuffle in place so positives and negatives are interleaved, then tabulate the rows.
random.shuffle(test_data)
test_df = pd.DataFrame(test_data, columns=["source", "target", "label"])

## Node2Vec Link Prediction

In [70]:
# Generate walks on G_train only, so the held-out test edges do not influence the embeddings.
n2v = node2vec.Node2Vec(G_train, dimensions=64, walk_length=50, num_walks=500, p=0.5, q=0.1)
# Embed nodes
# NOTE: `model` is rebound to the PyTorch classifier in a later cell; `embeddings` must be
# extracted from this object before that cell runs.
model = n2v.fit(window=10, min_count=1, batch_words=4)

Computing transition probabilities:   0%|          | 0/195 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 500/500 [00:36<00:00, 13.81it/s]


In [71]:
# Keep only the vector lookup of the fitted Node2Vec model; it is indexed by node key below.
embeddings = model.wv

In [72]:
class LinkPredictionDataset(Dataset):
    """Edge-classification dataset built from node-pair rows and an embedding lookup.

    Each row of ``dataframe`` supplies a ``source`` node, a ``target`` node and
    a ``label``. A row's feature vector is the source embedding followed by the
    target embedding; embeddings are looked up by ``str(node)``.

    Attributes set by ``__init__``:
        labels: The raw ``label`` column values.
        X: float32 tensor holding one concatenated feature row per sample.
        y: float32 tensor of labels reshaped to a single column.
    """

    def __init__(self, dataframe, embeddings):
        self.labels = dataframe['label'].values

        # Resolve every endpoint to its vector once, up front, so that
        # __getitem__ reduces to plain tensor indexing.
        src_vectors = [embeddings[str(node)] for node in dataframe['source']]
        dst_vectors = [embeddings[str(node)] for node in dataframe['target']]
        pair_features = np.hstack([src_vectors, dst_vectors])

        self.X = torch.tensor(pair_features, dtype=torch.float32)
        self.y = torch.tensor(self.labels, dtype=torch.float32).reshape(-1, 1)

    def __len__(self):
        """Return the number of labelled node pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        features, target = self.X[idx], self.y[idx]
        return features, target

In [73]:
# Wrap the training rows as tensors and iterate over them in shuffled mini-batches of 32.
train_dataset = LinkPredictionDataset(train_df, embeddings)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

In [74]:
class N2V_DirectedEdgePredictor(nn.Module):
    """Small MLP that scores a directed node pair from its feature vector.

    ``input_dim`` is the width of the vector fed to the first linear layer
    (the notebook passes twice the embedding size, because source and target
    embeddings are concatenated). The stack ends in a sigmoid, so ``forward``
    returns values between 0 and 1 rather than logits; pair it with
    ``nn.BCELoss``, not ``nn.BCEWithLogitsLoss``.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),  # single output unit per pair
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one sigmoid-activated score per input row."""
        scores = self.net(x)
        return scores


In [75]:
# Pick the best available backend: CUDA first, then Apple-Silicon MPS, then CPU.
if torch.cuda.is_available():
    _backend = "cuda"
elif torch.backends.mps.is_available():
    _backend = "mps"
else:
    _backend = "cpu"
device = torch.device(_backend)

print(f"Using device: {device}")

Using device: cuda


In [76]:
# Classifier input width is twice the embedding size because source and target vectors are concatenated.
# Presumably next(iter(embeddings)) yields one embedding vector, whose length is the embedding size — confirm.
input_dim = next(iter(embeddings)).shape[0] * 2
# NOTE: `model` previously held the fitted Node2Vec model; from here on it is the PyTorch classifier.
model = N2V_DirectedEdgePredictor(input_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# BCELoss expects probabilities, which the network's final Sigmoid layer provides.
criterion = nn.BCELoss()

In [77]:
# Training loop: 20 epochs of mini-batch Adam updates on the binary cross-entropy loss.
for epoch in range(1, 21):
    model.train()  # training mode, so the Dropout layer is active
    total_loss = 0
    
    for batch_x, batch_y in train_loader:
        # Move the batch to the same device as the model
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        
        # Standard step: reset gradients, forward pass, loss, backward pass, parameter update
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        # Accumulate the per-batch loss as a Python float
        total_loss += loss.item()

    # Report the mean per-batch loss every fifth epoch
    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 5, Loss: 0.0630
Epoch 10, Loss: 0.0081
Epoch 15, Loss: 0.0044
Epoch 20, Loss: 0.0028


#### Predictions on Training Set

In [78]:
# In-sample evaluation: this iterates train_loader, i.e. the same data the model was fitted on.
model.eval() # Set model to evaluation mode
y_true = []
y_pred_probs = []

with torch.no_grad(): # Disable gradient calculation during evaluation
    for batch_x, batch_y in train_loader:
        # Inputs go to the model's device; labels are kept on the CPU for NumPy conversion
        batch_x = batch_x.to(device) 
        batch_y = batch_y.to('cpu')

        # Bring the predictions back to the CPU before converting to NumPy
        outputs = model(batch_x).to('cpu')
        # Labels and predictions are collected batch by batch, so they stay aligned despite shuffling
        y_true.extend(batch_y.numpy().flatten())
        y_pred_probs.extend(outputs.numpy().flatten())

# Convert probabilities to binary predictions (e.g., threshold at 0.5)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred_probs]

In [79]:
# Metrics for the predictions collected in the previous cell (y_true, y_pred, y_pred_probs).
# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}\n")

# Standard Metrics (computed on the thresholded predictions)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")
print(f"Micro F1 Score: {f1_micro:.4f}")

# AUC (Area Under the Curve)
# AUC is computed from the raw probabilities (y_pred_probs), not the thresholded labels
auc_score = roc_auc_score(y_true, y_pred_probs)
print(f"AUC Score: {auc_score:.4f}")

Confusion Matrix:
[[1985    0]
 [   1 1984]]

Accuracy: 0.9997
Precision: 1.0000
Recall: 0.9995
Macro F1 Score: 0.9997
Micro F1 Score: 0.9997
AUC Score: 1.0000


#### Predictions on Test Set

In [80]:
# Wrap the held-out rows as tensors, using the same embeddings as for training.
test_dataset = LinkPredictionDataset(test_df, embeddings)
# shuffle=True only changes batch order; the evaluation loop collects labels and predictions together.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True, pin_memory=True)

In [81]:
# Held-out evaluation: this iterates test_loader; y_true / y_pred_probs / y_pred are overwritten here.
model.eval() # Set model to evaluation mode
y_true = []
y_pred_probs = []

with torch.no_grad(): # Disable gradient calculation during evaluation
    for batch_x, batch_y in test_loader:
        # Inputs go to the model's device; labels are kept on the CPU for NumPy conversion
        batch_x = batch_x.to(device) 
        batch_y = batch_y.to('cpu')

        # Bring the predictions back to the CPU before converting to NumPy
        outputs = model(batch_x).to('cpu')
        # Labels and predictions are collected batch by batch, so they stay aligned despite shuffling
        y_true.extend(batch_y.numpy().flatten())
        y_pred_probs.extend(outputs.numpy().flatten())

# Convert probabilities to binary predictions (e.g., threshold at 0.5)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred_probs]

In [82]:
# Metrics for the test-set predictions collected in the previous cell (y_true, y_pred, y_pred_probs).
# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}\n")

# Standard Metrics (computed on the thresholded predictions)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")
print(f"Micro F1 Score: {f1_micro:.4f}")

# AUC (Area Under the Curve)
# AUC is computed from the raw probabilities (y_pred_probs), not the thresholded labels
auc_score = roc_auc_score(y_true, y_pred_probs)
print(f"AUC Score: {auc_score:.4f}")

Confusion Matrix:
[[50  0]
 [ 0 50]]

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
Macro F1 Score: 1.0000
Micro F1 Score: 1.0000
AUC Score: 1.0000


In [92]:
def get_link_pred(source, target):
    """Score both directions of a candidate edge between two nodes.

    Relies on the notebook-level ``embeddings``, ``model`` and ``device``.
    The model is switched to evaluation mode as a side effect.

    Args:
        source: Node identifier; looked up in ``embeddings`` as ``str(source)``.
        target: Node identifier; looked up in ``embeddings`` as ``str(target)``.

    Returns:
        Tuple ``(source_to_target, target_to_source)`` of floats: the model
        output for the ``[source, target]`` feature vector and for the
        ``[target, source]`` feature vector.
    """
    # Key by str(...) exactly as LinkPredictionDataset does, so that a
    # non-string node id is never interpreted differently at inference time
    # than it was when the training features were built.
    emb_source = embeddings[str(source)]
    emb_target = embeddings[str(target)]

    # One two-row batch: row 0 is source -> target, row 1 is target -> source.
    both_directions = np.stack([
        np.hstack([emb_source, emb_target]),
        np.hstack([emb_target, emb_source]),
    ])
    pair_batch = torch.tensor(both_directions, dtype=torch.float32).to(device)

    model.eval()
    with torch.no_grad():
        source_to_target, target_to_source = model(pair_batch).flatten().tolist()
    return source_to_target, target_to_source

In [97]:
# Example query: score both directions between two countries and report the decisions.
source = "China"
target = "Austria"

source_target_prob, target_source_prob = get_link_pred(source, target)

print(f"Probability of a directed edge from {source} to {target}: {source_target_prob:.4f}")
print(f"Probability of a directed edge from {target} to {source}: {target_source_prob:.4f}")

# Threshold check (standard is 0.5), applied to each direction in turn
for tail, head, prob in (
    (source, target, source_target_prob),
    (target, source, target_source_prob),
):
    verdict = "EXISTS" if prob >= 0.5 else "DOESN'T EXIST"
    print(f"Prediction: Edge {tail} -> {head} {verdict}")

Probability of a directed edge from China to Austria: 0.9998
Probability of a directed edge from Austria to China: 0.0000
Prediction: Edge China -> Austria EXISTS
Prediction: Edge Austria -> China DOESN'T EXIST


## GNN Link Prediction

In [98]:
# PyTorch geometric
import torch_geometric
import torch_geometric.data as geom_data
import torch_geometric.nn as geom_nn

ModuleNotFoundError: No module named 'torch_geometric'