# Link Prediction

In [1]:
import os
import numpy as np
import random
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import node2vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from gensim.models import KeyedVectors

## Creating the Datasets

In [2]:
country_net = nx.read_graphml("../networks/graphml/countries_network.graphml")

In [3]:
len(country_net.edges())

2035

Positive Samples

In [68]:
def hide_edges_safely(G, num_to_hide):
    hidden_edges = []
    edges = list(G.edges())
    random.shuffle(edges)
    
    G_train = G.copy()
    for u, v in edges:
        if len(hidden_edges) >= num_to_hide:
            break
        
        # Condition: Only remove if nodes still have other connections
        if G_train.degree(u) > 1 and G_train.degree(v) > 1:
            G_train.remove_edge(u, v)
            # Check if graph is still connected (for undirected)
            if nx.is_weakly_connected(G_train):
                hidden_edges.append((u, v))
            else:
                # Put it back if it disconnected the graph
                G_train.add_edge(u, v)
                
    return G_train, list(G_train.edges()), hidden_edges


In [69]:
G_train, train_pos, test_pos = hide_edges_safely(country_net, 50)

Negative Samples

In [70]:
def get_directed_negative_samples(G, num_samples):
    neg_samples = set()
    nodes = list(G.nodes())
    
    while len(neg_samples) < num_samples:
        u, v = random.sample(nodes, 2)
        # Check if the directed edge exists
        if not G.has_edge(u, v) and (u,v) not in neg_samples:
            neg_samples.add((u, v))
            
    return list(neg_samples)

In [71]:
neg_samples = get_directed_negative_samples(country_net, 2035)

In [72]:
train_neg, test_neg = train_test_split(neg_samples, test_size=50)

Final Data

In [73]:
train_data = []

for u, v in train_pos:
    train_data.append({'source': u, 'target': v, 'label': 1})

for u, v in train_neg:
    train_data.append({'source': u, 'target': v, 'label': 0})

In [74]:
random.shuffle(train_data)
train_df = pd.DataFrame(train_data, columns=["source", "target", "label"])

In [75]:
test_data = []

for u, v in test_pos:
    test_data.append({'source': u, 'target': v, 'label': 1})

for u, v in test_neg:
    test_data.append({'source': u, 'target': v, 'label': 0})

In [76]:
random.shuffle(test_data)
test_df = pd.DataFrame(test_data, columns=["source", "target", "label"])

## Node2Vec Link Prediction

In [84]:
if os.path.exists("node2vec/country_64_50_500_p0_5_q0_1.kv"):
    embeddings = KeyedVectors.load("node2vec/country_64_50_500_p0_5_q0_1.kv")
else:
    # Genereate Walks
    n2v = node2vec.Node2Vec(G_train, dimensions=64, walk_length=50, num_walks=500, p=0.5, q=0.1)
    # Embed nodes
    model = n2v.fit(window=10, min_count=1, batch_words=4)
    embeddings = model.wv
    embeddings.save(f"node2vec/country_64_50_500_p0_5_q0_1.kv")

In [85]:
class LinkPredictionDataset(Dataset):
    def __init__(self, dataframe, embeddings):
        self.labels = dataframe['label'].values
        # Pre-fetch embeddings for efficiency
        sources = [embeddings[str(u)] for u in dataframe['source']]
        targets = [embeddings[str(v)] for v in dataframe['target']]
        
        self.X = np.hstack([sources, targets]) # Concatenate u and v
        self.X = torch.tensor(self.X, dtype=torch.float32)
        self.y = torch.tensor(self.labels, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [86]:
train_dataset = LinkPredictionDataset(train_df, embeddings)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

In [87]:
class N2V_DirectedEdgePredictor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        # input_dim is the size of one node embedding (e.g., 64 or 128)
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64), # input_dim * 2 due to concatenation
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1), # Output layer size 1
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return self.net(x) # Returns logits if using BCEWithLogitsLoss


In [88]:
# Detect GPU (CUDA) or Apple Silicon (MPS), otherwise fallback to CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available(): # For Mac M1/M2/M3 users in 2025
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


In [89]:
input_dim = next(iter(embeddings)).shape[0] * 2
model = N2V_DirectedEdgePredictor(input_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [90]:
# Training Loop
for epoch in range(1, 21):
    model.train()
    total_loss = 0
    
    for batch_x, batch_y in train_loader:
        # MOVE DATA TO GPU HERE
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 5, Loss: 0.0673
Epoch 10, Loss: 0.0110
Epoch 15, Loss: 0.0062
Epoch 20, Loss: 0.0035


#### Predictions on Training Set

In [91]:
model.eval() # Set model to evaluation mode
y_true = []
y_pred_probs = []

with torch.no_grad(): # Disable gradient calculation during evaluation
    for batch_x, batch_y in train_loader:
        # Move data to the appropriate device (CPU for this part if you prefer)
        batch_x = batch_x.to(device) 
        batch_y = batch_y.to('cpu')

        outputs = model(batch_x).to('cpu')
        y_true.extend(batch_y.numpy().flatten())
        y_pred_probs.extend(outputs.numpy().flatten())

# Convert probabilities to binary predictions (e.g., threshold at 0.5)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred_probs]

In [92]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}\n")

# Standard Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")
print(f"Micro F1 Score: {f1_micro:.4f}")

# AUC (Area Under the Curve)
# AUC requires the raw probabilities (y_pred_probs)
auc_score = roc_auc_score(y_true, y_pred_probs)
print(f"AUC Score: {auc_score:.4f}")

Confusion Matrix:
[[1985    0]
 [   0 1985]]

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
Macro F1 Score: 1.0000
Micro F1 Score: 1.0000
AUC Score: 1.0000


#### Predictions on Test Set

In [93]:
test_dataset = LinkPredictionDataset(test_df, embeddings)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True, pin_memory=True)

In [94]:
model.eval() # Set model to evaluation mode
y_true = []
y_pred_probs = []

with torch.no_grad(): # Disable gradient calculation during evaluation
    for batch_x, batch_y in test_loader:
        # Move data to the appropriate device (CPU for this part if you prefer)
        batch_x = batch_x.to(device) 
        batch_y = batch_y.to('cpu')

        outputs = model(batch_x).to('cpu')
        y_true.extend(batch_y.numpy().flatten())
        y_pred_probs.extend(outputs.numpy().flatten())

# Convert probabilities to binary predictions (e.g., threshold at 0.5)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred_probs]

In [95]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}\n")

# Standard Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")
print(f"Micro F1 Score: {f1_micro:.4f}")

# AUC (Area Under the Curve)
# AUC requires the raw probabilities (y_pred_probs)
auc_score = roc_auc_score(y_true, y_pred_probs)
print(f"AUC Score: {auc_score:.4f}")

Confusion Matrix:
[[50  0]
 [ 1 49]]

Accuracy: 0.9900
Precision: 1.0000
Recall: 0.9800
Macro F1 Score: 0.9900
Micro F1 Score: 0.9900
AUC Score: 0.9996


In [96]:
def get_link_pred(source, target):
    emb_source = embeddings[source]
    emb_target = embeddings[target]
    source_target = np.hstack([emb_source, emb_target])
    target_source = np.hstack([emb_target, emb_source])
    source_target_input = torch.tensor(source_target, dtype=torch.float32).unsqueeze(0) # Add batch dimension
    target_source_input = torch.tensor(target_source, dtype=torch.float32).unsqueeze(0) # Add batch dimension
    source_target_input = source_target_input.to(device)
    target_source_input = target_source_input.to(device)
    model.eval()
    with torch.no_grad():
        return model(source_target_input).item(), model(target_source_input).item()

In [97]:
source = "China"
target = "Austria"

source_target_prob, target_source_prob = get_link_pred(source, target)

print(f"Probability of a directed edge from {source} to {target}: {source_target_prob:.4f}")
print(f"Probability of a directed edge from {target} to {source}: {target_source_prob:.4f}")

# Threshold check (standard is 0.5)
if source_target_prob >= 0.5:
    print(f"Prediction: Edge {source} -> {target} EXISTS")
else:
    print(f"Prediction: Edge {source} -> {target} DOESN'T EXIST")

if target_source_prob >= 0.5:
    print(f"Prediction: Edge {target} -> {source} EXISTS")
else:
    print(f"Prediction: Edge {target} -> {source} DOESN'T EXIST")

Probability of a directed edge from China to Austria: 0.9999
Probability of a directed edge from Austria to China: 0.0000
Prediction: Edge China -> Austria EXISTS
Prediction: Edge Austria -> China DOESN'T EXIST


In [100]:
# Define the saving path
SAVE_PATH = "node2vec/n2v_predictor_full.pth"

# Prepare the checkpoint
checkpoint = {
    'model_state_dict': model.state_dict(),
    'input_dim': 128,              # The size used in __init__ (e.g., 2 * embedding_size)
    'embeddings': embeddings, # The dictionary/matrix of Node2Vec vectors
}

torch.save(checkpoint, SAVE_PATH)
print("Node2Vec Predictor saved.")


Node2Vec Predictor saved.


## GNN Link Prediction

In [4]:
# PyTorch geometric
import torch_geometric
from torch_geometric.utils import from_networkx
from torch_geometric.nn import SAGEConv
from torch_geometric.transforms import RandomLinkSplit

#### Data Preparation

In [7]:
G = nx.DiGraph()
G.add_edges_from(country_net.edges())

for node in G.nodes():
    emb_start = np.zeros(26, dtype=np.float32)
    emb_end = np.zeros(26, dtype=np.float32)
    
    emb_start[ord(node[0].lower()) - ord('a')] = 1
    emb_end[ord(node[-1].lower()) - ord('a')] = 1

    emb = np.hstack((emb_start, emb_end))

    G.nodes[node]['embedding'] = torch.tensor(emb)

In [8]:
# Create a mapping: Name -> Index
node_to_idx = {node: i for i, node in enumerate(G.nodes())}
data = from_networkx(G, group_node_attrs=['embedding'])

In [9]:
# Random Link Split
transform = RandomLinkSplit(
    num_val=0., 
    num_test=0.25, 
    is_undirected=False, 
    add_negative_train_samples=True
)

train_data, val_data, test_data = transform(data)

In [10]:
class GNN_DirectEdgePredictor(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # --- Encoder: Two-layer GNN ---
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        
        # --- Decoder: MLP for Directed Link Prediction ---
        # input is out_channels * 2 because we concatenate source and target
        self.decoder = nn.Sequential(
            nn.Linear(out_channels * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        # edge_label_index contains pairs of nodes (u, v) to predict
        src = z[edge_label_index[0]]
        dst = z[edge_label_index[1]]
        
        # Concatenate u and v to preserve directionality
        edge_feat = torch.cat([src, dst], dim=-1)
        return self.decoder(edge_feat).view(-1)


In [49]:
device = torch.device('cpu')

in_channels = data.num_features
hidden_channels = 128
out_channels = 64
model = GNN_DirectEdgePredictor(in_channels=data.num_features, hidden_channels=128, out_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.BCELoss()

In [53]:
# Training Loop
for epoch in range(1, 51):
    model.train()
    optimizer.zero_grad()
    
    # Generate node embeddings (using message-passing edges)
    z = model.encode(train_data.x.to(device), train_data.edge_index.to(device))
    
    # Predict labels for specific node pairs (supervision edges)
    out = model.decode(z, train_data.edge_label_index.to(device))
    
    loss = criterion(out, train_data.edge_label.to(device))
    loss.backward()
    optimizer.step()

    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()/len(train_data.edge_label):.4f}")

Epoch 5, Loss: 0.0001
Epoch 10, Loss: 0.0001
Epoch 15, Loss: 0.0000
Epoch 20, Loss: 0.0000
Epoch 25, Loss: 0.0000
Epoch 30, Loss: 0.0000
Epoch 35, Loss: 0.0000
Epoch 40, Loss: 0.0000
Epoch 45, Loss: 0.0000
Epoch 50, Loss: 0.0000


#### Predictions on Train Set

In [54]:
model.eval()
with torch.no_grad():
    z = model.encode(train_data.x.to(device), train_data.edge_index.to(device))
    out = model.decode(z, train_data.edge_label_index.to(device))

y_true = train_data.edge_label.cpu().numpy()
y_pred_probs = out.cpu().numpy()
y_pred = (y_pred_probs > 0.5).astype(int)

In [55]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}\n")

# Standard Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")
print(f"Micro F1 Score: {f1_micro:.4f}")

# AUC (Area Under the Curve)
# AUC requires the raw probabilities (y_pred_probs)
auc_score = roc_auc_score(y_true, y_pred_probs)
print(f"AUC Score: {auc_score:.4f}")

Confusion Matrix:
[[1527    0]
 [   0 1527]]

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
Macro F1 Score: 1.0000
Micro F1 Score: 1.0000
AUC Score: 1.0000


#### Predictions on Test Set

In [56]:
model.eval()
with torch.no_grad():
    z = model.encode(test_data.x.to(device), test_data.edge_index.to(device))
    out = model.decode(z, test_data.edge_label_index.to(device))

y_true = test_data.edge_label.cpu().numpy()
y_pred_probs = out.cpu().numpy()
y_pred = (y_pred_probs > 0.5).astype(int)

In [57]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}\n")

# Standard Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")
print(f"Micro F1 Score: {f1_micro:.4f}")

# AUC (Area Under the Curve)
# AUC requires the raw probabilities (y_pred_probs)
auc_score = roc_auc_score(y_true, y_pred_probs)
print(f"AUC Score: {auc_score:.4f}")

Confusion Matrix:
[[493  15]
 [  3 505]]

Accuracy: 0.9823
Precision: 0.9712
Recall: 0.9941
Macro F1 Score: 0.9823
Micro F1 Score: 0.9823
AUC Score: 0.9981


In [64]:
def gnn_predict_directed_edge(source_country, target_country, device='cpu'):
    """
    Predicts the probability of a directed edge from source_country to target_country.
    """
    model.eval()
    
    with torch.no_grad():
        # 1. Map country names to integer indices
        try:
            u_idx = node_to_idx[source_country]
            v_idx = node_to_idx[target_country]
        except KeyError as e:
            return f"Error: Country {e} not found in the graph."

        # 2. Generate current node embeddings using the GNN
        # We use the full graph's features and message-passing edges
        z = model.encode(data.x.to(device), data.edge_index.to(device))
        
        # 3. Extract specific embeddings for u and v
        z_u = z[u_idx].unsqueeze(0) # Shape [1, embedding_dim]
        z_v = z[v_idx].unsqueeze(0)
        
        # 4. Decode and Convert Logits to Probability (0 to 1)
        source_target_prob = model.decode([z_u, z_v], (0,1)).item()
        target_source_prob = model.decode([z_v, z_u], (0,1)).item()
        
        # 6. Return the probability of the edge
        return source_target_prob, target_source_prob


In [65]:
source = "China"
target = "Austria"

source_target_prob, target_source_prob = gnn_predict_directed_edge(source, target)

print(f"Probability of a directed edge from {source} to {target}: {source_target_prob:.4f}")
print(f"Probability of a directed edge from {target} to {source}: {target_source_prob:.4f}")

# Threshold check (standard is 0.5)
if source_target_prob >= 0.5:
    print(f"Prediction: Edge {source} -> {target} EXISTS")
else:
    print(f"Prediction: Edge {source} -> {target} DOESN'T EXIST")

if target_source_prob >= 0.5:
    print(f"Prediction: Edge {target} -> {source} EXISTS")
else:
    print(f"Prediction: Edge {target} -> {source} DOESN'T EXIST")

Probability of a directed edge from China to Austria: 1.0000
Probability of a directed edge from Austria to China: 0.0000
Prediction: Edge China -> Austria EXISTS
Prediction: Edge Austria -> China DOESN'T EXIST


In [66]:
SAVE_PATH = "gnn_model/gnn_direct_edge_predictor.pth"

checkpoint = {
    'model_state_dict': model.state_dict(),
    'in_channels': 52,       # Replace with your actual value
    'hidden_channels': 128,    # Replace with your actual value
    'out_channels': 64,       # Replace with your actual value
    'node_to_idx': node_to_idx # Highly recommended: save your string-to-ID mapping
}

# 3. Save to disk
torch.save(checkpoint, SAVE_PATH)
print(f"Model saved to {SAVE_PATH}")

Model saved to gnn_model/gnn_direct_edge_predictor.pth


---