# Preparing your data to work with PyG

In [1]:
import torch
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
from torch_geometric import utils

  from .autonotebook import tqdm as notebook_tqdm


## NX Social Graph

In [3]:
import networkx as nx

# Create a simple social graph
social_graph = nx.Graph()

# Add nodes (people)
social_graph.add_nodes_from(['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'])

# Add edges (friendships)
social_graph.add_edge('Alice', 'Bob')
social_graph.add_edge('Alice', 'Charlie')
social_graph.add_edge('Bob', 'Charlie')
social_graph.add_edge('Bob', 'Diana')
social_graph.add_edge('Charlie', 'Eve')
social_graph.add_edge('Diana', 'Eve')

print(f"Number of nodes: {social_graph.number_of_nodes()}")
print(f"Number of edges: {social_graph.number_of_edges()}")
print(f"Nodes: {list(social_graph.nodes())}")
print(f"Edges: {list(social_graph.edges())}")


Number of nodes: 5
Number of edges: 6
Nodes: ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve']
Edges: [('Alice', 'Bob'), ('Alice', 'Charlie'), ('Bob', 'Charlie'), ('Bob', 'Diana'), ('Charlie', 'Eve'), ('Diana', 'Eve')]


## From NX object to PyG object

In [5]:
data = utils.from_networkx(social_graph)
data

Data(edge_index=[2, 12], num_nodes=5)

## From raw data to PyG object

In [11]:
social_graph = nx.read_edgelist('edge_list2.txt')
print(social_graph)
list_of_nodes = list(set(list(social_graph)))
indices_of_nodes = [list_of_nodes.index(x)\
 for x in list_of_nodes]
print(indices_of_nodes)
node_to_index = dict(zip(list_of_nodes, indices_of_nodes))
index_to_node = dict(zip(indices_of_nodes, list_of_nodes))
print(node_to_index)
print(index_to_node)

Graph with 7 nodes and 10 edges
[0, 1, 2, 3, 4, 5, 6]
{'Bob': 0, 'Eve': 1, 'Charlie': 2, 'Diana': 3, 'Frank': 4, 'Alice': 5, 'Gina': 6}
{0: 'Bob', 1: 'Eve', 2: 'Charlie', 3: 'Diana', 4: 'Frank', 5: 'Alice', 6: 'Gina'}


In [17]:
list_edges = nx.convert.to_edgelist(social_graph)
list_edges = list(list_edges)
named_edge_list_0 = [x[0] for x in list_edges]
named_edge_list_1 = [x[1] for x in list_edges]

indexed_edge_list_0 = [node_to_index[x]\
 for x in named_edge_list_0]
indexed_edge_list_1 = [node_to_index[x] for x in named_edge_list_1]
print(indexed_edge_list_0)
print(indexed_edge_list_1)
x = torch.FloatTensor([[1] for x in range(len(list_of_nodes))])
# Create labels for binary classification: 4 nodes labeled as 1, 3 nodes labeled as 0
y = torch.FloatTensor([1]*4 + [0]*3)
y = y.long()
print(x)
print(y)


[5, 5, 5, 0, 0, 0, 2, 3, 3, 1]
[0, 2, 4, 2, 3, 4, 1, 1, 6, 6]
tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]])
tensor([1, 1, 1, 1, 0, 0, 0])


Prepare for training and testing 

In [21]:
edge_index = torch.tensor([indexed_edge_list_0,\
 indexed_edge_list_1])

# Calculate train size (80% of nodes)
train_size = int(0.8 * len(list_of_nodes))
# Test size is the remainder to ensure complementary masks
test_size = len(list_of_nodes) - train_size

train_mask = torch.zeros(len(list_of_nodes), dtype=torch.bool)
train_mask[:train_size] = True  # First 80% for training

test_mask = torch.zeros(len(list_of_nodes), dtype=torch.bool)
test_mask[train_size:] = True  # Remaining 20% for testing

data = Data(x=x, y=y, edge_index=edge_index,\
 train_mask=train_mask, test_mask=test_mask)
print(data)
print(data.x)
print(data.y)
print(data.edge_index)
print(data.train_mask)
print(data.test_mask)
print(f"\nTrain nodes: {train_mask.sum().item()}, Test nodes: {test_mask.sum().item()}")
print(f"Masks are complementary: {(train_mask & test_mask).sum().item() == 0}")



Data(x=[7, 1], edge_index=[2, 10], y=[7], train_mask=[7], test_mask=[7])
tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]])
tensor([1, 1, 1, 1, 0, 0, 0])
tensor([[5, 5, 5, 0, 0, 0, 2, 3, 3, 1],
        [0, 2, 4, 2, 3, 4, 1, 1, 6, 6]])
tensor([ True,  True,  True,  True,  True, False, False])
tensor([False, False, False, False, False,  True,  True])

Train nodes: 5, Test nodes: 2
Masks are complementary: True


## Link Prediction with Train/Test Masks


In [None]:
import random
from torch_geometric.utils import negative_sampling

# For link prediction, we need to split edges (not nodes) into train/test
# Get all edges from the graph
num_edges = data.edge_index.size(1)
print(f"Total edges in graph: {num_edges}")

# Split edges into train (80%) and test (20%)
num_train_edges = int(0.8 * num_edges)
num_test_edges = num_edges - num_train_edges

# Shuffle edge indices
edge_indices = torch.randperm(num_edges)

# Split into train and test edge indices
train_edge_indices = edge_indices[:num_train_edges]
test_edge_indices = edge_indices[num_train_edges:]

# Create train and test edge sets
train_edges = data.edge_index[:, train_edge_indices]
test_edges = data.edge_index[:, test_edge_indices]

print(f"\nTrain edges: {train_edges.size(1)}")
print(f"Test edges: {test_edges.size(1)}")

# Create positive edge labels (1 for existing edges)
train_pos_labels = torch.ones(train_edges.size(1), dtype=torch.long)
test_pos_labels = torch.ones(test_edges.size(1), dtype=torch.long)

# Generate negative edges (non-existent edges) for training and testing
# Negative sampling ensures we don't sample edges that already exist
train_neg_edges = negative_sampling(
    edge_index=train_edges,
    num_nodes=data.num_nodes,
    num_neg_samples=train_edges.size(1)  # Same number as positive edges
)

test_neg_edges = negative_sampling(
    edge_index=data.edge_index,  # Use full graph to avoid sampling test edges
    num_nodes=data.num_nodes,
    num_neg_samples=test_edges.size(1)  # Same number as positive edges
)

# Create negative edge labels (0 for non-existing edges)
train_neg_labels = torch.zeros(train_neg_edges.size(1), dtype=torch.long)
test_neg_labels = torch.zeros(test_neg_edges.size(1), dtype=torch.long)

# Combine positive and negative edges for training
train_edge_label_index = torch.cat([train_edges, train_neg_edges], dim=1)
train_edge_label = torch.cat([train_pos_labels, train_neg_labels], dim=0)

# Combine positive and negative edges for testing
test_edge_label_index = torch.cat([test_edges, test_neg_edges], dim=1)
test_edge_label = torch.cat([test_pos_labels, test_neg_labels], dim=0)

# Create masks for link prediction
# For link prediction, we use edge_label_index and edge_label instead of node masks
link_data = Data(
    x=data.x,
    edge_index=train_edges,  # Only training edges are visible during training
    train_edge_label_index=train_edge_label_index,
    train_edge_label=train_edge_label,
    test_edge_label_index=test_edge_label_index,
    test_edge_label=test_edge_label
)

print(f"\nLink Prediction Data:")
print(f"  Node features: {link_data.x.shape}")
print(f"  Training edges (visible): {link_data.edge_index.shape[1]}")
print(f"  Train edge pairs (pos + neg): {link_data.train_edge_label_index.shape[1]}")
print(f"  Test edge pairs (pos + neg): {link_data.test_edge_label_index.shape[1]}")
print(f"\nTrain positive edges: {train_pos_labels.sum().item()}")
print(f"Train negative edges: {train_neg_labels.sum().item()}")
print(f"Test positive edges: {test_pos_labels.sum().item()}")
print(f"Test negative edges: {test_neg_labels.sum().item()}")

# Display some examples
print(f"\nExample train positive edges (first 3):")
print(train_edges[:, :3].t())
print(f"\nExample train negative edges (first 3):")
print(train_neg_edges[:, :3].t())


### Using Link Prediction Data

The `link_data` object now contains:
- `edge_index`: Only training edges (visible during training)
- `train_edge_label_index`: Edge pairs to predict during training (pos + neg)
- `train_edge_label`: Labels for training edges (1=exists, 0=doesn't exist)
- `test_edge_label_index`: Edge pairs to predict during testing (pos + neg)
- `test_edge_label`: Labels for test edges (1=exists, 0=doesn't exist)


In [None]:
# Example: Simple link prediction model structure
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

class LinkPredictor(torch.nn.Module):
    """
    Simple link prediction model:
    1. GCN encoder to get node embeddings
    2. Dot product decoder to predict edge existence
    """
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(LinkPredictor, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
    
    def encode(self, x, edge_index):
        """Encode nodes into embeddings"""
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x
    
    def decode(self, z, edge_label_index):
        """Predict edge existence from node embeddings"""
        # Get embeddings for source and target nodes
        src = z[edge_label_index[0]]
        dst = z[edge_label_index[1]]
        # Dot product as edge score
        return (src * dst).sum(dim=1)
    
    def forward(self, x, edge_index, edge_label_index):
        z = self.encode(x, edge_index)
        return self.decode(z, edge_label_index)

# Initialize model
model = LinkPredictor(in_channels=1, hidden_channels=16, out_channels=8)
print("Link Prediction Model:")
print(model)

# Example forward pass for training
train_pred = model(link_data.x, link_data.edge_index, link_data.train_edge_label_index)
print(f"\nTraining predictions shape: {train_pred.shape}")
print(f"Training labels shape: {link_data.train_edge_label.shape}")

# Example forward pass for testing
test_pred = model(link_data.x, link_data.edge_index, link_data.test_edge_label_index)
print(f"Test predictions shape: {test_pred.shape}")
print(f"Test labels shape: {link_data.test_edge_label.shape}")

# Example loss calculation (binary cross entropy)
criterion = torch.nn.BCEWithLogitsLoss()
train_loss = criterion(train_pred, link_data.train_edge_label.float())
print(f"\nExample training loss: {train_loss.item():.4f}")
