<a href="https://colab.research.google.com/github/jscszn/gnn/blob/main/gnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random

# Fixed seed so that "Restart Kernel -> Run All" regenerates the exact same
# synthetic network instead of a new random one on every execution.
RANDOM_SEED = 42
# Dedicated generator: keeps the draws reproducible without touching the
# state of the module-level `random` functions.
rng = random.Random(RANDOM_SEED)

individuals = [f'user_{i}' for i in range(20)]
print(f"Generated {len(individuals)} individuals: {individuals[:5]}...")

possible_interests = ['sports', 'music', 'technology', 'art', 'reading', 'gaming', 'travel', 'cooking', 'movies', 'photography']
print(f"Possible interests: {possible_interests}")

# Every user receives between 2 and 4 distinct interests (sampled without
# replacement from `possible_interests`).
individual_interests = {}
for individual in individuals:
    num_interests = rng.randint(2, 4)
    individual_interests[individual] = rng.sample(possible_interests, num_interests)

print("\nSample of individual interests:")
for user, interests in list(individual_interests.items())[:3]:
    print(f"{user}: {interests}")

# Two users are connected when they share at least this many interests.
shared_interest_threshold = 2

# Build each interest set once instead of once per compared pair.
interest_sets = {user: set(interests) for user, interests in individual_interests.items()}

# Each unordered pair is visited exactly once (user1 precedes user2 in
# `individuals`), so the list holds one tuple per undirected friendship.
friendship_connections = []
for i, user1 in enumerate(individuals):
    for user2 in individuals[i + 1:]:
        shared_interests = interest_sets[user1] & interest_sets[user2]
        if len(shared_interests) >= shared_interest_threshold:
            friendship_connections.append((user1, user2))

print(f"\nTotal friendship connections generated: {len(friendship_connections)}")
print(f"Sample of friendship connections: {friendship_connections[:5]}...")


Generated 20 individuals: ['user_0', 'user_1', 'user_2', 'user_3', 'user_4']...
Possible interests: ['sports', 'music', 'technology', 'art', 'reading', 'gaming', 'travel', 'cooking', 'movies', 'photography']

Sample of individual interests:
user_0: ['music', 'art']
user_1: ['travel', 'reading']
user_2: ['sports', 'art']

Total friendship connections generated: 38
Sample of friendship connections: [('user_0', 'user_3'), ('user_0', 'user_5'), ('user_0', 'user_14'), ('user_2', 'user_18'), ('user_3', 'user_4')]...


In [None]:
pip install torch_geometric torch_scatter



In [None]:
import torch
from torch_geometric.data import Data

# Integer node id for every user, assigned in the order of `individuals`.
user_to_id = dict(zip(individuals, range(len(individuals))))
print(f"Generated user_to_id mapping. Sample: {list(user_to_id.items())[:3]}...")

# Alphabetical interest order fixes which feature column means what.
all_possible_interests_sorted = sorted(possible_interests)

# One multi-hot row per user: 1 where the user has the interest, else 0.
node_features = [
    [int(interest in individual_interests[person]) for interest in all_possible_interests_sorted]
    for person in individuals
]

x = torch.tensor(node_features, dtype=torch.float)
print(f"\nGenerated node features (x) with shape: {x.shape}")
print(f"Sample of node features for user_0: {x[user_to_id['user_0']]}")


# Friendships expressed as (source id, target id) pairs.
edges = [(user_to_id[name_a], user_to_id[name_b]) for name_a, name_b in friendship_connections]

# Store both directions of every friendship; the set removes duplicates.
undirected_edges = set()
for src, dst in edges:
    undirected_edges.update(((src, dst), (dst, src)))

# Row 0 holds the sources and row 1 the targets of each directed edge.
edge_list_sources = [src for src, _ in undirected_edges]
edge_list_targets = [dst for _, dst in undirected_edges]
edge_index = torch.tensor([edge_list_sources, edge_list_targets], dtype=torch.long)

print(f"\nGenerated edge_index with shape: {edge_index.shape}")
print(f"Sample of edge_index (first 5 connections):\n{edge_index[:, :5]}")

# Bundle node features and connectivity into one graph object.
data_graph = Data(x=x, edge_index=edge_index)
print(f"\nCreated PyTorch Geometric Data object:\n{data_graph}")
print(f"Number of nodes: {data_graph.num_nodes}")
print(f"Number of edges: {data_graph.num_edges}")
print(f"Number of features per node: {data_graph.num_node_features}")

Generated user_to_id mapping. Sample: [('user_0', 0), ('user_1', 1), ('user_2', 2)]...

Generated node features (x) with shape: torch.Size([20, 10])
Sample of node features for user_0: tensor([1., 0., 0., 0., 1., 0., 0., 0., 0., 0.])

Generated edge_index with shape: torch.Size([2, 76])
Sample of edge_index (first 5 connections):
tensor([[ 6,  3, 14,  4,  3],
        [12,  4,  4,  3,  7]])

Created PyTorch Geometric Data object:
Data(x=[20, 10], edge_index=[2, 76])
Number of nodes: 20
Number of edges: 76
Number of features per node: 10


In [None]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv

class GNNLinkPredictor(nn.Module):
    """Two-layer GCN encoder paired with a dot-product edge decoder.

    `forward` turns node features into node embeddings; `decode_edge`
    scores candidate node pairs from those embeddings and returns
    probabilities (the sigmoid is applied inside the decoder).
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two graph-convolution layers.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Output size of the first convolution.
            out_channels: Size of the final node embedding.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return node embeddings: conv1 -> ReLU -> conv2."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode_edge(self, z, edge_label_index):
        """Return a link probability for every candidate pair.

        Args:
            z: Node embeddings, indexed by node id along the first axis.
            edge_label_index: Row 0 holds source node ids, row 1 holds
                destination node ids of the pairs to score.

        Returns:
            Sigmoid of the dot product between the two endpoint
            embeddings, one value per pair.
        """
        src_nodes, dst_nodes = edge_label_index[0], edge_label_index[1]
        # Dot product of the endpoint embeddings, taken row by row.
        scores = (z[src_nodes] * z[dst_nodes]).sum(dim=1)
        return scores.sigmoid()


# The input width follows the graph's node features; the hidden layer and
# the final embedding both use 16 channels.
input_dim = data_graph.num_node_features
hidden_dim = output_dim = 16

gnn_model = GNNLinkPredictor(
    in_channels=input_dim,
    hidden_channels=hidden_dim,
    out_channels=output_dim,
)

# Printing the module lists its layers.
print(gnn_model)

GNNLinkPredictor(
  (conv1): GCNConv(10, 16)
  (conv2): GCNConv(16, 16)
)


In [None]:
from torch_geometric.transforms import RandomLinkSplit


from torch_geometric import seed_everything

# Seed the Python, NumPy and PyTorch RNGs so the random edge split (and any
# negative sampling it performs) is the same on every run.
SPLIT_SEED = 42
seed_everything(SPLIT_SEED)

transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    # The graph stores both directions of each friendship, so edges must be
    # split as undirected pairs to keep a held-out edge's reverse out of the
    # message-passing graph.
    is_undirected=True,
    # The training loop consumes train_data.edge_label as-is. Without
    # negative samples every training label is 1, and the model collapses to
    # predicting "friend" for every pair (train loss 0, all probabilities 1).
    # Let the split add negatives (ratio set by neg_sampling_ratio, which is
    # left at its default here).
    add_negative_train_samples=True,
)


# Each returned graph carries message-passing edges (`edge_index`) plus the
# supervision pairs (`edge_label_index`) with their 0/1 labels (`edge_label`).
train_data, val_data, test_data = transform(data_graph)

print(f"Original graph: {data_graph}")
print(f"Training graph: {train_data}")
print(f"Validation graph: {val_data}")
print(f"Test graph: {test_data}")

print(f"\nTrain message passing edges: {train_data.edge_index.shape[1]} edges")
print(f"Train link prediction edges (positive and negative): {train_data.edge_label_index.shape[1]} edges")
print(f"Validation message passing edges: {val_data.edge_index.shape[1]} edges")
print(f"Validation link prediction edges (positive and negative): {val_data.edge_label_index.shape[1]} edges")
print(f"Test message passing edges: {test_data.edge_index.shape[1]} edges")
print(f"Test link prediction edges (positive and negative): {test_data.edge_label_index.shape[1]} edges")

Original graph: Data(x=[20, 10], edge_index=[2, 76])
Training graph: Data(x=[20, 10], edge_index=[2, 64], edge_label=[32], edge_label_index=[2, 32])
Validation graph: Data(x=[20, 10], edge_index=[2, 64], edge_label=[6], edge_label_index=[2, 6])
Test graph: Data(x=[20, 10], edge_index=[2, 70], edge_label=[6], edge_label_index=[2, 6])

Train message passing edges: 64 edges
Train link prediction edges (positive and negative): 32 edges
Validation message passing edges: 64 edges
Validation link prediction edges (positive and negative): 6 edges
Test message passing edges: 70 edges
Test link prediction edges (positive and negative): 6 edges


In [None]:
import torch.optim as optim
import torch.nn.functional as F


# No optimizer is created at this point: the model is re-instantiated right
# after this class definition, and the optimizer is built there so that it
# tracks the parameters of the model that is actually trained.


class GNNLinkPredictor(nn.Module):
    """Two-layer GCN encoder with a dot-product decoder that emits logits.

    This definition replaces the earlier class of the same name. The only
    behavioural difference is that `decode_edge` returns raw scores, which is
    the input `nn.BCEWithLogitsLoss` expects; apply `torch.sigmoid` to the
    result when probabilities are needed.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two graph-convolution layers.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Output size of the first convolution.
            out_channels: Size of the final node embedding.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return node embeddings: conv1 -> ReLU -> conv2."""
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        z = self.conv2(x, edge_index)
        return z

    def decode_edge(self, z, edge_label_index):
        """Return one raw score (logit) per candidate pair.

        Args:
            z: Node embeddings, indexed by node id along the first axis.
            edge_label_index: Row 0 holds source node ids, row 1 holds
                destination node ids of the pairs to score.

        Returns:
            Dot product of the two endpoint embeddings for every pair; no
            sigmoid is applied.
        """
        src_nodes = edge_label_index[0]
        dst_nodes = edge_label_index[1]
        src_embeddings = z[src_nodes]
        dst_embeddings = z[dst_nodes]
        scores = torch.sum(src_embeddings * dst_embeddings, dim=1)
        return scores

# Fresh model, with an optimizer bound to that model's parameters.
gnn_model = GNNLinkPredictor(input_dim, hidden_dim, output_dim)
optimizer = optim.Adam(gnn_model.parameters(), lr=0.01)

# Binary cross-entropy on raw scores (the sigmoid is folded into the loss).
criterion = nn.BCEWithLogitsLoss()

epochs = 200

print("\nStarting training...")
for epoch in range(1, epochs + 1):
    # --- optimisation step on the training split ---
    gnn_model.train()
    optimizer.zero_grad()
    train_logits = gnn_model.decode_edge(
        gnn_model(train_data.x, train_data.edge_index),
        train_data.edge_label_index,
    )
    loss = criterion(train_logits, train_data.edge_label.float())
    loss.backward()
    optimizer.step()

    # --- validation loss, computed without tracking gradients ---
    gnn_model.eval()
    with torch.no_grad():
        val_logits = gnn_model.decode_edge(
            gnn_model(val_data.x, val_data.edge_index),
            val_data.edge_label_index,
        )
        val_loss = criterion(val_logits, val_data.edge_label.float())

    # Report the first epoch and every tenth one afterwards.
    is_report_epoch = epoch == 1 or epoch % 10 == 0
    if is_report_epoch:
        print(f'Epoch: {epoch:03d}, Train Loss: {loss:.4f}, Val Loss: {val_loss:.4f}')

print("Training complete.")


Starting training...
Epoch: 001, Train Loss: 0.3134, Val Loss: 0.7977
Epoch: 010, Train Loss: 0.0000, Val Loss: 5.4889
Epoch: 020, Train Loss: 0.0000, Val Loss: 11.2683
Epoch: 030, Train Loss: 0.0000, Val Loss: 14.5517
Epoch: 040, Train Loss: 0.0000, Val Loss: 16.0483
Epoch: 050, Train Loss: 0.0000, Val Loss: 16.6629
Epoch: 060, Train Loss: 0.0000, Val Loss: 16.9039
Epoch: 070, Train Loss: 0.0000, Val Loss: 16.9960
Epoch: 080, Train Loss: 0.0000, Val Loss: 17.0307
Epoch: 090, Train Loss: 0.0000, Val Loss: 17.0436
Epoch: 100, Train Loss: 0.0000, Val Loss: 17.0484
Epoch: 110, Train Loss: 0.0000, Val Loss: 17.0502
Epoch: 120, Train Loss: 0.0000, Val Loss: 17.0508
Epoch: 130, Train Loss: 0.0000, Val Loss: 17.0511
Epoch: 140, Train Loss: 0.0000, Val Loss: 17.0511
Epoch: 150, Train Loss: 0.0000, Val Loss: 17.0512
Epoch: 160, Train Loss: 0.0000, Val Loss: 17.0512
Epoch: 170, Train Loss: 0.0000, Val Loss: 17.0512
Epoch: 180, Train Loss: 0.0000, Val Loss: 17.0512
Epoch: 190, Train Loss: 0.0000

In [None]:
import torch

# Switch to inference mode and score the held-out test pairs.
gnn_model.eval()

with torch.no_grad():
    z_test = gnn_model(test_data.x, test_data.edge_index)
    predicted_logits = gnn_model.decode_edge(z_test, test_data.edge_label_index)
    # The decoder output is passed through a sigmoid to get values in (0, 1).
    predicted_probabilities = predicted_logits.sigmoid()

predicted_probabilities_test, true_labels_test = (
    predicted_probabilities,
    test_data.edge_label.float(),
)

for report_line in (
    f"Predicted probabilities for test set: {predicted_probabilities_test[:5]}",
    f"True labels for test set: {true_labels_test[:5]}",
    f"Shape of predicted probabilities: {predicted_probabilities_test.shape}",
    f"Shape of true labels: {true_labels_test.shape}",
):
    print(report_line)

Predicted probabilities for test set: tensor([1., 1., 1., 1., 1.])
True labels for test set: tensor([1., 1., 1., 0., 0.])
Shape of predicted probabilities: torch.Size([6])
Shape of true labels: torch.Size([6])


In [None]:
import pandas as pd

# Reverse lookup: integer node id -> user name.
id_to_user = {node_id: name for name, node_id in user_to_id.items()}

# Every scored test pair (column of edge_label_index) as a (name, name)
# tuple, in the same order as the scores computed for the test set.
predicted_friendship_pairs_users = [
    (id_to_user[src_id], id_to_user[dst_id])
    for src_id, dst_id in test_data.edge_label_index.t().tolist()
]

print(f"Converted {len(predicted_friendship_pairs_users)} edge label indices to user pairs. Sample: {predicted_friendship_pairs_users[:5]}")

Converted 6 edge label indices to user pairs. Sample: [('user_5', 'user_14'), ('user_14', 'user_18'), ('user_7', 'user_16'), ('user_7', 'user_17'), ('user_7', 'user_12')]


In [None]:
import pandas as pd

# Column values are prepared first; all three follow the order of the
# scored test pairs.
pair_labels = [f'{u}-{v}' for u, v in predicted_friendship_pairs_users]
probability_values = predicted_probabilities_test.cpu().numpy()
label_values = true_labels_test.cpu().numpy().astype(int)

data = {
    'Friendship Pair': pair_labels,
    'Predicted Probability': probability_values,
    'True Label (Friendship Exists)': label_values,
}
df_predictions = pd.DataFrame(data)

# Highest predicted probability first.
df_predictions_sorted = df_predictions.sort_values('Predicted Probability', ascending=False)

print(
    "\nTop 10 Predicted Friendships (and their true labels, if they existed in test set):\n",
    df_predictions_sorted.head(10).to_string(index=False),
    sep="\n",
)


Top 10 Predicted Friendships (and their true labels, if they existed in test set):

Friendship Pair  Predicted Probability  True Label (Friendship Exists)
 user_5-user_14                    1.0                               1
user_14-user_18                    1.0                               1
 user_7-user_16                    1.0                               1
 user_7-user_17                    1.0                               0
 user_7-user_12                    1.0                               0
user_16-user_10                    1.0                               0
