In [29]:
import pandas as pd
import networkx as nx
import seaborn as sns
from gensim.models import Word2Vec
from node2vec import Node2Vec
from torch_geometric.data import Data
import csv
import warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [74]:
#Load GraphMissingEdges
G = nx.read_gml("../predio-de-links-utfpr-2024-1/GraphMissingEdges.gml",'id')

In [31]:
#Load Categories
categories_map = pd.read_csv('../predio-de-links-utfpr-2024-1/categories.csv')

In [32]:
# Load the edges to evaluate
edges_to_evaluate = pd.read_csv('../predio-de-links-utfpr-2024-1/edgesToEvaluate.csv')

In [33]:
# Número de nós e arestas
num_nodes = len(G.nodes)
num_edges = len(G.edges)

# Densidade da rede
density = nx.density(G)

# Grau médio dos nós
avg_degree = sum(dict(G.degree()).values()) / num_nodes

print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)
print("Network density:", density)
print("Average node degree:", avg_degree)

Number of nodes: 4575
Number of edges: 18991
Network density: 0.0018150582646987846
Average node degree: 8.302076502732241


In [35]:
def get_node_embeddings(G,dimensions=32):
    # Precompute the random walks
    num_walks = 10
    walk_length = 80
    p = 1.0  # Return parameter
    q = 1.0  # In-out parameter

    node2vec = Node2Vec(G, dimensions=dimensions, walk_length=walk_length, num_walks=num_walks, p=p, q=q,workers=16)

    # # Train the node2vec model
    model = node2vec.fit(window=10, min_count=1, batch_words=4)

    # # Get the node embeddings
    node_embeddings = model.wv
    return(node_embeddings)

In [79]:
import torch
from torch_geometric.data import Data
import torch_geometric.transforms as T
import numpy as np

# Convert the NetworkX graph to PyTorch Geometric Data object
data = Data()

# Add empty features to the nodes
node_embeddings = get_node_embeddings(G)
data.x = torch.tensor(node_embeddings.vectors)

# Add edges to the graph
edge_index = []
for u, v in G.edges:
    
    edge_index.append([int(u), int(v)])
    edge_index.append([int(v), int(u)]) # Assuming an undirected graph, add reverse edges as well

Computing transition probabilities: 100%|██████████| 4575/4575 [00:16<00:00, 277.12it/s]


In [80]:
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
data.edge_index = edge_index

In [81]:
data.x.size()

torch.Size([4575, 32])

In [82]:
from torch import Tensor
import torch.nn.functional as F
import torch_geometric.nn as nn
from torch_geometric.nn import SAGEConv,GCNConv
import torch.nn as nn_
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.loader import LinkNeighborLoader

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,  average_precision_score, roc_auc_score

from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

In [83]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [84]:
GNN_input = data.x.size()[1]
GNN_hiddens = 256
MLP_input = GNN_hiddens//2
epochs=60, 
lr_gnn=0.0001
lr_lc=0.001 
display=1
patience=5
batch_size=2

In [85]:
def Get_data_loaders(dataset, num_val=0.1, num_test=0.1, disjoint_train_ratio=0.3, neg_sampling_ratio=1, batch_size=16):
    transformer = RandomLinkSplit(
        num_val=num_val,
        num_test=num_test,
        disjoint_train_ratio=disjoint_train_ratio,
        neg_sampling_ratio=neg_sampling_ratio,
        add_negative_train_samples=True)

    train_data, val_data, test_data = transformer(dataset)

    train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=[-1],
        neg_sampling_ratio=0,
        edge_label_index=train_data.edge_label_index,
        edge_label=train_data.edge_label,
        batch_size=batch_size,
        shuffle=True,
    )

    val_loader = LinkNeighborLoader(
        data=val_data,
        num_neighbors=[-1],
        neg_sampling_ratio=0,
        edge_label_index=val_data.edge_label_index,
        edge_label=val_data.edge_label,
        batch_size=batch_size,
        shuffle=True,
    )

    test_loader = LinkNeighborLoader(
        data=test_data,
        num_neighbors=[-1],
        neg_sampling_ratio=0,
        edge_label_index=test_data.edge_label_index,
        edge_label=test_data.edge_label,
        batch_size=batch_size,
        shuffle=True,
    )
    return train_loader, val_loader, test_loader

In [86]:
G.nodes

NodeView((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 

In [87]:
for node in G.nodes:
    print(node)
    print(G.nodes[node])

0
{'label': 'ql0AaBp68ckekxvWOF8xLA', 'longitude': -79.992016, 'latitude': 40.4387014, 'categories': '1,362', 'stars': '3.0', 'name': 'Cafe Fifth Ave', 'reviewCount': '20'}
1
{'label': 'WHxonk9W_sRLk8cwOoZQqQ', 'longitude': -80.0647887, 'latitude': 40.4360448, 'categories': '280,566', 'stars': '4.0', 'name': 'Good Fellas Barber Shop', 'reviewCount': '12'}
2
{'label': 'P6HDtlj1GSu9UG2Aal2PPg', 'longitude': -79.9798403, 'latitude': 40.386165, 'categories': '327,559,107,63,213', 'stars': '3.0', 'name': 'Tightspot Dancewear Center', 'reviewCount': '4'}
3
{'label': '3kUqNxO1rkDDb89GAfyNgw', 'longitude': -79.9254041, 'latitude': 40.4579728, 'categories': '338,280,247,292,671,546', 'stars': '4.5', 'name': 'Evolve Wellness Spa Shadyside', 'reviewCount': '95'}
4
{'label': 'v_pED2nMFPsBGD4Tq2ygBw', 'longitude': -80.001983, 'latitude': 40.438355, 'categories': '407,247,270,645,438,488', 'stars': '2.0', 'name': 'Nova Dental Associates', 'reviewCount': '5'}
5
{'label': 'nZDIrGshkfLZf6ImQtAasQ', 'lo

In [88]:
train_loader, val_loader, test_loader = Get_data_loaders(data, batch_size=batch_size)
print(len(train_loader)),print(len(val_loader)),print(len(test_loader))

9115
3798
3798


(None, None, None)

In [89]:
from torch_geometric.nn import GCNConv

class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels) 
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)
    
    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

In [90]:
class GNN(torch.nn.Module):
    def __init__(self, input ,hidden_channels):
        super().__init__()

        self.conv1 = SAGEConv(input, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels//2)

    def forward(self, x, edge_index):

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x


class Classifier(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Classifier, self).__init__()

        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim//2)
        self.fc3 = torch.nn.Linear(hidden_dim//2, output_dim)
        self.relu = torch.nn.ReLU()
        self.sigm = torch.nn.Sigmoid()
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        # Pass the input through the MLP layers with ReLU activation
        x = self.relu(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.relu(self.fc2(x))
        x = F.dropout(x, p=0.5, training=self.training)
        # Apply the final linear layer
        x = self.fc3(x)
        # Apply softmax activation to get probabilities
        x = self.sigm(x)
        return x

In [91]:
gnn = GNN(input=GNN_input, hidden_channels=GNN_hiddens)

gnn_optim = torch.optim.Adam(gnn.parameters(), lr=lr_gnn)
gnn_loss = F.cross_entropy

lc = Classifier(MLP_input*2, 128, 1)

lc_optim = torch.optim.Adam(lc.parameters(), lr=lr_lc)
# lc_loss = F.binary_cross_entropy
lc_loss = nn_.BCELoss()

In [93]:
from tqdm import tqdm

In [94]:
epochs=60
best_roc = 0
epochs_no_improve = 0

val_roc = []
train_roc = []

train_loss_gnn = []
train_loss_lc = []

val_loss_gnn = []
val_loss_lc = []
for epoch in tqdm(range(epochs), desc='Epochs', position=0):
    total_loss_gnn = 0
    total_loss_lc = 0

    y_true = []
    y_pred_score = []
    gnn.train()
    lc.train()
    for sampled_data in train_loader:

        gnn_optim.zero_grad()
        lc_optim.zero_grad()

        # Move `sampled_data` to the respective `device`
        sampled_data = sampled_data.to(device)

        # Run `forward` pass of the model
        emb = gnn(sampled_data.x, sampled_data.edge_index)

        # Get the ground truth labels from `sampled_data`
        ground_truth = sampled_data.edge_label.to(device)

        # Get link embaddings
        edge_feat_1 = torch.index_select(
            emb, 0, sampled_data.edge_label_index[0])
        edge_feat_2 = torch.index_select(
            emb, 0, sampled_data.edge_label_index[1])

        link_emb = torch.cat([edge_feat_1, edge_feat_2], dim=-1)

        # Get lgnn loss
        loss = gnn_loss(link_emb.to(torch.float),
                        ground_truth.to(torch.long))

        loss.backward(retain_graph=True)

        # Link classifier
        pred = lc(link_emb)
        
        loss2 = lc_loss(pred.squeeze(), ground_truth)
        loss2.backward()

        lc_optim.step()
        gnn_optim.step()

        pred_binary = torch.detach(pred).cpu().numpy()
        ground_truth_cpu = ground_truth.cpu().numpy()

        y_pred_score = np.hstack([y_pred_score, pred_binary.squeeze()])
        y_true = np.hstack([y_true, ground_truth_cpu.squeeze()])

        total_loss_gnn += loss.item()
        total_loss_lc += loss2.item()

    
    train_roc.append(roc_auc_score(y_true, y_pred_score))
    train_loss_gnn.append(total_loss_gnn/len(train_loader))
    train_loss_lc.append(total_loss_lc/len(train_loader))

    gnn.eval()
    lc.eval()
    with torch.no_grad():
        total_loss_gnn = 0
        total_loss_lc = 0

        y_pred_score = []
        y_true = []
        for sampled_data in val_loader:

            # Move `sampled_data` to the respective `device`
            sampled_data = sampled_data.to(device)

            # Run `forward` pass of the model
            emb = gnn(sampled_data.x, sampled_data.edge_index)

            # Get the ground truth labels from `sampled_data`
            ground_truth = sampled_data.edge_label.to(device)

            edge_feat_1 = torch.index_select(
                emb, 0, sampled_data.edge_label_index[0])
            edge_feat_2 = torch.index_select(
                emb, 0, sampled_data.edge_label_index[1])

            link_emb = torch.cat([edge_feat_1, edge_feat_2], dim=-1)

            total_loss_gnn += gnn_loss(link_emb.to(torch.float),
                                       ground_truth.to(torch.long)).item()
            # Link classifier
            pred = lc(link_emb)
            total_loss_lc += lc_loss(pred.squeeze(), ground_truth).item()

            pred_binary = torch.detach(pred).cpu().numpy()
            ground_truth_cpu = ground_truth.cpu().numpy()
            y_pred_score = np.hstack([y_pred_score, pred_binary.squeeze()])
            y_true = np.hstack([y_true, ground_truth_cpu.squeeze()])
        
        val_roc.append(roc_auc_score(y_true, y_pred_score))
        val_loss_gnn.append(total_loss_gnn/len(val_loader))
        val_loss_lc.append(total_loss_lc/len(val_loader))

        
        if val_roc[-1] > best_roc:
            best_roc = val_roc[-1]

            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epoch % display == 0:
            tqdm.write(
                f"Epoch: {epoch+1}/{epochs} |   Train roc: {train_roc[-1]:.4f}| val roc: {val_roc[-1]:.4f} best roc: {best_roc:.4f}")

    # Check if early stopping criteria met
    if epochs_no_improve >= patience:

        break

TypeError: 'module' object is not callable

In [None]:
plt.subplots(1, 3, figsize=(30, 10))
# Plotting the accuracy curves
plt.subplot(1, 3, 1)
plt.plot(train_roc, label='Train Accuracy')
plt.plot(val_roc, label='Validation Accuracy')
plt.legend()
plt.subplot(1, 3, 2)
plt.plot(train_loss_gnn, label='train loss')
plt.plot(val_loss_gnn, label='vall loss')
plt.legend()
plt.subplot(1, 3, 3)
plt.plot(train_loss_lc, label='train loss')
plt.plot(val_loss_lc, label='val loss')

# Adding legend
plt.legend()

In [20]:
# parâmetros
epochs = 100

# modelo - Graph Auto-Encoder (GAE)
model = GCNEncoder(train_data.num_node_features, 64)
model = model.to(device)

criterion = torch.nn.BCEWithLogitsLoss()

print(model)


# inicialização o optimizador
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

GCNEncoder(
  (conv1): GCNConv(0, 128)
  (conv2): GCNConv(128, 64)
)


In [23]:
from torch_geometric.utils import negative_sampling
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def train(data):
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.edge_index)

    # Nova rodada de amostragem negativa para cada época de treinamento:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1))

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss


def test(data):
    model.eval()
    with torch.no_grad():
        z = model.encode(data.x, data.edge_index)
    
    out = model.decode(z, data.edge_label_index).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())


In [24]:
# Decodificação do teste expandida

z = model.encode(test_data.x, test_data.edge_index)

temp = model.decode(z, test_data.edge_label_index).sigmoid()

threshold = torch.tensor([0.5]).to(device)

#Atribui 0 ou 1 de acordo com o threshold
results = (temp>threshold).float()

print(threshold)
print(f1_score(test_data.edge_label.cpu().numpy() ,results.cpu().numpy() ))

print()
print("Testando com mais limites de corte:")
print()

for i in np.arange(0.1, 0.9, 0.01):
    threshold = torch.tensor([i]).to(device)
    results = (temp>threshold).float()

    
    if (f1_score(test_data.edge_label.cpu().numpy() ,results.cpu().numpy() ) > 0.80):
        print(threshold)
        print(f1_score(test_data.edge_label.cpu().numpy() ,results.cpu().numpy() ))
        print('---')


AttributeError: 'NoneType' object has no attribute 'size'

In [12]:
# import torch
# from torch_geometric.nn import GCNConv
# from torch_geometric.utils import negative_sampling
# from sklearn.metrics import f1_score
# import torch.nn.functional as F

# class LinkPredictor(torch.nn.Module):
#     def __init__(self, num_features, hidden_channels):
#         super(LinkPredictor, self).__init__()
#         self.conv1 = GCNConv(num_features, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, hidden_channels)
#         self.classifier = torch.nn.Linear(2*hidden_channels, 1)

#     def forward(self, x, edge_index):
#         x1 = torch.relu(self.conv1(x, edge_index))
#         x2 = self.conv2(x1, edge_index)
#         # Concatenate the embeddings of the source and target nodes
#         edge_embeddings = torch.cat([x2[edge_index[0]], x2[edge_index[1]]], dim=-1)
#         return self.classifier(edge_embeddings)

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = LinkPredictor(num_features=train_data.num_node_features, hidden_channels=64).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# def train():
#     model.train()
#     optimizer.zero_grad()
#     # Use all nodes for training 
#     out = model(train_data.x, train_data.edge_index)
#     # We apply the sigmoid function to the output to get probabilities
#     out_probs = torch.sigmoid(out)
#     # We use binary cross entropy loss
#     loss = F.binary_cross_entropy(out_probs.view(-1), train_data.y.float())
#     loss.backward()
#     optimizer.step()
#     return loss.item()

# def test(data):
#     model.eval()
#     with torch.no_grad():
#         out = model(data.x, data.edge_index)
#         out_probs = torch.sigmoid(out)
#         # We apply a threshold of 0.5 to determine whether an edge exists
#         out_preds = (out_probs > 0.5).float().view(-1)
#         f1 = f1_score(data.y.cpu(), out_preds.cpu())
#         return f1

# for epoch in range(1, 101):
#     loss = train()
#     val_f1 = test(val_data)
#     print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val F1: {val_f1:.4f}')

# test_f1 = test(test_data)
# print(f'Test F1: {test_f1:.4f}')


AttributeError: 'NoneType' object has no attribute 'float'