In [45]:
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
from decimal import Decimal
import pandas as pd
# PyTorch
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import torchvision.datasets as datasets
import torchvision.transforms as T
from torch_geometric.data import Data, Batch
import torch.optim as optim
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree
from torch_geometric.nn import GINConv, global_add_pool, GCNConv, global_mean_pool
import torch_geometric
from torch.nn import Linear
from torch_geometric.nn import GCNConv, GATConv, GATv2Conv, TransformerConv
import torch.nn.functional as F


#Sklearn
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold

#Networkx
import networkx as nx

## 1. Funciones

In [33]:
def calculate_node_features(graph, experiment):
     
    # Obtener los nodos y sus características
    nodes = list(graph.nodes())
    
     # Calcular las características de los nodos
        
    if experiment == 1: 
        clustering_coeffs = nx.clustering(graph)
        x = torch.tensor([[clustering_coeffs[node]] for node in nodes], dtype=torch.float)
        
    elif experiment == 2: 
        clustering_coeffs = nx.clustering(graph)
        degree = nx.degree(graph)
        x = torch.tensor([[clustering_coeffs[node], degree[node]] for node in nodes], dtype=torch.float)
        
    else:
        clustering_coeffs = nx.clustering(graph)
        degree = nx.degree(graph)
        pagerank = nx.pagerank(graph)
        x = torch.tensor([[clustering_coeffs[node], degree[node],pagerank[node]] for node in nodes], dtype=torch.float)

    return x

def load_gpickle_files(path, experiment):
    X_path = os.path.join(path, "X")  # Ruta de la carpeta con los grafos
    y_path = os.path.join(path, "y")  # Ruta de la carpeta con las etiquetas

    file_list = os.listdir(X_path)  # Obtén la lista de archivos gpickle
    dataset = []

    for file in file_list:
        file_path = os.path.join(X_path, file)  # Genera la ruta del archivo
        graph = nx.read_gpickle(file_path)  # Lee el archivo gpickle con NetworkX

        # Calcular las características de los nodos
        x = calculate_node_features(graph, experiment)
        # Obtiene la matriz dispersa de adyacencia
        adj_matrix = nx.convert_matrix.to_scipy_sparse_matrix(graph)

        # Convierte la matriz dispersa en un tensor de PyTorch
        edge_index = torch.from_numpy(np.vstack(adj_matrix.nonzero()))
#         edge_index = torch.tensor(list(graph.edges()), dtype=torch.long).t().contiguous()  # Índices de las aristas
#         adj_matrix = nx.adjacency_matrix(graph)
#         adj_matrix = adj_matrix.toarray()

        file_number = os.path.splitext(file)[0]
        with open(os.path.join(y_path, f"{file_number}.txt")) as f:
            target = f.read()
            y = torch.tensor(np.float_(target), dtype=torch.float)

        data = Data(x=x, edge_index=edge_index, y=y)  # Crea un objeto Data
        dataset.append(data)

    return dataset

def custom_collate(batch):
    # Extraer los elementos del lote y crear listas separadas para cada atributo
    x_list = []
    edge_index_list = []
    y_list = []

    for data in batch:
        x_list.append(data.x)
        edge_index_list.append(data.edge_index)
        y_list.append(data.y)

    # Convertir las listas en arreglos de numpy
    x_batch = np.stack(x_list)
    edge_index_batch = np.stack(edge_index_list)
    y_batch = np.stack(y_list)

    return Data(x=x_batch, edge_index=edge_index_batch, y=y_batch)

def plot_learning_curves(train_losses, val_losses):
    epochs = range(1, len(train_losses) + 1)

    plt.figure(figsize=(8, 6))

    # Plot losses
    plt.plot(epochs, train_losses, label='Train')
    plt.plot(epochs, val_losses, label='Validation')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Curva de Aprendizaje')
    plt.legend()

    plt.tight_layout()
    plt.show()
    
def train(model, num_epochs, dataset, data_test):
    
    #model3=GNN(input_size=3, hidden_channels=3)
    learning_rate = 0.001
    # Definir la función de pérdida y el optimizador
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Listas para almacenar las pérdidas en cada época
    train_losses = []
    val_losses = []

    #num_epochs = 12
    model.train()  # Cambiar al modo de entrenamiento

    for epoch in range(num_epochs):
        running_loss = 0.0

        for data in dataset:
            x = data.x  # Características de los nodos
            edge_index = data.edge_index
            y = data.y#.view(-1)#.to(device)  # Etiquetas o clases

            optimizer.zero_grad()
            logits = model(x=x, edge_index=edge_index)
            loss = loss_fn(logits, y)
            loss.backward()  # Realizar el paso de atrás (backward)
            optimizer.step()  # Actualizar los pesos del modelo

            running_loss += loss.item()

        # Calcular la pérdida promedio en cada época
        epoch_loss = running_loss / len(dataset)
        train_losses.append(epoch_loss)

        # Realizar la validación del modelo en cada época
        model.eval()  # Cambiar al modo de evaluación

        with torch.no_grad():
            running_val_loss = 0.0

            for data in data_test:
                x_val = data.x
                edge_index_val = data.edge_index
                y_val = data.y#.view(-1)#.to(device)

                val_logits = model(x=x_val, edge_index=edge_index_val)
                val_loss = loss_fn(val_logits, y_val)

                running_val_loss += val_loss.item()

            # Calcular la pérdida promedio en la validación
            val_epoch_loss = running_val_loss / len(data_test)
            val_losses.append(val_epoch_loss)

        # Imprimir información del progreso del entrenamiento
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Val Loss: {val_epoch_loss:.4f}')

    # Crear las curvas de aprendizaje
    plot_learning_curves(train_losses, val_losses)
    
def testing(model, dataset):
    model.eval()  # Set the model in evaluation mode
    total_samples = 0
    total_loss = 0
    predictions_list = []
    labels_list = []

    learning_rate = 0.001
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    with torch.no_grad():
        for data in dataset:
            x = data.x  # Características de los nodo
            edge_index = data.edge_index
            y = data.y.unsqueeze(0)#.view(-1)#.to(device)  # Etiquetas o clases


            predictions = model(x=x, edge_index=edge_index)  # Forward pass

            loss = loss_fn(predictions, y)
            total_loss += loss.item()

            predictions_list.append(predictions.detach().cpu().numpy())
            labels_list.append(y.detach().cpu().numpy())

        # Calculate accuracy
        predictions_array = np.concatenate(predictions_list, axis=0)
        labels_array = np.concatenate(labels_list, axis=0)
        # Calculate MSE
        mse = mean_squared_error(labels_array, predictions_array)

        # Calculate MAE
        mae = mean_absolute_error(labels_array, predictions_array)

        # Calculate RMSE
        rmse = mean_squared_error(labels_array, predictions_array, squared=False)

        # Calculate R-squared
        r2 = r2_score(labels_array, predictions_array)

#         print("MSE:", mse)
#         print("MAE:", mae)
#         print("RMSE:", rmse)
#         print("R-squared:", r2)

#         #print("Accuracy: {:.4f}".format(accuracy))
#         # print("Mean Squared Error (MSE): {:.4f}".format(mse))
#         # print("R-squared (R²): {:.4f}".format(r2))
#         fig, ax = plt.subplots()
#         ax.scatter(labels_array, predictions_array)
#         ax.axline((0, 0), slope=1, color='red')
#         # Add labels and title
#         plt.xlabel("Labels")
#         plt.ylabel("Predictions")
#         plt.title("Predictions vs. Labels")
        

#         # Display the plot
#         plt.show()
        
        return mse, mae, rmse, r2
        
def cross_validate(model, dataset, num_folds=5, num_epochs=20):
    """
    Perform cross-validation for a given model and dataset.

    Args:
        model (torch.nn.Module): The PyTorch model to evaluate.
        dataset (list): The dataset containing data for cross-validation.
        num_folds (int): The number of folds for cross-validation.
    """
    kf = KFold(n_splits=num_folds)

    all_mse = []
    all_mae = []
    all_rmse = []
    all_r2 = []

    for train_idx, val_idx in kf.split(dataset):
        # Split the dataset into training and validation sets for this fold
        train_set = [dataset[i] for i in train_idx] 
        val_set = [dataset[i] for i in val_idx]

        # Train the model on the training set
        train(model, num_epochs, train_set, val_set)

        # Evaluate the model on the validation set
        mse, mae, rmse, r2 = testing(model, val_set)

        all_mse.append(mse)
        all_mae.append(mae)
        all_rmse.append(rmse)
        all_r2.append(r2)

    # Calculate and return the mean of evaluation metrics across all folds
    mean_mse = np.mean(all_mse)
    mean_mae = np.mean(all_mae)
    mean_rmse = np.mean(all_rmse)
    mean_r2 = np.mean(all_r2)

    return mean_mse, mean_mae, mean_rmse, mean_r2

## 2. Datos

In [48]:
training = load_gpickle_files(r"C:\Users\l.sanchezparra\Documents\TFG\Aprendizaje_de_la_entropia_de_un_grafo_usando_GNN\data\train", 1)  # Carga los datos con la función load_gpickle_files
validation = load_gpickle_files(r"C:\Users\l.sanchezparra\Documents\TFG\Aprendizaje_de_la_entropia_de_un_grafo_usando_GNN\data\validation", 1)  # Carga los datos con la función load_gpickle_files
test = load_gpickle_files(r"C:\Users\l.sanchezparra\Documents\TFG\Aprendizaje_de_la_entropia_de_un_grafo_usando_GNN\data\test", 1)  # Carga los datos con la función load_gpickle_files


The scipy.sparse array containers will be used instead of matrices
in Networkx 3.0. Use `to_scipy_sparse_array` instead.
  adj_matrix = nx.convert_matrix.to_scipy_sparse_matrix(graph)


In [49]:
dataset = training + validation

## 3. Modelos GCNConv

### 3.1 One GCN Layer

In [13]:
class OneGCNLayer(torch.nn.Module):
    def __init__(self, input_size, hidden_channels):
        super(OneGCNLayer, self).__init__()
        torch.manual_seed(12345)
        
        self.conv1 = GCNConv(input_size, hidden_channels)
        
        
        self.lin = Linear(hidden_channels, 1)
    
    def forward(self, x, edge_index, batch = None,  edge_col = None):
        
        # Node embedding 
        x = self.conv1(x, edge_index, edge_col)
               
        # Readout layer
        batch = torch.zeros(x.shape[0],dtype=int) if batch is None else batch
        x = global_mean_pool(x, batch)
        
        # Final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
    
        return x

### 3.2 Two GCN Layer

In [25]:
class TwoGCNLayer(torch.nn.Module):
    def __init__(self, input_size, hidden_channels):
        super(TwoGCNLayer, self).__init__()
        torch.manual_seed(12345)
        
        self.conv1 = GCNConv(input_size, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        
        self.lin = Linear(hidden_channels, 1)
    
    def forward(self, x, edge_index, batch = None,  edge_col = None):
        
        # Node embedding 
        x = self.conv1(x, edge_index, edge_col)
        x = x.relu()
        x = self.conv2(x, edge_index, edge_col)
        
        # Readout layer
        batch = torch.zeros(x.shape[0],dtype=int) if batch is None else batch
        x = global_mean_pool(x, batch)
        
        # Final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
    
        return x

### 3.3 Three GCN Layer

In [27]:
class ThreeGCNLayer(torch.nn.Module):
    def __init__(self, input_size, hidden_channels):
        super(ThreeGCNLayer, self).__init__()
        torch.manual_seed(12345)
        
        self.conv1 = GCNConv(input_size, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        
        self.lin = Linear(hidden_channels, 1)
    
    def forward(self, x, edge_index, batch = None,  edge_col = None):
        
        # Node embedding 
        x = self.conv1(x, edge_index, edge_col)
        x = x.relu()
        x = self.conv2(x, edge_index, edge_col)
        x = x.relu()
        x = self.conv3(x, edge_index, edge_col)
               
        # Readout layer
        batch = torch.zeros(x.shape[0],dtype=int) if batch is None else batch
        x = global_mean_pool(x, batch)
        
        # Final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
    
        return x

### 3.4 Four GCN Layer

In [28]:
class FourGCNLayer(torch.nn.Module):
    def __init__(self, input_size, hidden_channels):
        super(FourGCNLayer, self).__init__()
        torch.manual_seed(12345)
        
        self.conv1 = GCNConv(input_size, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, hidden_channels)
        
        
        self.lin = Linear(hidden_channels, 1)
    
    def forward(self, x, edge_index, batch = None,  edge_col = None):
        
        # Node embedding 
        x = self.conv1(x, edge_index, edge_col)
        x = x.relu()
        x = self.conv2(x, edge_index, edge_col)
        x = x.relu()
        x = self.conv3(x, edge_index, edge_col)
        x = x.relu()
        x = self.conv4(x, edge_index, edge_col)
               
        # Readout layer
        batch = torch.zeros(x.shape[0],dtype=int) if batch is None else batch
        x = global_mean_pool(x, batch)
        
        # Final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
    
        return x

## 4. Experimentos

### 4.1 Experimento 1

#### Dataset

In [42]:
training = load_gpickle_files(r"C:\Users\l.sanchezparra\Documents\TFG\Aprendizaje_de_la_entropia_de_un_grafo_usando_GNN\data\train", 1)  # Carga los datos con la función load_gpickle_files
validation = load_gpickle_files(r"C:\Users\l.sanchezparra\Documents\TFG\Aprendizaje_de_la_entropia_de_un_grafo_usando_GNN\data\validation", 1)  # Carga los datos con la función load_gpickle_files
test = load_gpickle_files(r"C:\Users\l.sanchezparra\Documents\TFG\Aprendizaje_de_la_entropia_de_un_grafo_usando_GNN\data\test", 1)  # Carga los datos con la función load_gpickle_files


The scipy.sparse array containers will be used instead of matrices
in Networkx 3.0. Use `to_scipy_sparse_array` instead.
  adj_matrix = nx.convert_matrix.to_scipy_sparse_matrix(graph)


In [50]:
dataset = training + validation

#### Modelos

In [51]:
model11=OneGCNLayer(input_size=1, hidden_channels=1)
model12=TwoGCNLayer(input_size=1, hidden_channels=1)
model13=ThreeGCNLayer(input_size=1, hidden_channels=1)
model14=FourGCNLayer(input_size=1, hidden_channels=1)

models = [("OneGCNLayer" , model11), 
          ("TwoGCNLayer" , model12), 
          ("ThreeGCNLayer" ,model13),
          ("FourGCNLayer", model14)]

#### Entrenamiento y cross-validation

In [None]:
results_df = pd.DataFrame(columns=["Model", "MSE", "MAE", "R-squared"])
for model_name, model in models:
    mean_mse, mean_mae, mean_rmse, mean_r2 =cross_validate(model, dataset, num_folds=5, num_epochs=10)
    results_df = results_df.append(
        {"Model": model_name, "MSE": mse_mean, "MAE": mae_mean, "R-squared": r2_mean},
        ignore_index=True,
    )

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Train Loss: 0.1165, Val Loss: 0.0717
Epoch [2/10], Train Loss: 0.0705, Val Loss: 0.0683
