In [1]:
!pip install networkx

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [12]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
[K     |████████████████████████████████| 293 kB 928 kB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
import networkx as nx

import os
import numpy as np
import matplotlib.pyplot as plt
import gc

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from spektral.data import Dataset, Graph
from spektral.layers import GraphSageConv
from spektral.utils import normalized_adjacency
from spektral.models.gcn import GCN 
from spektral.datasets.utils import DATASET_FOLDER

from scipy import sparse
from scipy.special import softmax

from sklearn.model_selection import train_test_split


In [2]:

def plot_graph_dir(G, communities=None):
    """Draw graph ``G`` with arrows and node labels using a fixed-seed spring layout.

    Args:
        G: networkx graph to draw.
        communities: optional iterable with one community id per node. Nodes whose
            id equals 0 are drawn orange and every other node sky blue. When it is
            ``None`` all nodes are drawn red.
    """
    # Fixed seed so that the same graph is always laid out the same way.
    layout = nx.spring_layout(G, seed=42)

    if communities is None:
        node_colors = 'red'
    else:
        # One colour per node, chosen from its community id.
        node_colors = []
        for community_id in communities:
            node_colors.append('orange' if community_id == 0 else 'skyblue')

    nx.draw_networkx(G, layout, arrows=True, node_color=node_colors, with_labels=True)
    plt.show()



# Caso 1: dirigido, diferentes tamaños, NO balanceado (1%), clases separadas


Tamaños: 25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800

Clases "separadas" 

In [3]:
class synthetic_Dir_diffSize_NoBalanced_1percent_clasesSep(Dataset):
    """Case 1: ten directed synthetic graphs, 1% infected nodes, "separated" classes.

    One graph is produced per size in ``_N_NODES``. Class 1 ("infected") contains
    ``round(1% * n_nodes)`` nodes (at least one); every other node is in class 0.
    ``download()`` generates the graphs and stores them as ``.npz`` files under
    ``path``; ``read()`` loads those files back as ``Graph`` objects.
    """

    # Number of nodes of each generated graph (one graph per entry).
    _N_NODES = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes of every graph that is labelled as infected (class 1).
    _INFECTED_FRACTION = 1/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder that holds the ``.npz`` files of this dataset."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the number of infected nodes for every graph size (never below 1)."""
        fraction = self._INFECTED_FRACTION
        return [round(fraction * n) if fraction * n >= 1 else 1 for n in self._N_NODES]

    def _graph_files(self):
        """Return the ``.npz`` path of every graph, in the order of ``_N_NODES``."""
        return [
            os.path.join(self.path, f'graph_Dir_NoBalanced_{n_inf}a{n}_clasesSep_0{j}.npz')
            for j, (n_inf, n) in enumerate(zip(self._infected_counts(), self._N_NODES))
        ]

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix,
                                       node_feature_means, n_infected, semilla, indice,
                                       feature_cov_matrix=None):
        """Generate one directed graph with Gaussian node features and block-structured edges.

        Args:
            n_nodes: number of nodes of the graph.
            n_communities: number of communities (also the width of the one-hot labels).
            n_features: number of features per node.
            edge_prob_matrix: array of shape (n_communities, n_communities); entry
                ``[r, s]`` scales the probability of an edge from a node of community
                ``r`` to a node of community ``s``.
            node_feature_means: array of shape (n_communities, n_features) with the
                mean feature vector of every community.
            n_infected: number of nodes assigned to community 1.
            semilla, indice: the NumPy global seed is ``semilla * (indice + 1)``.
            feature_cov_matrix: covariance of the node features; identity when ``None``.

        Returns:
            Graph with ``x`` of shape (n_nodes, n_features), ``a`` a float CSR matrix of
            shape (n_nodes, n_nodes) without self-loops, and one-hot ``y`` of shape
            (n_nodes, n_communities).

        Raises:
            ValueError: if ``n_infected`` is larger than ``n_nodes``.
        """
        np.random.seed(semilla * (indice + 1))

        # Draw the infected nodes WITHOUT replacement: sampling with replacement can
        # repeat a node and leave fewer than n_infected nodes in class 1.
        infected_nodes = np.random.choice(n_nodes, size=n_infected, replace=False)
        communities = np.zeros(n_nodes, dtype=int)
        communities[infected_nodes] = 1

        # Node features: one Gaussian per community.
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            members = np.where(communities == k)[0]
            features[members] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(members))

        # Soft community membership of every node, derived from its features.
        membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Sample the edges one row at a time (vectorised over the target nodes) and
        # assemble the CSR matrix directly, so no dense n x n matrix is ever built.
        # P(i -> j) = edge_prob_matrix[c_i, c_j] * membership[i, c_j] * membership[j, c_i]
        indptr = np.zeros(n_nodes + 1, dtype=np.int64)
        column_chunks = []
        for i in range(n_nodes):
            community_i = communities[i]
            row_probs = (edge_prob_matrix[community_i, communities]
                         * membership_probs[i, communities]
                         * membership_probs[:, community_i])
            row_probs[i] = 0.0  # no self-loops
            targets = np.flatnonzero(np.random.binomial(1, row_probs))
            column_chunks.append(targets)
            indptr[i + 1] = indptr[i] + targets.size
        indices = np.concatenate(column_chunks)
        adjacency_matrix = sparse.csr_matrix(
            (np.ones(indices.size, dtype=np.float64), indices, indptr),
            shape=(n_nodes, n_nodes))

        # num_classes pins the label width even if a community happens to be empty.
        labels = tf.keras.utils.to_categorical(communities, num_classes=n_communities)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and store it as an ``.npz`` file under ``path``."""
        # exist_ok: do not fail if the folder is already there (e.g. after an interrupted run).
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        # Probability matrix for edges between communities.
        edge_prob_matrix = np.array([
            [0.8, 0.2],
            [0.3, 0.7]
        ])
        # Node feature means for each community (well-separated classes).
        node_feature_means = np.array([
            [2, 1],
            [1, 2]
        ])
        # NOTE(review): 679 breaks the 123, 234, ... digit pattern (678 would fit) —
        # possibly a typo; kept unchanged.
        semillas = [123, 234, 345, 456, 567, 679, 789, 321, 654, 987]

        # Generate, save and release one graph at a time to keep peak memory low.
        for n_nodes, n_infected, semilla, filename in zip(
                self._N_NODES, self._infected_counts(), semillas, self._graph_files()):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected, semilla, 0, feature_cov_matrix=None)
            # ``a`` is a SciPy sparse matrix, so NumPy stores it as a pickled object array.
            np.savez(filename, x=graph.x, a=graph.a, y=graph.y)
            del graph
            gc.collect()

    def read(self):
        """Load the stored graphs and return them as a list of ``Graph`` objects."""
        output = []
        for filename in self._graph_files():
            # allow_pickle is needed because ``a`` was stored as a pickled sparse matrix;
            # only load files written by download(). The ``with`` block closes the file.
            with np.load(filename, allow_pickle=True) as data:
                # ``[()]`` unwraps the 0-d object array (``.item()`` would work as well).
                output.append(Graph(x=data['x'], a=data['a'][()], y=data['y']))
        return output
    
    


In [4]:
# Case 1 dataset (1% infected nodes, separated classes).
# NOTE(review): presumably spektral's Dataset constructor calls download() when `path`
# does not exist yet and then read() — confirm against the spektral documentation.
dataset1 = synthetic_Dir_diffSize_NoBalanced_1percent_clasesSep()

# Caso 2: dirigido, diferentes tamaños, NO balanceado (1%), clases mezcladas


Tamaños: 25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800

Clases "mezcladas" 

In [7]:
class synthetic_Dir_diffSize_NoBalanced_1percent_clasesMezcl(Dataset):
    """Case 2: ten directed synthetic graphs, 1% infected nodes, "mixed" classes.

    One graph is produced per size in ``_N_NODES``. Class 1 ("infected") contains
    ``round(1% * n_nodes)`` nodes (at least one); every other node is in class 0.
    Both classes share the same feature mean and the same edge probabilities.
    ``download()`` generates the graphs and stores them as ``.npz`` files under
    ``path``; ``read()`` loads those files back as ``Graph`` objects.
    """

    # Number of nodes of each generated graph (one graph per entry).
    _N_NODES = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes of every graph that is labelled as infected (class 1).
    _INFECTED_FRACTION = 1/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder that holds the ``.npz`` files of this dataset."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the number of infected nodes for every graph size (never below 1)."""
        fraction = self._INFECTED_FRACTION
        return [round(fraction * n) if fraction * n >= 1 else 1 for n in self._N_NODES]

    def _graph_files(self):
        """Return the ``.npz`` path of every graph, in the order of ``_N_NODES``."""
        return [
            os.path.join(self.path, f'graph_Dir_NoBalanced_{n_inf}a{n}_clasesMezcl_2{j}.npz')
            for j, (n_inf, n) in enumerate(zip(self._infected_counts(), self._N_NODES))
        ]

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix,
                                       node_feature_means, n_infected, semilla, indice,
                                       feature_cov_matrix=None):
        """Generate one directed graph with Gaussian node features and block-structured edges.

        Args:
            n_nodes: number of nodes of the graph.
            n_communities: number of communities (also the width of the one-hot labels).
            n_features: number of features per node.
            edge_prob_matrix: array of shape (n_communities, n_communities); entry
                ``[r, s]`` scales the probability of an edge from a node of community
                ``r`` to a node of community ``s``.
            node_feature_means: array of shape (n_communities, n_features) with the
                mean feature vector of every community.
            n_infected: number of nodes assigned to community 1.
            semilla, indice: the NumPy global seed is ``semilla * (indice + 1)``.
            feature_cov_matrix: covariance of the node features; identity when ``None``.

        Returns:
            Graph with ``x`` of shape (n_nodes, n_features), ``a`` a float CSR matrix of
            shape (n_nodes, n_nodes) without self-loops, and one-hot ``y`` of shape
            (n_nodes, n_communities).

        Raises:
            ValueError: if ``n_infected`` is larger than ``n_nodes``.
        """
        np.random.seed(semilla * (indice + 1))

        # Draw the infected nodes WITHOUT replacement: sampling with replacement can
        # repeat a node and leave fewer than n_infected nodes in class 1.
        infected_nodes = np.random.choice(n_nodes, size=n_infected, replace=False)
        communities = np.zeros(n_nodes, dtype=int)
        communities[infected_nodes] = 1

        # Node features: one Gaussian per community.
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            members = np.where(communities == k)[0]
            features[members] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(members))

        # Soft community membership of every node, derived from its features.
        membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Sample the edges one row at a time (vectorised over the target nodes) and
        # assemble the CSR matrix directly, so no dense n x n matrix is ever built.
        # P(i -> j) = edge_prob_matrix[c_i, c_j] * membership[i, c_j] * membership[j, c_i]
        indptr = np.zeros(n_nodes + 1, dtype=np.int64)
        column_chunks = []
        for i in range(n_nodes):
            community_i = communities[i]
            row_probs = (edge_prob_matrix[community_i, communities]
                         * membership_probs[i, communities]
                         * membership_probs[:, community_i])
            row_probs[i] = 0.0  # no self-loops
            targets = np.flatnonzero(np.random.binomial(1, row_probs))
            column_chunks.append(targets)
            indptr[i + 1] = indptr[i] + targets.size
        indices = np.concatenate(column_chunks)
        adjacency_matrix = sparse.csr_matrix(
            (np.ones(indices.size, dtype=np.float64), indices, indptr),
            shape=(n_nodes, n_nodes))

        # num_classes pins the label width even if a community happens to be empty.
        labels = tf.keras.utils.to_categorical(communities, num_classes=n_communities)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and store it as an ``.npz`` file under ``path``."""
        # exist_ok: do not fail if the folder is already there (e.g. after an interrupted run).
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        # Uniform probability matrix: the factor does not depend on the communities.
        edge_prob_matrix = np.array([
            [0.5, 0.5],
            [0.5, 0.5]
        ])
        # Both communities share the same feature mean (classes closer together).
        node_feature_means = np.array([
            [1, 1],
            [1, 1]
        ])
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Generate, save and release one graph at a time to keep peak memory low.
        for n_nodes, n_infected, semilla, filename in zip(
                self._N_NODES, self._infected_counts(), semillas, self._graph_files()):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected, semilla, 1, feature_cov_matrix=None)
            # ``a`` is a SciPy sparse matrix, so NumPy stores it as a pickled object array.
            np.savez(filename, x=graph.x, a=graph.a, y=graph.y)
            del graph
            gc.collect()

    def read(self):
        """Load the stored graphs and return them as a list of ``Graph`` objects."""
        output = []
        for filename in self._graph_files():
            # allow_pickle is needed because ``a`` was stored as a pickled sparse matrix;
            # only load files written by download(). The ``with`` block closes the file.
            with np.load(filename, allow_pickle=True) as data:
                # ``[()]`` unwraps the 0-d object array (``.item()`` would work as well).
                output.append(Graph(x=data['x'], a=data['a'][()], y=data['y']))
        return output
    
    


In [8]:
# Case 2 dataset (1% infected nodes, mixed classes).
# NOTE(review): presumably spektral's Dataset constructor calls download() when `path`
# does not exist yet and then read() — confirm against the spektral documentation.
dataset2 = synthetic_Dir_diffSize_NoBalanced_1percent_clasesMezcl()

# Caso 3: dirigido, diferentes tamaños, NO balanceado (2%), clases separadas


Tamaños: 25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800

Clases "separadas" (a juzgar por la representación gráfica)

In [11]:
class synthetic_Dir_diffSize_NoBalanced_2percent_clasesSep(Dataset):
    """Case 3: ten directed synthetic graphs, 2% infected nodes, "separated" classes.

    One graph is produced per size in ``_N_NODES``. Class 1 ("infected") contains
    ``round(2% * n_nodes)`` nodes (at least one); every other node is in class 0.
    ``download()`` generates the graphs and stores them as ``.npz`` files under
    ``path``; ``read()`` loads those files back as ``Graph`` objects.
    """

    # Number of nodes of each generated graph (one graph per entry).
    _N_NODES = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes of every graph that is labelled as infected (class 1).
    _INFECTED_FRACTION = 2/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder that holds the ``.npz`` files of this dataset."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the number of infected nodes for every graph size (never below 1)."""
        fraction = self._INFECTED_FRACTION
        return [round(fraction * n) if fraction * n >= 1 else 1 for n in self._N_NODES]

    def _graph_files(self):
        """Return the ``.npz`` path of every graph, in the order of ``_N_NODES``."""
        return [
            os.path.join(self.path, f'graph_Dir_NoBalanced_{n_inf}a{n}_clasesSep_0{j}.npz')
            for j, (n_inf, n) in enumerate(zip(self._infected_counts(), self._N_NODES))
        ]

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix,
                                       node_feature_means, n_infected, semilla, indice,
                                       feature_cov_matrix=None):
        """Generate one directed graph with Gaussian node features and block-structured edges.

        Args:
            n_nodes: number of nodes of the graph.
            n_communities: number of communities (also the width of the one-hot labels).
            n_features: number of features per node.
            edge_prob_matrix: array of shape (n_communities, n_communities); entry
                ``[r, s]`` scales the probability of an edge from a node of community
                ``r`` to a node of community ``s``.
            node_feature_means: array of shape (n_communities, n_features) with the
                mean feature vector of every community.
            n_infected: number of nodes assigned to community 1.
            semilla, indice: the NumPy global seed is ``semilla * (indice + 1)``.
            feature_cov_matrix: covariance of the node features; identity when ``None``.

        Returns:
            Graph with ``x`` of shape (n_nodes, n_features), ``a`` a float CSR matrix of
            shape (n_nodes, n_nodes) without self-loops, and one-hot ``y`` of shape
            (n_nodes, n_communities).

        Raises:
            ValueError: if ``n_infected`` is larger than ``n_nodes``.
        """
        np.random.seed(semilla * (indice + 1))

        # Draw the infected nodes WITHOUT replacement: sampling with replacement can
        # repeat a node and leave fewer than n_infected nodes in class 1.
        infected_nodes = np.random.choice(n_nodes, size=n_infected, replace=False)
        communities = np.zeros(n_nodes, dtype=int)
        communities[infected_nodes] = 1

        # Node features: one Gaussian per community.
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            members = np.where(communities == k)[0]
            features[members] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(members))

        # Soft community membership of every node, derived from its features.
        membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Sample the edges one row at a time (vectorised over the target nodes) and
        # assemble the CSR matrix directly, so no dense n x n matrix is ever built.
        # P(i -> j) = edge_prob_matrix[c_i, c_j] * membership[i, c_j] * membership[j, c_i]
        indptr = np.zeros(n_nodes + 1, dtype=np.int64)
        column_chunks = []
        for i in range(n_nodes):
            community_i = communities[i]
            row_probs = (edge_prob_matrix[community_i, communities]
                         * membership_probs[i, communities]
                         * membership_probs[:, community_i])
            row_probs[i] = 0.0  # no self-loops
            targets = np.flatnonzero(np.random.binomial(1, row_probs))
            column_chunks.append(targets)
            indptr[i + 1] = indptr[i] + targets.size
        indices = np.concatenate(column_chunks)
        adjacency_matrix = sparse.csr_matrix(
            (np.ones(indices.size, dtype=np.float64), indices, indptr),
            shape=(n_nodes, n_nodes))

        # num_classes pins the label width even if a community happens to be empty.
        labels = tf.keras.utils.to_categorical(communities, num_classes=n_communities)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and store it as an ``.npz`` file under ``path``."""
        # exist_ok: do not fail if the folder is already there (e.g. after an interrupted run).
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        # Probability matrix for edges between communities.
        edge_prob_matrix = np.array([
            [0.8, 0.2],
            [0.3, 0.7]
        ])
        # Node feature means for each community (well-separated classes).
        node_feature_means = np.array([
            [2, 1],
            [1, 2]
        ])
        # NOTE(review): 679 breaks the 123, 234, ... digit pattern (678 would fit) —
        # possibly a typo; kept unchanged.
        semillas = [123, 234, 345, 456, 567, 679, 789, 321, 654, 987]

        # Generate, save and release one graph at a time to keep peak memory low.
        for n_nodes, n_infected, semilla, filename in zip(
                self._N_NODES, self._infected_counts(), semillas, self._graph_files()):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected, semilla, 0, feature_cov_matrix=None)
            # ``a`` is a SciPy sparse matrix, so NumPy stores it as a pickled object array.
            np.savez(filename, x=graph.x, a=graph.a, y=graph.y)
            del graph
            gc.collect()

    def read(self):
        """Load the stored graphs and return them as a list of ``Graph`` objects."""
        output = []
        for filename in self._graph_files():
            # allow_pickle is needed because ``a`` was stored as a pickled sparse matrix;
            # only load files written by download(). The ``with`` block closes the file.
            with np.load(filename, allow_pickle=True) as data:
                # ``[()]`` unwraps the 0-d object array (``.item()`` would work as well).
                output.append(Graph(x=data['x'], a=data['a'][()], y=data['y']))
        return output
    
    


In [12]:
# Case 3 dataset (2% infected nodes, separated classes).
# NOTE(review): presumably spektral's Dataset constructor calls download() when `path`
# does not exist yet and then read() — confirm against the spektral documentation.
dataset3 = synthetic_Dir_diffSize_NoBalanced_2percent_clasesSep()

# Caso 4: dirigido, diferentes tamaños, NO balanceado (2%), clases "mezcladas"



Tamaños: 25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800

Clases "mezcladas" 


In [15]:
class synthetic_Dir_diffSize_NoBalanced_2percent_clasesMezcl(Dataset):
    """Case 4: ten directed synthetic graphs, 2% infected nodes, "mixed" classes.

    One graph is produced per size in ``_N_NODES``. Class 1 ("infected") contains
    ``round(2% * n_nodes)`` nodes (at least one); every other node is in class 0.
    Both classes share the same feature mean and the same edge probabilities.
    ``download()`` generates the graphs and stores them as ``.npz`` files under
    ``path``; ``read()`` loads those files back as ``Graph`` objects.
    """

    # Number of nodes of each generated graph (one graph per entry).
    _N_NODES = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes of every graph that is labelled as infected (class 1).
    _INFECTED_FRACTION = 2/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder that holds the ``.npz`` files of this dataset."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the number of infected nodes for every graph size (never below 1)."""
        fraction = self._INFECTED_FRACTION
        return [round(fraction * n) if fraction * n >= 1 else 1 for n in self._N_NODES]

    def _graph_files(self):
        """Return the ``.npz`` path of every graph, in the order of ``_N_NODES``."""
        return [
            os.path.join(self.path, f'graph_Dir_NoBalanced_{n_inf}a{n}_clasesMezcl_2{j}.npz')
            for j, (n_inf, n) in enumerate(zip(self._infected_counts(), self._N_NODES))
        ]

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix,
                                       node_feature_means, n_infected, semilla, indice,
                                       feature_cov_matrix=None):
        """Generate one directed graph with Gaussian node features and block-structured edges.

        Args:
            n_nodes: number of nodes of the graph.
            n_communities: number of communities (also the width of the one-hot labels).
            n_features: number of features per node.
            edge_prob_matrix: array of shape (n_communities, n_communities); entry
                ``[r, s]`` scales the probability of an edge from a node of community
                ``r`` to a node of community ``s``.
            node_feature_means: array of shape (n_communities, n_features) with the
                mean feature vector of every community.
            n_infected: number of nodes assigned to community 1.
            semilla, indice: the NumPy global seed is ``semilla * (indice + 1)``.
            feature_cov_matrix: covariance of the node features; identity when ``None``.

        Returns:
            Graph with ``x`` of shape (n_nodes, n_features), ``a`` a float CSR matrix of
            shape (n_nodes, n_nodes) without self-loops, and one-hot ``y`` of shape
            (n_nodes, n_communities).

        Raises:
            ValueError: if ``n_infected`` is larger than ``n_nodes``.
        """
        np.random.seed(semilla * (indice + 1))

        # Draw the infected nodes WITHOUT replacement: sampling with replacement can
        # repeat a node and leave fewer than n_infected nodes in class 1.
        infected_nodes = np.random.choice(n_nodes, size=n_infected, replace=False)
        communities = np.zeros(n_nodes, dtype=int)
        communities[infected_nodes] = 1

        # Node features: one Gaussian per community.
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            members = np.where(communities == k)[0]
            features[members] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(members))

        # Soft community membership of every node, derived from its features.
        membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Sample the edges one row at a time (vectorised over the target nodes) and
        # assemble the CSR matrix directly, so no dense n x n matrix is ever built.
        # P(i -> j) = edge_prob_matrix[c_i, c_j] * membership[i, c_j] * membership[j, c_i]
        indptr = np.zeros(n_nodes + 1, dtype=np.int64)
        column_chunks = []
        for i in range(n_nodes):
            community_i = communities[i]
            row_probs = (edge_prob_matrix[community_i, communities]
                         * membership_probs[i, communities]
                         * membership_probs[:, community_i])
            row_probs[i] = 0.0  # no self-loops
            targets = np.flatnonzero(np.random.binomial(1, row_probs))
            column_chunks.append(targets)
            indptr[i + 1] = indptr[i] + targets.size
        indices = np.concatenate(column_chunks)
        adjacency_matrix = sparse.csr_matrix(
            (np.ones(indices.size, dtype=np.float64), indices, indptr),
            shape=(n_nodes, n_nodes))

        # num_classes pins the label width even if a community happens to be empty.
        labels = tf.keras.utils.to_categorical(communities, num_classes=n_communities)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and store it as an ``.npz`` file under ``path``."""
        # exist_ok: do not fail if the folder is already there (e.g. after an interrupted run).
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        # Uniform probability matrix: the factor does not depend on the communities.
        edge_prob_matrix = np.array([
            [0.5, 0.5],
            [0.5, 0.5]
        ])
        # Both communities share the same feature mean (classes closer together).
        node_feature_means = np.array([
            [1, 1],
            [1, 1]
        ])
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Generate, save and release one graph at a time to keep peak memory low.
        for n_nodes, n_infected, semilla, filename in zip(
                self._N_NODES, self._infected_counts(), semillas, self._graph_files()):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected, semilla, 1, feature_cov_matrix=None)
            # ``a`` is a SciPy sparse matrix, so NumPy stores it as a pickled object array.
            np.savez(filename, x=graph.x, a=graph.a, y=graph.y)
            del graph
            gc.collect()

    def read(self):
        """Load the stored graphs and return them as a list of ``Graph`` objects."""
        output = []
        for filename in self._graph_files():
            # allow_pickle is needed because ``a`` was stored as a pickled sparse matrix;
            # only load files written by download(). The ``with`` block closes the file.
            with np.load(filename, allow_pickle=True) as data:
                # ``[()]`` unwraps the 0-d object array (``.item()`` would work as well).
                output.append(Graph(x=data['x'], a=data['a'][()], y=data['y']))
        return output
    
    


In [16]:
# Case 4 dataset (2% infected nodes, mixed classes).
# NOTE(review): presumably spektral's Dataset constructor calls download() when `path`
# does not exist yet and then read() — confirm against the spektral documentation.
dataset4 = synthetic_Dir_diffSize_NoBalanced_2percent_clasesMezcl()

# Caso 5: dirigido, diferentes tamaños, NO balanceado (5%), clases separadas


Tamaños: 25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800

Clases "separadas" 

In [21]:
class synthetic_Dir_diffSize_NoBalanced_5percent_clasesSep(Dataset):
    """Case 5: ten directed synthetic graphs, 5% infected nodes, "separated" classes.

    One graph is produced per size in ``_N_NODES``. Class 1 ("infected") contains
    ``round(5% * n_nodes)`` nodes (at least one); every other node is in class 0.
    ``download()`` generates the graphs and stores them as ``.npz`` files under
    ``path``; ``read()`` loads those files back as ``Graph`` objects.
    """

    # Number of nodes of each generated graph (one graph per entry).
    _N_NODES = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes of every graph that is labelled as infected (class 1).
    _INFECTED_FRACTION = 5/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder that holds the ``.npz`` files of this dataset."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the number of infected nodes for every graph size (never below 1)."""
        fraction = self._INFECTED_FRACTION
        return [round(fraction * n) if fraction * n >= 1 else 1 for n in self._N_NODES]

    def _graph_files(self):
        """Return the ``.npz`` path of every graph, in the order of ``_N_NODES``."""
        return [
            os.path.join(self.path, f'graph_Dir_NoBalanced_{n_inf}a{n}_clasesSep_0{j}.npz')
            for j, (n_inf, n) in enumerate(zip(self._infected_counts(), self._N_NODES))
        ]

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix,
                                       node_feature_means, n_infected, semilla, indice,
                                       feature_cov_matrix=None):
        """Generate one directed graph with Gaussian node features and block-structured edges.

        Args:
            n_nodes: number of nodes of the graph.
            n_communities: number of communities (also the width of the one-hot labels).
            n_features: number of features per node.
            edge_prob_matrix: array of shape (n_communities, n_communities); entry
                ``[r, s]`` scales the probability of an edge from a node of community
                ``r`` to a node of community ``s``.
            node_feature_means: array of shape (n_communities, n_features) with the
                mean feature vector of every community.
            n_infected: number of nodes assigned to community 1.
            semilla, indice: the NumPy global seed is ``semilla * (indice + 1)``.
            feature_cov_matrix: covariance of the node features; identity when ``None``.

        Returns:
            Graph with ``x`` of shape (n_nodes, n_features), ``a`` a float CSR matrix of
            shape (n_nodes, n_nodes) without self-loops, and one-hot ``y`` of shape
            (n_nodes, n_communities).

        Raises:
            ValueError: if ``n_infected`` is larger than ``n_nodes``.
        """
        np.random.seed(semilla * (indice + 1))

        # Draw the infected nodes WITHOUT replacement: sampling with replacement can
        # repeat a node and leave fewer than n_infected nodes in class 1.
        infected_nodes = np.random.choice(n_nodes, size=n_infected, replace=False)
        communities = np.zeros(n_nodes, dtype=int)
        communities[infected_nodes] = 1

        # Node features: one Gaussian per community.
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            members = np.where(communities == k)[0]
            features[members] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(members))

        # Soft community membership of every node, derived from its features.
        membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Sample the edges one row at a time (vectorised over the target nodes) and
        # assemble the CSR matrix directly, so no dense n x n matrix is ever built.
        # P(i -> j) = edge_prob_matrix[c_i, c_j] * membership[i, c_j] * membership[j, c_i]
        indptr = np.zeros(n_nodes + 1, dtype=np.int64)
        column_chunks = []
        for i in range(n_nodes):
            community_i = communities[i]
            row_probs = (edge_prob_matrix[community_i, communities]
                         * membership_probs[i, communities]
                         * membership_probs[:, community_i])
            row_probs[i] = 0.0  # no self-loops
            targets = np.flatnonzero(np.random.binomial(1, row_probs))
            column_chunks.append(targets)
            indptr[i + 1] = indptr[i] + targets.size
        indices = np.concatenate(column_chunks)
        adjacency_matrix = sparse.csr_matrix(
            (np.ones(indices.size, dtype=np.float64), indices, indptr),
            shape=(n_nodes, n_nodes))

        # num_classes pins the label width even if a community happens to be empty.
        labels = tf.keras.utils.to_categorical(communities, num_classes=n_communities)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and store it as an ``.npz`` file under ``path``."""
        # exist_ok: do not fail if the folder is already there (e.g. after an interrupted run).
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        # Probability matrix for edges between communities.
        edge_prob_matrix = np.array([
            [0.8, 0.2],
            [0.3, 0.7]
        ])
        # Node feature means for each community (well-separated classes).
        node_feature_means = np.array([
            [2, 1],
            [1, 2]
        ])
        # NOTE(review): 679 breaks the 123, 234, ... digit pattern (678 would fit) —
        # possibly a typo; kept unchanged.
        semillas = [123, 234, 345, 456, 567, 679, 789, 321, 654, 987]

        # Generate, save and release one graph at a time to keep peak memory low.
        for n_nodes, n_infected, semilla, filename in zip(
                self._N_NODES, self._infected_counts(), semillas, self._graph_files()):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected, semilla, 0, feature_cov_matrix=None)
            # ``a`` is a SciPy sparse matrix, so NumPy stores it as a pickled object array.
            np.savez(filename, x=graph.x, a=graph.a, y=graph.y)
            del graph
            gc.collect()

    def read(self):
        """Load the stored graphs and return them as a list of ``Graph`` objects."""
        output = []
        for filename in self._graph_files():
            # allow_pickle is needed because ``a`` was stored as a pickled sparse matrix;
            # only load files written by download(). The ``with`` block closes the file.
            with np.load(filename, allow_pickle=True) as data:
                # ``[()]`` unwraps the 0-d object array (``.item()`` would work as well).
                output.append(Graph(x=data['x'], a=data['a'][()], y=data['y']))
        return output
    
    


In [22]:
# Case 5 dataset (5% infected nodes, separated classes).
# NOTE(review): presumably spektral's Dataset constructor calls download() when `path`
# does not exist yet and then read() — confirm against the spektral documentation.
dataset5 = synthetic_Dir_diffSize_NoBalanced_5percent_clasesSep()

# Caso 6: dirigido, diferentes tamaños, NO balanceado (5%), clases mezcladas


Tamaños: 25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800

Clases "mezcladas"

In [25]:
class synthetic_Dir_diffSize_NoBalanced_5percent_clasesMezcl(Dataset):
    """Case 6: ten directed synthetic graphs, 5% infected nodes, "mixed" classes.

    One graph is produced per size in ``_N_NODES``. Class 1 ("infected") contains
    ``round(5% * n_nodes)`` nodes (at least one); every other node is in class 0.
    Both classes share the same feature mean and the same edge probabilities.
    ``download()`` generates the graphs and stores them as ``.npz`` files under
    ``path``; ``read()`` loads those files back as ``Graph`` objects.
    """

    # Number of nodes of each generated graph (one graph per entry).
    _N_NODES = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes of every graph that is labelled as infected (class 1).
    _INFECTED_FRACTION = 5/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder that holds the ``.npz`` files of this dataset."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the number of infected nodes for every graph size (never below 1)."""
        fraction = self._INFECTED_FRACTION
        return [round(fraction * n) if fraction * n >= 1 else 1 for n in self._N_NODES]

    def _graph_files(self):
        """Return the ``.npz`` path of every graph, in the order of ``_N_NODES``."""
        return [
            os.path.join(self.path, f'graph_Dir_NoBalanced_{n_inf}a{n}_clasesMezcl_2{j}.npz')
            for j, (n_inf, n) in enumerate(zip(self._infected_counts(), self._N_NODES))
        ]

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix,
                                       node_feature_means, n_infected, semilla, indice,
                                       feature_cov_matrix=None):
        """Generate one directed graph with Gaussian node features and block-structured edges.

        Args:
            n_nodes: number of nodes of the graph.
            n_communities: number of communities (also the width of the one-hot labels).
            n_features: number of features per node.
            edge_prob_matrix: array of shape (n_communities, n_communities); entry
                ``[r, s]`` scales the probability of an edge from a node of community
                ``r`` to a node of community ``s``.
            node_feature_means: array of shape (n_communities, n_features) with the
                mean feature vector of every community.
            n_infected: number of nodes assigned to community 1.
            semilla, indice: the NumPy global seed is ``semilla * (indice + 1)``.
            feature_cov_matrix: covariance of the node features; identity when ``None``.

        Returns:
            Graph with ``x`` of shape (n_nodes, n_features), ``a`` a float CSR matrix of
            shape (n_nodes, n_nodes) without self-loops, and one-hot ``y`` of shape
            (n_nodes, n_communities).

        Raises:
            ValueError: if ``n_infected`` is larger than ``n_nodes``.
        """
        np.random.seed(semilla * (indice + 1))

        # Draw the infected nodes WITHOUT replacement: sampling with replacement can
        # repeat a node and leave fewer than n_infected nodes in class 1.
        infected_nodes = np.random.choice(n_nodes, size=n_infected, replace=False)
        communities = np.zeros(n_nodes, dtype=int)
        communities[infected_nodes] = 1

        # Node features: one Gaussian per community.
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            members = np.where(communities == k)[0]
            features[members] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(members))

        # Soft community membership of every node, derived from its features.
        membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Sample the edges one row at a time (vectorised over the target nodes) and
        # assemble the CSR matrix directly, so no dense n x n matrix is ever built.
        # P(i -> j) = edge_prob_matrix[c_i, c_j] * membership[i, c_j] * membership[j, c_i]
        indptr = np.zeros(n_nodes + 1, dtype=np.int64)
        column_chunks = []
        for i in range(n_nodes):
            community_i = communities[i]
            row_probs = (edge_prob_matrix[community_i, communities]
                         * membership_probs[i, communities]
                         * membership_probs[:, community_i])
            row_probs[i] = 0.0  # no self-loops
            targets = np.flatnonzero(np.random.binomial(1, row_probs))
            column_chunks.append(targets)
            indptr[i + 1] = indptr[i] + targets.size
        indices = np.concatenate(column_chunks)
        adjacency_matrix = sparse.csr_matrix(
            (np.ones(indices.size, dtype=np.float64), indices, indptr),
            shape=(n_nodes, n_nodes))

        # num_classes pins the label width even if a community happens to be empty.
        labels = tf.keras.utils.to_categorical(communities, num_classes=n_communities)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and store it as an ``.npz`` file under ``path``."""
        # exist_ok: do not fail if the folder is already there (e.g. after an interrupted run).
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        # Uniform probability matrix: the factor does not depend on the communities.
        edge_prob_matrix = np.array([
            [0.5, 0.5],
            [0.5, 0.5]
        ])
        # Both communities share the same feature mean (classes closer together).
        node_feature_means = np.array([
            [1, 1],
            [1, 1]
        ])
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Generate, save and release one graph at a time to keep peak memory low.
        for n_nodes, n_infected, semilla, filename in zip(
                self._N_NODES, self._infected_counts(), semillas, self._graph_files()):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected, semilla, 1, feature_cov_matrix=None)
            # ``a`` is a SciPy sparse matrix, so NumPy stores it as a pickled object array.
            np.savez(filename, x=graph.x, a=graph.a, y=graph.y)
            del graph
            gc.collect()

    def read(self):
        """Load the stored graphs and return them as a list of ``Graph`` objects."""
        output = []
        for filename in self._graph_files():
            # allow_pickle is needed because ``a`` was stored as a pickled sparse matrix;
            # only load files written by download(). The ``with`` block closes the file.
            with np.load(filename, allow_pickle=True) as data:
                # ``[()]`` unwraps the 0-d object array (``.item()`` would work as well).
                output.append(Graph(x=data['x'], a=data['a'][()], y=data['y']))
        return output
    
    


In [26]:
# Case 6 dataset (5% infected nodes, mixed classes).
# NOTE(review): presumably spektral's Dataset constructor calls download() when `path`
# does not exist yet and then read() — confirm against the spektral documentation.
dataset6 = synthetic_Dir_diffSize_NoBalanced_5percent_clasesMezcl()

# Caso 7: dirigido, diferentes tamaños, NO balanceado (10%), clases separadas


Tamaños: 25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800

Clases "separadas" 

In [31]:
class synthetic_Dir_diffSize_NoBalanced_10percent_clasesSep(Dataset):
    """Case 7: ten directed synthetic graphs, 10% infected nodes, "separated" classes.

    One graph is produced per size in ``_N_NODES``. Class 1 ("infected") contains
    ``round(10% * n_nodes)`` nodes (at least one); every other node is in class 0.
    ``download()`` generates the graphs and stores them as ``.npz`` files under
    ``path``; ``read()`` loads those files back as ``Graph`` objects.
    """

    # Number of nodes of each generated graph (one graph per entry).
    _N_NODES = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes of every graph that is labelled as infected (class 1).
    _INFECTED_FRACTION = 10/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder that holds the ``.npz`` files of this dataset."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the number of infected nodes for every graph size (never below 1)."""
        fraction = self._INFECTED_FRACTION
        return [round(fraction * n) if fraction * n >= 1 else 1 for n in self._N_NODES]

    def _graph_files(self):
        """Return the ``.npz`` path of every graph, in the order of ``_N_NODES``."""
        return [
            os.path.join(self.path, f'graph_Dir_NoBalanced_{n_inf}a{n}_clasesSep_0{j}.npz')
            for j, (n_inf, n) in enumerate(zip(self._infected_counts(), self._N_NODES))
        ]

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix,
                                       node_feature_means, n_infected, semilla, indice,
                                       feature_cov_matrix=None):
        """Generate one directed graph with Gaussian node features and block-structured edges.

        Args:
            n_nodes: number of nodes of the graph.
            n_communities: number of communities (also the width of the one-hot labels).
            n_features: number of features per node.
            edge_prob_matrix: array of shape (n_communities, n_communities); entry
                ``[r, s]`` scales the probability of an edge from a node of community
                ``r`` to a node of community ``s``.
            node_feature_means: array of shape (n_communities, n_features) with the
                mean feature vector of every community.
            n_infected: number of nodes assigned to community 1.
            semilla, indice: the NumPy global seed is ``semilla * (indice + 1)``.
            feature_cov_matrix: covariance of the node features; identity when ``None``.

        Returns:
            Graph with ``x`` of shape (n_nodes, n_features), ``a`` a float CSR matrix of
            shape (n_nodes, n_nodes) without self-loops, and one-hot ``y`` of shape
            (n_nodes, n_communities).

        Raises:
            ValueError: if ``n_infected`` is larger than ``n_nodes``.
        """
        np.random.seed(semilla * (indice + 1))

        # Draw the infected nodes WITHOUT replacement: sampling with replacement can
        # repeat a node and leave fewer than n_infected nodes in class 1.
        infected_nodes = np.random.choice(n_nodes, size=n_infected, replace=False)
        communities = np.zeros(n_nodes, dtype=int)
        communities[infected_nodes] = 1

        # Node features: one Gaussian per community.
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            members = np.where(communities == k)[0]
            features[members] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(members))

        # Soft community membership of every node, derived from its features.
        membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Sample the edges one row at a time (vectorised over the target nodes) and
        # assemble the CSR matrix directly, so no dense n x n matrix is ever built.
        # P(i -> j) = edge_prob_matrix[c_i, c_j] * membership[i, c_j] * membership[j, c_i]
        indptr = np.zeros(n_nodes + 1, dtype=np.int64)
        column_chunks = []
        for i in range(n_nodes):
            community_i = communities[i]
            row_probs = (edge_prob_matrix[community_i, communities]
                         * membership_probs[i, communities]
                         * membership_probs[:, community_i])
            row_probs[i] = 0.0  # no self-loops
            targets = np.flatnonzero(np.random.binomial(1, row_probs))
            column_chunks.append(targets)
            indptr[i + 1] = indptr[i] + targets.size
        indices = np.concatenate(column_chunks)
        adjacency_matrix = sparse.csr_matrix(
            (np.ones(indices.size, dtype=np.float64), indices, indptr),
            shape=(n_nodes, n_nodes))

        # num_classes pins the label width even if a community happens to be empty.
        labels = tf.keras.utils.to_categorical(communities, num_classes=n_communities)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and store it as an ``.npz`` file under ``path``."""
        # exist_ok: do not fail if the folder is already there (e.g. after an interrupted run).
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        # Probability matrix for edges between communities.
        edge_prob_matrix = np.array([
            [0.8, 0.2],
            [0.3, 0.7]
        ])
        # Node feature means for each community (well-separated classes).
        node_feature_means = np.array([
            [2, 1],
            [1, 2]
        ])
        # NOTE(review): 679 breaks the 123, 234, ... digit pattern (678 would fit) —
        # possibly a typo; kept unchanged.
        semillas = [123, 234, 345, 456, 567, 679, 789, 321, 654, 987]

        # Generate, save and release one graph at a time to keep peak memory low.
        for n_nodes, n_infected, semilla, filename in zip(
                self._N_NODES, self._infected_counts(), semillas, self._graph_files()):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected, semilla, 0, feature_cov_matrix=None)
            # ``a`` is a SciPy sparse matrix, so NumPy stores it as a pickled object array.
            np.savez(filename, x=graph.x, a=graph.a, y=graph.y)
            del graph
            gc.collect()

    def read(self):
        """Load the stored graphs and return them as a list of ``Graph`` objects."""
        output = []
        for filename in self._graph_files():
            # allow_pickle is needed because ``a`` was stored as a pickled sparse matrix;
            # only load files written by download(). The ``with`` block closes the file.
            with np.load(filename, allow_pickle=True) as data:
                # ``[()]`` unwraps the 0-d object array (``.item()`` would work as well).
                output.append(Graph(x=data['x'], a=data['a'][()], y=data['y']))
        return output
    
    


In [32]:
# Case 7 dataset (10% infected nodes, separated classes).
# NOTE(review): presumably spektral's Dataset constructor calls download() when `path`
# does not exist yet and then read() — confirm against the spektral documentation.
dataset7 = synthetic_Dir_diffSize_NoBalanced_10percent_clasesSep()

# Caso 8: dirigido, diferentes tamaños, NO balanceado (10%), clases mezcladas


Tamaños: 25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800

Clases "mezcladas"

In [35]:
class synthetic_Dir_diffSize_NoBalanced_10percent_clasesMezcl(Dataset):
    """Ten directed synthetic graphs of growing size, 10% minority class, mixed classes.

    Each graph is sampled by ``_generate_synthetic_graph_csbm`` and cached as an
    ``.npz`` file under ``path``. "Mixed" means both communities share the same
    feature mean ([1, 1]) and every entry of the edge-probability matrix is 0.5.
    """

    # Shared by download() and read() so the file names they build cannot drift apart.
    _N_NODES = [25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800]
    # Requested share of nodes in the minority ("infected") class.
    _INFECTED_FRACTION = 10/100
    # One base seed per graph; the RNG is seeded with seed * (_SEED_INDEX + 1).
    _SEEDS = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]
    _SEED_INDEX = 1

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where this dataset's ``.npz`` files are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _n_infected(self):
        """Return the requested minority-class size per graph (fraction of nodes, at least 1)."""
        frac = self._INFECTED_FRACTION
        return [round(frac*n) if frac*n >= 1 else 1 for n in self._N_NODES]

    def _graph_file(self, j, n_infected):
        """Return the full path of the ``.npz`` file that holds graph number ``j``."""
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected[j]}a{self._N_NODES[j]}_clasesMezcl_2{j}.npz')

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,
                                       n_infected, semilla, indice, feature_cov_matrix=None):
        """Sample one directed graph with Gaussian node features and one-hot labels.

        Args:
            n_nodes: number of nodes.
            n_communities: number of communities (rows of ``node_feature_means``).
            n_features: dimension of the node features.
            edge_prob_matrix: base edge probability per (source community, target community).
            node_feature_means: per-community mean of the node features.
            n_infected: number of random draws used to pick community-1 nodes.
            semilla: base random seed.
            indice: seed offset; NumPy's global RNG is seeded with ``semilla*(indice+1)``.
            feature_cov_matrix: feature covariance; identity when ``None``.

        Returns:
            ``Graph`` built from the features, the CSR adjacency matrix and the one-hot labels.
        """
        # Assign nodes to communities: drawn node ids form community 1, the rest community 0.
        np.random.seed(semilla*(indice+1))
        # NOTE(review): ids are drawn with replacement, so duplicates can leave fewer
        # than n_infected nodes in community 1 even though the file name embeds
        # n_infected. Left unchanged to keep the generated graphs reproducible.
        indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
        communities = np.zeros(n_nodes, dtype=int)
        communities[indices] = 1

        # Generate node features
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            nodes_in_community = np.where(communities == k)[0]
            features[nodes_in_community] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(nodes_in_community))

        # Compute community membership probabilities based on node features
        community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Generate edges one source row at a time. For the pair (i, j) the probability is
        # edge_prob_matrix[c_i, c_j] * probs[i, c_j] * probs[j, c_i], exactly as before.
        # RandomState.binomial fills array outputs element by element, so drawing a
        # whole row (self-loop skipped) is expected to consume the global RNG in the
        # same order as one scalar draw per pair, without millions of Python-level calls.
        adjacency_matrix = np.zeros((n_nodes, n_nodes))
        node_ids = np.arange(n_nodes)
        for i in range(n_nodes):
            community_i = communities[i]
            edge_probs = (edge_prob_matrix[community_i, communities]
                          * community_membership_probs[i, communities]
                          * community_membership_probs[:, community_i])
            targets = node_ids != i
            adjacency_matrix[i, targets] = np.random.binomial(1, edge_probs[targets])

        labels = tf.keras.utils.to_categorical(communities)
        adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and write it to ``path`` as an ``.npz`` file."""
        # exist_ok lets download() be re-run over an already existing folder.
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        n_infected = self._n_infected()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
            [0.5, 0.5],
            [0.5, 0.5]
        ])
        # Node feature means for each community
        node_feature_means = np.array([
            [1, 1],
            [1, 1]
        ])

        # Classes a bit closer together. Graphs are generated and saved one at a
        # time so only one of them is held in memory.
        for j, n_nodes in enumerate(self._N_NODES):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected[j], self._SEEDS[j], self._SEED_INDEX, feature_cov_matrix=None)
            np.savez(self._graph_file(j, n_infected), x=graph.x, a=graph.a, y=graph.y)
            # Free memory
            del graph
            gc.collect()

    def read(self):
        """Load every cached graph and return them as a list of ``Graph`` objects."""
        # We must return a list of Graph objects
        output = []
        n_infected = self._n_infected()

        for j in range(len(self._N_NODES)):
            # allow_pickle=True is needed because 'a' is saved from a SciPy sparse
            # matrix, which np.savez stores as a pickled object. Unpickling can run
            # arbitrary code, so only load files written by download().
            # The context manager closes the NpzFile instead of leaking the handle.
            with np.load(self._graph_file(j, n_infected), allow_pickle=True) as data2:
                output.append(
                    Graph(x=data2['x'], a=data2['a'][()], y=data2['y'])  # a=data2['a'].item() also works
                )

        return output
    
    


In [36]:
# Instantiate the "Caso 8" dataset (10% infected, mixed classes).
# NOTE(review): spektral's Dataset constructor presumably calls download() when
# `path` does not exist yet and then read() - confirm in the spektral docs.
dataset8 = synthetic_Dir_diffSize_NoBalanced_10percent_clasesMezcl()

# Caso 9: dirigido, diferentes tamaños, NO balanceado (20%), clases separadas


Tamaños: 25,50,100,200,400,800,1600,3200,6400,12800

Clases "separadas" 

In [39]:
class synthetic_Dir_diffSize_NoBalanced_20percent_clasesSep(Dataset):
    """Ten directed synthetic graphs of growing size, 20% minority class, separated classes.

    Each graph is sampled by ``_generate_synthetic_graph_csbm`` and cached as an
    ``.npz`` file under ``path``. "Separated" means the communities have different
    feature means ([2, 1] vs [1, 2]) and the base edge probability is higher inside
    a community (0.8 / 0.7) than across communities (0.2 / 0.3).
    """

    # Shared by download() and read() so the file names they build cannot drift apart.
    _N_NODES = [25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800]
    # Requested share of nodes in the minority ("infected") class.
    _INFECTED_FRACTION = 20/100
    # One base seed per graph; the RNG is seeded with seed * (_SEED_INDEX + 1).
    # NOTE(review): 679 at position 5 differs from the 678 used by the 'clasesMezcl'
    # and '_flattened' datasets in this notebook. Kept as-is because changing it
    # would change the generated graph; confirm whether it is intentional.
    _SEEDS = [123, 234, 345, 456, 567, 679, 789, 321, 654, 987]
    _SEED_INDEX = 0

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where this dataset's ``.npz`` files are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _n_infected(self):
        """Return the requested minority-class size per graph (fraction of nodes, at least 1)."""
        frac = self._INFECTED_FRACTION
        return [round(frac*n) if frac*n >= 1 else 1 for n in self._N_NODES]

    def _graph_file(self, j, n_infected):
        """Return the full path of the ``.npz`` file that holds graph number ``j``."""
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected[j]}a{self._N_NODES[j]}_clasesSep_0{j}.npz')

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,
                                       n_infected, semilla, indice, feature_cov_matrix=None):
        """Sample one directed graph with Gaussian node features and one-hot labels.

        Args:
            n_nodes: number of nodes.
            n_communities: number of communities (rows of ``node_feature_means``).
            n_features: dimension of the node features.
            edge_prob_matrix: base edge probability per (source community, target community).
            node_feature_means: per-community mean of the node features.
            n_infected: number of random draws used to pick community-1 nodes.
            semilla: base random seed.
            indice: seed offset; NumPy's global RNG is seeded with ``semilla*(indice+1)``.
            feature_cov_matrix: feature covariance; identity when ``None``.

        Returns:
            ``Graph`` built from the features, the CSR adjacency matrix and the one-hot labels.
        """
        # Assign nodes to communities: drawn node ids form community 1, the rest community 0.
        np.random.seed(semilla*(indice+1))
        # NOTE(review): ids are drawn with replacement, so duplicates can leave fewer
        # than n_infected nodes in community 1 even though the file name embeds
        # n_infected. Left unchanged to keep the generated graphs reproducible.
        indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
        communities = np.zeros(n_nodes, dtype=int)
        communities[indices] = 1

        # Generate node features
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            nodes_in_community = np.where(communities == k)[0]
            features[nodes_in_community] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(nodes_in_community))

        # Compute community membership probabilities based on node features
        community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Generate edges one source row at a time. For the pair (i, j) the probability is
        # edge_prob_matrix[c_i, c_j] * probs[i, c_j] * probs[j, c_i], exactly as before.
        # RandomState.binomial fills array outputs element by element, so drawing a
        # whole row (self-loop skipped) is expected to consume the global RNG in the
        # same order as one scalar draw per pair, without millions of Python-level calls.
        adjacency_matrix = np.zeros((n_nodes, n_nodes))
        node_ids = np.arange(n_nodes)
        for i in range(n_nodes):
            community_i = communities[i]
            edge_probs = (edge_prob_matrix[community_i, communities]
                          * community_membership_probs[i, communities]
                          * community_membership_probs[:, community_i])
            targets = node_ids != i
            adjacency_matrix[i, targets] = np.random.binomial(1, edge_probs[targets])

        labels = tf.keras.utils.to_categorical(communities)
        adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and write it to ``path`` as an ``.npz`` file."""
        # exist_ok lets download() be re-run over an already existing folder.
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        n_infected = self._n_infected()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
            [0.8, 0.2],
            [0.3, 0.7]
        ])
        # Node feature means for each community
        node_feature_means = np.array([
            [2, 1],
            [1, 2]
        ])

        # Well-separated classes. Graphs are generated and saved one at a time so
        # only one of them is held in memory.
        for j, n_nodes in enumerate(self._N_NODES):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected[j], self._SEEDS[j], self._SEED_INDEX, feature_cov_matrix=None)
            np.savez(self._graph_file(j, n_infected), x=graph.x, a=graph.a, y=graph.y)
            # Free memory
            del graph
            gc.collect()

    def read(self):
        """Load every cached graph and return them as a list of ``Graph`` objects."""
        # We must return a list of Graph objects
        output = []
        n_infected = self._n_infected()

        for j in range(len(self._N_NODES)):
            # allow_pickle=True is needed because 'a' is saved from a SciPy sparse
            # matrix, which np.savez stores as a pickled object. Unpickling can run
            # arbitrary code, so only load files written by download().
            # The context manager closes the NpzFile instead of leaking the handle.
            with np.load(self._graph_file(j, n_infected), allow_pickle=True) as data1:
                output.append(
                    Graph(x=data1['x'], a=data1['a'][()], y=data1['y'])  # a=data1['a'].item() also works
                )

        return output
    
    


In [40]:
# Instantiate the "Caso 9" dataset (20% infected, separated classes).
# NOTE(review): spektral's Dataset constructor presumably calls download() when
# `path` does not exist yet and then read() - confirm in the spektral docs.
dataset9 = synthetic_Dir_diffSize_NoBalanced_20percent_clasesSep()

# Caso 10: dirigido, diferentes tamaños, NO balanceado (20%), clases mezcladas


Tamaños: 25,50,100,200,400,800,1600,3200,6400,12800

Clases "mezcladas"

In [43]:
class synthetic_Dir_diffSize_NoBalanced_20percent_clasesMezcl(Dataset):
    """Ten directed synthetic graphs of growing size, 20% minority class, mixed classes.

    Each graph is sampled by ``_generate_synthetic_graph_csbm`` and cached as an
    ``.npz`` file under ``path``. "Mixed" means both communities share the same
    feature mean ([1, 1]) and every entry of the edge-probability matrix is 0.5.
    """

    # Shared by download() and read() so the file names they build cannot drift apart.
    _N_NODES = [25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800]
    # Requested share of nodes in the minority ("infected") class.
    _INFECTED_FRACTION = 20/100
    # One base seed per graph; the RNG is seeded with seed * (_SEED_INDEX + 1).
    _SEEDS = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]
    _SEED_INDEX = 1

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where this dataset's ``.npz`` files are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _n_infected(self):
        """Return the requested minority-class size per graph (fraction of nodes, at least 1)."""
        frac = self._INFECTED_FRACTION
        return [round(frac*n) if frac*n >= 1 else 1 for n in self._N_NODES]

    def _graph_file(self, j, n_infected):
        """Return the full path of the ``.npz`` file that holds graph number ``j``."""
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected[j]}a{self._N_NODES[j]}_clasesMezcl_2{j}.npz')

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,
                                       n_infected, semilla, indice, feature_cov_matrix=None):
        """Sample one directed graph with Gaussian node features and one-hot labels.

        Args:
            n_nodes: number of nodes.
            n_communities: number of communities (rows of ``node_feature_means``).
            n_features: dimension of the node features.
            edge_prob_matrix: base edge probability per (source community, target community).
            node_feature_means: per-community mean of the node features.
            n_infected: number of random draws used to pick community-1 nodes.
            semilla: base random seed.
            indice: seed offset; NumPy's global RNG is seeded with ``semilla*(indice+1)``.
            feature_cov_matrix: feature covariance; identity when ``None``.

        Returns:
            ``Graph`` built from the features, the CSR adjacency matrix and the one-hot labels.
        """
        # Assign nodes to communities: drawn node ids form community 1, the rest community 0.
        np.random.seed(semilla*(indice+1))
        # NOTE(review): ids are drawn with replacement, so duplicates can leave fewer
        # than n_infected nodes in community 1 even though the file name embeds
        # n_infected. Left unchanged to keep the generated graphs reproducible.
        indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
        communities = np.zeros(n_nodes, dtype=int)
        communities[indices] = 1

        # Generate node features
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            nodes_in_community = np.where(communities == k)[0]
            features[nodes_in_community] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(nodes_in_community))

        # Compute community membership probabilities based on node features
        community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Generate edges one source row at a time. For the pair (i, j) the probability is
        # edge_prob_matrix[c_i, c_j] * probs[i, c_j] * probs[j, c_i], exactly as before.
        # RandomState.binomial fills array outputs element by element, so drawing a
        # whole row (self-loop skipped) is expected to consume the global RNG in the
        # same order as one scalar draw per pair, without millions of Python-level calls.
        adjacency_matrix = np.zeros((n_nodes, n_nodes))
        node_ids = np.arange(n_nodes)
        for i in range(n_nodes):
            community_i = communities[i]
            edge_probs = (edge_prob_matrix[community_i, communities]
                          * community_membership_probs[i, communities]
                          * community_membership_probs[:, community_i])
            targets = node_ids != i
            adjacency_matrix[i, targets] = np.random.binomial(1, edge_probs[targets])

        labels = tf.keras.utils.to_categorical(communities)
        adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
        return Graph(x=features, a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and write it to ``path`` as an ``.npz`` file."""
        # exist_ok lets download() be re-run over an already existing folder.
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        n_infected = self._n_infected()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
            [0.5, 0.5],
            [0.5, 0.5]
        ])
        # Node feature means for each community
        node_feature_means = np.array([
            [1, 1],
            [1, 1]
        ])

        # Classes a bit closer together. Graphs are generated and saved one at a
        # time so only one of them is held in memory.
        for j, n_nodes in enumerate(self._N_NODES):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected[j], self._SEEDS[j], self._SEED_INDEX, feature_cov_matrix=None)
            np.savez(self._graph_file(j, n_infected), x=graph.x, a=graph.a, y=graph.y)
            # Free memory
            del graph
            gc.collect()

    def read(self):
        """Load every cached graph and return them as a list of ``Graph`` objects."""
        # We must return a list of Graph objects
        output = []
        n_infected = self._n_infected()

        for j in range(len(self._N_NODES)):
            # allow_pickle=True is needed because 'a' is saved from a SciPy sparse
            # matrix, which np.savez stores as a pickled object. Unpickling can run
            # arbitrary code, so only load files written by download().
            # The context manager closes the NpzFile instead of leaking the handle.
            with np.load(self._graph_file(j, n_infected), allow_pickle=True) as data2:
                output.append(
                    Graph(x=data2['x'], a=data2['a'][()], y=data2['y'])  # a=data2['a'].item() also works
                )

        return output
    
    


In [44]:
# Instantiate the "Caso 10" dataset (20% infected, mixed classes).
# NOTE(review): spektral's Dataset constructor presumably calls download() when
# `path` does not exist yet and then read() - confirm in the spektral docs.
dataset10 = synthetic_Dir_diffSize_NoBalanced_20percent_clasesMezcl()

# Caso 11: dirigido, diferentes tamaños, NO balanceado (1%), clases separadas, features aplanadas

Grafos iguales a los del caso 1, salvo que las features están aplanadas a 1

In [47]:
class synthetic_Dir_diffSize_NoBalanced_1percent_clasesSep_flattened(Dataset):
    """Ten directed synthetic graphs, 1% minority class, separated classes, flattened features.

    Each graph is sampled by ``_generate_synthetic_graph_csbm`` and cached as an
    ``.npz`` file under ``path``. Gaussian node features are still sampled and
    drive the edge probabilities, but the stored feature matrix ``x`` is all ones.
    """

    # Shared by download() and read() so the file names they build cannot drift apart.
    _N_NODES = [25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800]
    # Requested share of nodes in the minority ("infected") class.
    _INFECTED_FRACTION = 1/100
    # One base seed per graph; the RNG is seeded with seed * (_SEED_INDEX + 1).
    # NOTE(review): the non-flattened 'clasesSep' datasets visible in this notebook
    # (10% and 20%) use 679 at position 5 instead of 678. If the 1% one does too,
    # graph 5 here is not the same graph as its non-flattened counterpart - verify.
    _SEEDS = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]
    _SEED_INDEX = 0

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where this dataset's ``.npz`` files are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _n_infected(self):
        """Return the requested minority-class size per graph (fraction of nodes, at least 1)."""
        frac = self._INFECTED_FRACTION
        return [round(frac*n) if frac*n >= 1 else 1 for n in self._N_NODES]

    def _graph_file(self, j, n_infected):
        """Return the full path of the ``.npz`` file that holds graph number ``j``."""
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected[j]}a{self._N_NODES[j]}_clasesSep_0{j}_flattened.npz')

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,
                                       n_infected, semilla, indice, feature_cov_matrix=None):
        """Sample one directed graph with all-ones node features and one-hot labels.

        Args:
            n_nodes: number of nodes.
            n_communities: number of communities (rows of ``node_feature_means``).
            n_features: dimension of the node features.
            edge_prob_matrix: base edge probability per (source community, target community).
            node_feature_means: per-community mean of the latent node features.
            n_infected: number of random draws used to pick community-1 nodes.
            semilla: base random seed.
            indice: seed offset; NumPy's global RNG is seeded with ``semilla*(indice+1)``.
            feature_cov_matrix: feature covariance; identity when ``None``.

        Returns:
            ``Graph`` built from an all-ones feature matrix, the CSR adjacency
            matrix and the one-hot labels.
        """
        # Assign nodes to communities: drawn node ids form community 1, the rest community 0.
        np.random.seed(semilla*(indice+1))
        # NOTE(review): ids are drawn with replacement, so duplicates can leave fewer
        # than n_infected nodes in community 1 even though the file name embeds
        # n_infected. Left unchanged to keep the generated graphs reproducible.
        indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
        communities = np.zeros(n_nodes, dtype=int)
        communities[indices] = 1

        # Generate latent node features (only used to derive edge probabilities)
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            nodes_in_community = np.where(communities == k)[0]
            features[nodes_in_community] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(nodes_in_community))

        # Compute community membership probabilities based on node features
        community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Generate edges one source row at a time. For the pair (i, j) the probability is
        # edge_prob_matrix[c_i, c_j] * probs[i, c_j] * probs[j, c_i], exactly as before.
        # RandomState.binomial fills array outputs element by element, so drawing a
        # whole row (self-loop skipped) is expected to consume the global RNG in the
        # same order as one scalar draw per pair, without millions of Python-level calls.
        adjacency_matrix = np.zeros((n_nodes, n_nodes))
        node_ids = np.arange(n_nodes)
        for i in range(n_nodes):
            community_i = communities[i]
            edge_probs = (edge_prob_matrix[community_i, communities]
                          * community_membership_probs[i, communities]
                          * community_membership_probs[:, community_i])
            targets = node_ids != i
            adjacency_matrix[i, targets] = np.random.binomial(1, edge_probs[targets])

        labels = tf.keras.utils.to_categorical(communities)
        adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
        # Flattened features: every node gets the same all-ones feature vector.
        return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and write it to ``path`` as an ``.npz`` file."""
        # exist_ok lets download() be re-run over an already existing folder.
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        n_infected = self._n_infected()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
            [0.8, 0.2],
            [0.3, 0.7]
        ])
        # Node feature means for each community
        node_feature_means = np.array([
            [2, 1],
            [1, 2]
        ])

        # Well-separated classes. Graphs are generated and saved one at a time so
        # only one of them is held in memory.
        for j, n_nodes in enumerate(self._N_NODES):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected[j], self._SEEDS[j], self._SEED_INDEX, feature_cov_matrix=None)
            np.savez(self._graph_file(j, n_infected), x=graph.x, a=graph.a, y=graph.y)
            # Free memory
            del graph
            gc.collect()

    def read(self):
        """Load every cached graph and return them as a list of ``Graph`` objects."""
        # We must return a list of Graph objects
        output = []
        n_infected = self._n_infected()

        for j in range(len(self._N_NODES)):
            # allow_pickle=True is needed because 'a' is saved from a SciPy sparse
            # matrix, which np.savez stores as a pickled object. Unpickling can run
            # arbitrary code, so only load files written by download().
            # The context manager closes the NpzFile instead of leaking the handle.
            with np.load(self._graph_file(j, n_infected), allow_pickle=True) as data1:
                output.append(
                    Graph(x=data1['x'], a=data1['a'][()], y=data1['y'])  # a=data1['a'].item() also works
                )

        return output
    
    


In [48]:
# Instantiate the "Caso 11" dataset (1% infected, separated classes, flattened features).
# NOTE(review): spektral's Dataset constructor presumably calls download() when
# `path` does not exist yet and then read() - confirm in the spektral docs.
dataset11 = synthetic_Dir_diffSize_NoBalanced_1percent_clasesSep_flattened()

# Caso 12: dirigido, diferentes tamaños, NO balanceado (1%), clases mezcladas, features aplanadas

Grafos iguales a los del caso 2, salvo que las features están aplanadas a 1

In [51]:
class synthetic_Dir_diffSize_NoBalanced_1percent_clasesMezcl_flattened(Dataset):
    """Ten directed synthetic graphs, 1% minority class, mixed classes, flattened features.

    Each graph is sampled by ``_generate_synthetic_graph_csbm`` and cached as an
    ``.npz`` file under ``path``. Gaussian node features are still sampled and
    drive the edge probabilities, but the stored feature matrix ``x`` is all ones.
    "Mixed" means both communities share the same feature mean ([1, 1]) and every
    entry of the edge-probability matrix is 0.5.
    """

    # Shared by download() and read() so the file names they build cannot drift apart.
    _N_NODES = [25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800]
    # Requested share of nodes in the minority ("infected") class.
    _INFECTED_FRACTION = 1/100
    # One base seed per graph; the RNG is seeded with seed * (_SEED_INDEX + 1).
    _SEEDS = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]
    _SEED_INDEX = 1

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where this dataset's ``.npz`` files are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _n_infected(self):
        """Return the requested minority-class size per graph (fraction of nodes, at least 1)."""
        frac = self._INFECTED_FRACTION
        return [round(frac*n) if frac*n >= 1 else 1 for n in self._N_NODES]

    def _graph_file(self, j, n_infected):
        """Return the full path of the ``.npz`` file that holds graph number ``j``."""
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected[j]}a{self._N_NODES[j]}_clasesMezcl_2{j}_flattened.npz')

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,
                                       n_infected, semilla, indice, feature_cov_matrix=None):
        """Sample one directed graph with all-ones node features and one-hot labels.

        Args:
            n_nodes: number of nodes.
            n_communities: number of communities (rows of ``node_feature_means``).
            n_features: dimension of the node features.
            edge_prob_matrix: base edge probability per (source community, target community).
            node_feature_means: per-community mean of the latent node features.
            n_infected: number of random draws used to pick community-1 nodes.
            semilla: base random seed.
            indice: seed offset; NumPy's global RNG is seeded with ``semilla*(indice+1)``.
            feature_cov_matrix: feature covariance; identity when ``None``.

        Returns:
            ``Graph`` built from an all-ones feature matrix, the CSR adjacency
            matrix and the one-hot labels.
        """
        # Assign nodes to communities: drawn node ids form community 1, the rest community 0.
        np.random.seed(semilla*(indice+1))
        # NOTE(review): ids are drawn with replacement, so duplicates can leave fewer
        # than n_infected nodes in community 1 even though the file name embeds
        # n_infected. Left unchanged to keep the generated graphs reproducible.
        indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
        communities = np.zeros(n_nodes, dtype=int)
        communities[indices] = 1

        # Generate latent node features (only used to derive edge probabilities)
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            nodes_in_community = np.where(communities == k)[0]
            features[nodes_in_community] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(nodes_in_community))

        # Compute community membership probabilities based on node features
        community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Generate edges one source row at a time. For the pair (i, j) the probability is
        # edge_prob_matrix[c_i, c_j] * probs[i, c_j] * probs[j, c_i], exactly as before.
        # RandomState.binomial fills array outputs element by element, so drawing a
        # whole row (self-loop skipped) is expected to consume the global RNG in the
        # same order as one scalar draw per pair, without millions of Python-level calls.
        adjacency_matrix = np.zeros((n_nodes, n_nodes))
        node_ids = np.arange(n_nodes)
        for i in range(n_nodes):
            community_i = communities[i]
            edge_probs = (edge_prob_matrix[community_i, communities]
                          * community_membership_probs[i, communities]
                          * community_membership_probs[:, community_i])
            targets = node_ids != i
            adjacency_matrix[i, targets] = np.random.binomial(1, edge_probs[targets])

        labels = tf.keras.utils.to_categorical(communities)
        adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
        # Flattened features: every node gets the same all-ones feature vector.
        return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and write it to ``path`` as an ``.npz`` file."""
        # exist_ok lets download() be re-run over an already existing folder.
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        n_infected = self._n_infected()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
            [0.5, 0.5],
            [0.5, 0.5]
        ])
        # Node feature means for each community
        node_feature_means = np.array([
            [1, 1],
            [1, 1]
        ])

        # Classes a bit closer together. Graphs are generated and saved one at a
        # time so only one of them is held in memory.
        for j, n_nodes in enumerate(self._N_NODES):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected[j], self._SEEDS[j], self._SEED_INDEX, feature_cov_matrix=None)
            np.savez(self._graph_file(j, n_infected), x=graph.x, a=graph.a, y=graph.y)
            # Free memory
            del graph
            gc.collect()

    def read(self):
        """Load every cached graph and return them as a list of ``Graph`` objects."""
        # We must return a list of Graph objects
        output = []
        n_infected = self._n_infected()

        for j in range(len(self._N_NODES)):
            # allow_pickle=True is needed because 'a' is saved from a SciPy sparse
            # matrix, which np.savez stores as a pickled object. Unpickling can run
            # arbitrary code, so only load files written by download().
            # The context manager closes the NpzFile instead of leaking the handle.
            with np.load(self._graph_file(j, n_infected), allow_pickle=True) as data2:
                output.append(
                    Graph(x=data2['x'], a=data2['a'][()], y=data2['y'])  # a=data2['a'].item() also works
                )

        return output
    
    


In [52]:
# Instantiate the "Caso 12" dataset (1% infected, mixed classes, flattened features).
# NOTE(review): spektral's Dataset constructor presumably calls download() when
# `path` does not exist yet and then read() - confirm in the spektral docs.
dataset12 = synthetic_Dir_diffSize_NoBalanced_1percent_clasesMezcl_flattened()

# Caso 13: dirigido, diferentes tamaños, NO balanceado (2%), clases separadas, features aplanadas

Grafos iguales a los del caso 3, salvo que las features están aplanadas a 1

In [55]:
class synthetic_Dir_diffSize_NoBalanced_2percent_clasesSep_flattened(Dataset):
    """Ten directed synthetic graphs, 2% minority class, separated classes, flattened features.

    Each graph is sampled by ``_generate_synthetic_graph_csbm`` and cached as an
    ``.npz`` file under ``path``. Gaussian node features are still sampled and
    drive the edge probabilities, but the stored feature matrix ``x`` is all ones.
    """

    # Shared by download() and read() so the file names they build cannot drift apart.
    _N_NODES = [25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800]
    # Requested share of nodes in the minority ("infected") class.
    _INFECTED_FRACTION = 2/100
    # One base seed per graph; the RNG is seeded with seed * (_SEED_INDEX + 1).
    # NOTE(review): the non-flattened 'clasesSep' datasets visible in this notebook
    # (10% and 20%) use 679 at position 5 instead of 678. If the 2% one does too,
    # graph 5 here is not the same graph as its non-flattened counterpart - verify.
    _SEEDS = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]
    _SEED_INDEX = 0

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where this dataset's ``.npz`` files are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _n_infected(self):
        """Return the requested minority-class size per graph (fraction of nodes, at least 1)."""
        frac = self._INFECTED_FRACTION
        return [round(frac*n) if frac*n >= 1 else 1 for n in self._N_NODES]

    def _graph_file(self, j, n_infected):
        """Return the full path of the ``.npz`` file that holds graph number ``j``."""
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected[j]}a{self._N_NODES[j]}_clasesSep_0{j}_flattened.npz')

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,
                                       n_infected, semilla, indice, feature_cov_matrix=None):
        """Sample one directed graph with all-ones node features and one-hot labels.

        Args:
            n_nodes: number of nodes.
            n_communities: number of communities (rows of ``node_feature_means``).
            n_features: dimension of the node features.
            edge_prob_matrix: base edge probability per (source community, target community).
            node_feature_means: per-community mean of the latent node features.
            n_infected: number of random draws used to pick community-1 nodes.
            semilla: base random seed.
            indice: seed offset; NumPy's global RNG is seeded with ``semilla*(indice+1)``.
            feature_cov_matrix: feature covariance; identity when ``None``.

        Returns:
            ``Graph`` built from an all-ones feature matrix, the CSR adjacency
            matrix and the one-hot labels.
        """
        # Assign nodes to communities: drawn node ids form community 1, the rest community 0.
        np.random.seed(semilla*(indice+1))
        # NOTE(review): ids are drawn with replacement, so duplicates can leave fewer
        # than n_infected nodes in community 1 even though the file name embeds
        # n_infected. Left unchanged to keep the generated graphs reproducible.
        indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
        communities = np.zeros(n_nodes, dtype=int)
        communities[indices] = 1

        # Generate latent node features (only used to derive edge probabilities)
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            nodes_in_community = np.where(communities == k)[0]
            features[nodes_in_community] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(nodes_in_community))

        # Compute community membership probabilities based on node features
        community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Generate edges one source row at a time. For the pair (i, j) the probability is
        # edge_prob_matrix[c_i, c_j] * probs[i, c_j] * probs[j, c_i], exactly as before.
        # RandomState.binomial fills array outputs element by element, so drawing a
        # whole row (self-loop skipped) is expected to consume the global RNG in the
        # same order as one scalar draw per pair, without millions of Python-level calls.
        adjacency_matrix = np.zeros((n_nodes, n_nodes))
        node_ids = np.arange(n_nodes)
        for i in range(n_nodes):
            community_i = communities[i]
            edge_probs = (edge_prob_matrix[community_i, communities]
                          * community_membership_probs[i, communities]
                          * community_membership_probs[:, community_i])
            targets = node_ids != i
            adjacency_matrix[i, targets] = np.random.binomial(1, edge_probs[targets])

        labels = tf.keras.utils.to_categorical(communities)
        adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
        # Flattened features: every node gets the same all-ones feature vector.
        return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and write it to ``path`` as an ``.npz`` file."""
        # exist_ok lets download() be re-run over an already existing folder.
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        n_infected = self._n_infected()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
            [0.8, 0.2],
            [0.3, 0.7]
        ])
        # Node feature means for each community
        node_feature_means = np.array([
            [2, 1],
            [1, 2]
        ])

        # Well-separated classes. Graphs are generated and saved one at a time so
        # only one of them is held in memory.
        for j, n_nodes in enumerate(self._N_NODES):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected[j], self._SEEDS[j], self._SEED_INDEX, feature_cov_matrix=None)
            np.savez(self._graph_file(j, n_infected), x=graph.x, a=graph.a, y=graph.y)
            # Free memory
            del graph
            gc.collect()

    def read(self):
        """Load every cached graph and return them as a list of ``Graph`` objects."""
        # We must return a list of Graph objects
        output = []
        n_infected = self._n_infected()

        for j in range(len(self._N_NODES)):
            # allow_pickle=True is needed because 'a' is saved from a SciPy sparse
            # matrix, which np.savez stores as a pickled object. Unpickling can run
            # arbitrary code, so only load files written by download().
            # The context manager closes the NpzFile instead of leaking the handle.
            with np.load(self._graph_file(j, n_infected), allow_pickle=True) as data1:
                output.append(
                    Graph(x=data1['x'], a=data1['a'][()], y=data1['y'])  # a=data1['a'].item() also works
                )

        return output
    
    


In [56]:
# Instantiate the "Caso 13" dataset (2% infected, separated classes, flattened features).
# NOTE(review): spektral's Dataset constructor presumably calls download() when
# `path` does not exist yet and then read() - confirm in the spektral docs.
dataset13 = synthetic_Dir_diffSize_NoBalanced_2percent_clasesSep_flattened()

# Caso 14: dirigido, diferentes tamaños, NO balanceado (2%), clases mezcladas, features aplanadas

Grafos iguales a los del caso 4, salvo que las features están aplanadas a 1

In [59]:
class synthetic_Dir_diffSize_NoBalanced_2percent_clasesMezcl_flattened(Dataset):
    """Ten directed synthetic graphs, 2% minority class, mixed classes, flattened features.

    Each graph is sampled by ``_generate_synthetic_graph_csbm`` and cached as an
    ``.npz`` file under ``path``. Gaussian node features are still sampled and
    drive the edge probabilities, but the stored feature matrix ``x`` is all ones.
    "Mixed" means both communities share the same feature mean ([1, 1]) and every
    entry of the edge-probability matrix is 0.5.
    """

    # Shared by download() and read() so the file names they build cannot drift apart.
    _N_NODES = [25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800]
    # Requested share of nodes in the minority ("infected") class.
    _INFECTED_FRACTION = 2/100
    # One base seed per graph; the RNG is seeded with seed * (_SEED_INDEX + 1).
    _SEEDS = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]
    _SEED_INDEX = 1

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where this dataset's ``.npz`` files are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _n_infected(self):
        """Return the requested minority-class size per graph (fraction of nodes, at least 1)."""
        frac = self._INFECTED_FRACTION
        return [round(frac*n) if frac*n >= 1 else 1 for n in self._N_NODES]

    def _graph_file(self, j, n_infected):
        """Return the full path of the ``.npz`` file that holds graph number ``j``."""
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected[j]}a{self._N_NODES[j]}_clasesMezcl_2{j}_flattened.npz')

    @staticmethod
    def _generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,
                                       n_infected, semilla, indice, feature_cov_matrix=None):
        """Sample one directed graph with all-ones node features and one-hot labels.

        Args:
            n_nodes: number of nodes.
            n_communities: number of communities (rows of ``node_feature_means``).
            n_features: dimension of the node features.
            edge_prob_matrix: base edge probability per (source community, target community).
            node_feature_means: per-community mean of the latent node features.
            n_infected: number of random draws used to pick community-1 nodes.
            semilla: base random seed.
            indice: seed offset; NumPy's global RNG is seeded with ``semilla*(indice+1)``.
            feature_cov_matrix: feature covariance; identity when ``None``.

        Returns:
            ``Graph`` built from an all-ones feature matrix, the CSR adjacency
            matrix and the one-hot labels.
        """
        # Assign nodes to communities: drawn node ids form community 1, the rest community 0.
        np.random.seed(semilla*(indice+1))
        # NOTE(review): ids are drawn with replacement, so duplicates can leave fewer
        # than n_infected nodes in community 1 even though the file name embeds
        # n_infected. Left unchanged to keep the generated graphs reproducible.
        indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
        communities = np.zeros(n_nodes, dtype=int)
        communities[indices] = 1

        # Generate latent node features (only used to derive edge probabilities)
        if feature_cov_matrix is None:
            feature_cov_matrix = np.eye(n_features)
        features = np.zeros((n_nodes, n_features))
        for k in range(n_communities):
            nodes_in_community = np.where(communities == k)[0]
            features[nodes_in_community] = np.random.multivariate_normal(
                node_feature_means[k], feature_cov_matrix, len(nodes_in_community))

        # Compute community membership probabilities based on node features
        community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

        # Generate edges one source row at a time. For the pair (i, j) the probability is
        # edge_prob_matrix[c_i, c_j] * probs[i, c_j] * probs[j, c_i], exactly as before.
        # RandomState.binomial fills array outputs element by element, so drawing a
        # whole row (self-loop skipped) is expected to consume the global RNG in the
        # same order as one scalar draw per pair, without millions of Python-level calls.
        adjacency_matrix = np.zeros((n_nodes, n_nodes))
        node_ids = np.arange(n_nodes)
        for i in range(n_nodes):
            community_i = communities[i]
            edge_probs = (edge_prob_matrix[community_i, communities]
                          * community_membership_probs[i, communities]
                          * community_membership_probs[:, community_i])
            targets = node_ids != i
            adjacency_matrix[i, targets] = np.random.binomial(1, edge_probs[targets])

        labels = tf.keras.utils.to_categorical(communities)
        adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
        # Flattened features: every node gets the same all-ones feature vector.
        return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

    def download(self):
        """Generate every graph and write it to ``path`` as an ``.npz`` file."""
        # exist_ok lets download() be re-run over an already existing folder.
        os.makedirs(self.path, exist_ok=True)

        n_features = 2
        n_classes = 2
        n_infected = self._n_infected()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
            [0.5, 0.5],
            [0.5, 0.5]
        ])
        # Node feature means for each community
        node_feature_means = np.array([
            [1, 1],
            [1, 1]
        ])

        # Classes a bit closer together. Graphs are generated and saved one at a
        # time so only one of them is held in memory.
        for j, n_nodes in enumerate(self._N_NODES):
            graph = self._generate_synthetic_graph_csbm(
                n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means,
                n_infected[j], self._SEEDS[j], self._SEED_INDEX, feature_cov_matrix=None)
            np.savez(self._graph_file(j, n_infected), x=graph.x, a=graph.a, y=graph.y)
            # Free memory
            del graph
            gc.collect()

    def read(self):
        """Load every cached graph and return them as a list of ``Graph`` objects."""
        # We must return a list of Graph objects
        output = []
        n_infected = self._n_infected()

        for j in range(len(self._N_NODES)):
            # allow_pickle=True is needed because 'a' is saved from a SciPy sparse
            # matrix, which np.savez stores as a pickled object. Unpickling can run
            # arbitrary code, so only load files written by download().
            # The context manager closes the NpzFile instead of leaking the handle.
            with np.load(self._graph_file(j, n_infected), allow_pickle=True) as data2:
                output.append(
                    Graph(x=data2['x'], a=data2['a'][()], y=data2['y'])  # a=data2['a'].item() also works
                )

        return output
    
    


In [60]:
# Instantiate the "Caso 14" dataset (2% infected, mixed classes, flattened features).
# NOTE(review): spektral's Dataset constructor presumably calls download() when
# `path` does not exist yet and then read() - confirm in the spektral docs.
dataset14 = synthetic_Dir_diffSize_NoBalanced_2percent_clasesMezcl_flattened()

# Caso 15: dirigido, diferentes tamaños, NO balanceado (5%), clases separadas, features aplanadas

Grafos iguales a los del caso 5, salvo que las features están aplanadas a 1

In [63]:
class synthetic_Dir_diffSize_NoBalanced_5percent_clasesSep_flattened(Dataset):
    """Directed synthetic graphs of increasing size, about 5% infected nodes, separated classes.

    The stored node features are "flattened": every node gets an all-ones
    feature vector, while the sampled Gaussian features are used only to draw
    the edges. ``download`` generates the graphs and writes one ``.npz`` file
    per graph under ``path``; ``read`` loads those files back.
    """

    # Number of nodes of each graph; one graph is generated per entry.
    NODE_COUNTS = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes requested as infected (class 1).
    INFECTED_FRACTION = 5/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where the generated graphs of this dataset are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the requested number of infected nodes per graph (never fewer than 1)."""
        return [max(1, round(self.INFECTED_FRACTION * n)) for n in self.NODE_COUNTS]

    def _graph_file(self, index):
        """Return the path of the ``.npz`` file that stores graph number ``index``."""
        n_nodes = self.NODE_COUNTS[index]
        n_infected = self._infected_counts()[index]
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected}a{n_nodes}_clasesSep_0{index}_flattened.npz')

    def download(self):
        """Generate every graph and save it as an ``.npz`` file under ``self.path``."""
        # exist_ok avoids a FileExistsError when the folder is already there
        # (e.g. download() called again by hand).
        os.makedirs(self.path, exist_ok=True)

        def generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,\
                                  n_infected, semilla, indice, feature_cov_matrix=None):
            """Sample one directed graph from a contextual-SBM-like model.

            Args:
                n_nodes: number of nodes.
                n_communities: number of classes.
                n_features: dimension of the node features.
                edge_prob_matrix: base edge probability, indexed by
                    [class of source node, class of target node].
                node_feature_means: one mean vector per class.
                n_infected: number of random draws used to pick class-1 nodes.
                semilla, indice: the RNG is seeded with ``semilla*(indice+1)``.
                feature_cov_matrix: feature covariance; identity when None.

            Returns:
                Graph with all-ones features ``x``, sparse adjacency ``a`` and
                one-hot labels ``y``.
            """
            # Assign nodes to communities
            np.random.seed(semilla*(indice+1))
            # NOTE(review): indices are drawn with replacement, so duplicates can leave
            # fewer than n_infected class-1 nodes. Kept as is to preserve the random
            # stream (the notebook describes these graphs as equal to the non-flattened case).
            indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
            infected = set(indices)
            communities = np.array([int(node in infected) for node in range(n_nodes)])

            # Generate node features
            if feature_cov_matrix is None:
                feature_cov_matrix = np.eye(n_features)
            features = np.zeros((n_nodes, n_features))
            for k in range(n_communities):
                nodes_in_community = np.where(communities == k)[0]
                features[nodes_in_community] = np.random.multivariate_normal(node_feature_means[k], feature_cov_matrix,\
                                                                             len(nodes_in_community))

            # Compute community membership probabilities based on node features
            community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

            # Generate edges based on community membership probabilities.
            # NOTE(review): O(n_nodes**2) scalar draws; the draw order is kept so that
            # results stay identical for a given seed.
            adjacency_matrix = np.zeros((n_nodes, n_nodes))
            for i in range(n_nodes):
                community_i = communities[i]
                for j in range(n_nodes):
                    if i == j:
                        continue  # no self-loops
                    community_j = communities[j]
                    edge_prob = edge_prob_matrix[community_i, community_j] * community_membership_probs[i, community_j] * community_membership_probs[j, community_i]
                    adjacency_matrix[i, j] = np.random.binomial(1, edge_prob)

            labels = tf.keras.utils.to_categorical(communities)
            adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
            # Flattened features: the sampled features are discarded, x is all ones.
            return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

        n_features = 2
        n_classes = 2
        n_infected = self._infected_counts()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
                                        [0.8, 0.2],
                                        [0.3, 0.7]
                                    ])
        # Node feature means for each community
        node_feature_means = np.array([
                                        [2, 1],
                                        [1, 2]
                                    ])

        # One base seed per graph.
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Well-separated classes. Each graph is saved right after it is generated so
        # that only one graph is held in memory at a time.
        for j, n_nodes in enumerate(self.NODE_COUNTS):
            graph = generate_synthetic_graph_csbm(n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means, n_infected[j], semillas[j], 0, feature_cov_matrix=None)
            # The sparse adjacency is not an ndarray, so np.savez stores it as a pickled object.
            np.savez(self._graph_file(j), x=graph.x, a=graph.a, y=graph.y)

    def read(self):
        """Load every saved graph and return them as a list of ``Graph`` objects."""
        output = []

        for j in range(len(self.NODE_COUNTS)):
            # NOTE(review): allow_pickle=True unpickles file content; only load files
            # written by download().
            # The context manager closes the NpzFile once the arrays are read.
            with np.load(self._graph_file(j), allow_pickle=True) as data1:
                # [()] unwraps the 0-d object array holding the sparse adjacency
                # (data1['a'].item() is equivalent).
                output.append(
                    Graph(x=data1['x'], a=data1['a'][()], y=data1['y'])
                )

        return output
    
    


In [64]:
# Case 15 dataset: 5% infected, separated classes, flattened features.
dataset15 = synthetic_Dir_diffSize_NoBalanced_5percent_clasesSep_flattened()

# Caso 16: dirigido, diferentes tamaños, NO balanceado (5%), clases mezcladas, features aplanadas

Grafos iguales a los del caso 6, salvo que las features están aplanadas a 1

In [67]:
class synthetic_Dir_diffSize_NoBalanced_5percent_clasesMezcl_flattened(Dataset):
    """Directed synthetic graphs of increasing size, about 5% infected nodes, mixed classes.

    Both classes share the same edge probabilities and feature means. The
    stored node features are "flattened": every node gets an all-ones feature
    vector, while the sampled Gaussian features are used only to draw the
    edges. ``download`` generates the graphs and writes one ``.npz`` file per
    graph under ``path``; ``read`` loads those files back.
    """

    # Number of nodes of each graph; one graph is generated per entry.
    NODE_COUNTS = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes requested as infected (class 1).
    INFECTED_FRACTION = 5/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where the generated graphs of this dataset are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the requested number of infected nodes per graph (never fewer than 1)."""
        return [max(1, round(self.INFECTED_FRACTION * n)) for n in self.NODE_COUNTS]

    def _graph_file(self, index):
        """Return the path of the ``.npz`` file that stores graph number ``index``."""
        n_nodes = self.NODE_COUNTS[index]
        n_infected = self._infected_counts()[index]
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected}a{n_nodes}_clasesMezcl_2{index}_flattened.npz')

    def download(self):
        """Generate every graph and save it as an ``.npz`` file under ``self.path``."""
        # exist_ok avoids a FileExistsError when the folder is already there
        # (e.g. download() called again by hand).
        os.makedirs(self.path, exist_ok=True)

        def generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,\
                                  n_infected, semilla, indice, feature_cov_matrix=None):
            """Sample one directed graph from a contextual-SBM-like model.

            Args:
                n_nodes: number of nodes.
                n_communities: number of classes.
                n_features: dimension of the node features.
                edge_prob_matrix: base edge probability, indexed by
                    [class of source node, class of target node].
                node_feature_means: one mean vector per class.
                n_infected: number of random draws used to pick class-1 nodes.
                semilla, indice: the RNG is seeded with ``semilla*(indice+1)``.
                feature_cov_matrix: feature covariance; identity when None.

            Returns:
                Graph with all-ones features ``x``, sparse adjacency ``a`` and
                one-hot labels ``y``.
            """
            # Assign nodes to communities
            np.random.seed(semilla*(indice+1))
            # NOTE(review): indices are drawn with replacement, so duplicates can leave
            # fewer than n_infected class-1 nodes. Kept as is to preserve the random
            # stream (the notebook describes these graphs as equal to the non-flattened case).
            indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
            infected = set(indices)
            communities = np.array([int(node in infected) for node in range(n_nodes)])

            # Generate node features
            if feature_cov_matrix is None:
                feature_cov_matrix = np.eye(n_features)
            features = np.zeros((n_nodes, n_features))
            for k in range(n_communities):
                nodes_in_community = np.where(communities == k)[0]
                features[nodes_in_community] = np.random.multivariate_normal(node_feature_means[k], feature_cov_matrix,\
                                                                             len(nodes_in_community))

            # Compute community membership probabilities based on node features
            community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

            # Generate edges based on community membership probabilities.
            # NOTE(review): O(n_nodes**2) scalar draws; the draw order is kept so that
            # results stay identical for a given seed.
            adjacency_matrix = np.zeros((n_nodes, n_nodes))
            for i in range(n_nodes):
                community_i = communities[i]
                for j in range(n_nodes):
                    if i == j:
                        continue  # no self-loops
                    community_j = communities[j]
                    edge_prob = edge_prob_matrix[community_i, community_j] * community_membership_probs[i, community_j] * community_membership_probs[j, community_i]
                    adjacency_matrix[i, j] = np.random.binomial(1, edge_prob)

            labels = tf.keras.utils.to_categorical(communities)
            adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
            # Flattened features: the sampled features are discarded, x is all ones.
            return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

        n_features = 2
        n_classes = 2
        n_infected = self._infected_counts()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
                                    [0.5, 0.5],
                                    [0.5, 0.5]
                            ])
        # Node feature means for each community
        node_feature_means = np.array([
                                        [1, 1],
                                        [1, 1]
                                    ])

        # One base seed per graph.
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Classes closer together. Each graph is saved right after it is generated so
        # that only one graph is held in memory at a time.
        for j, n_nodes in enumerate(self.NODE_COUNTS):
            graph = generate_synthetic_graph_csbm(n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means, n_infected[j], semillas[j], 1, feature_cov_matrix=None)
            # The sparse adjacency is not an ndarray, so np.savez stores it as a pickled object.
            np.savez(self._graph_file(j), x=graph.x, a=graph.a, y=graph.y)

    def read(self):
        """Load every saved graph and return them as a list of ``Graph`` objects."""
        output = []

        for j in range(len(self.NODE_COUNTS)):
            # NOTE(review): allow_pickle=True unpickles file content; only load files
            # written by download().
            # The context manager closes the NpzFile once the arrays are read.
            with np.load(self._graph_file(j), allow_pickle=True) as data2:
                # [()] unwraps the 0-d object array holding the sparse adjacency
                # (data2['a'].item() is equivalent).
                output.append(
                    Graph(x=data2['x'], a=data2['a'][()], y=data2['y'])
                )

        return output
    
    


In [68]:
# Case 16 dataset: 5% infected, mixed classes, flattened features.
dataset16 = synthetic_Dir_diffSize_NoBalanced_5percent_clasesMezcl_flattened()

# Caso 17: dirigido, diferentes tamaños, NO balanceado (10%), clases separadas, features aplanadas

Grafos iguales a los del caso 7, salvo que las features están aplanadas a 1

In [71]:
class synthetic_Dir_diffSize_NoBalanced_10percent_clasesSep_flattened(Dataset):
    """Directed synthetic graphs of increasing size, about 10% infected nodes, separated classes.

    The stored node features are "flattened": every node gets an all-ones
    feature vector, while the sampled Gaussian features are used only to draw
    the edges. ``download`` generates the graphs and writes one ``.npz`` file
    per graph under ``path``; ``read`` loads those files back.
    """

    # Number of nodes of each graph; one graph is generated per entry.
    NODE_COUNTS = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes requested as infected (class 1).
    INFECTED_FRACTION = 10/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where the generated graphs of this dataset are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the requested number of infected nodes per graph (never fewer than 1)."""
        return [max(1, round(self.INFECTED_FRACTION * n)) for n in self.NODE_COUNTS]

    def _graph_file(self, index):
        """Return the path of the ``.npz`` file that stores graph number ``index``."""
        n_nodes = self.NODE_COUNTS[index]
        n_infected = self._infected_counts()[index]
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected}a{n_nodes}_clasesSep_0{index}_flattened.npz')

    def download(self):
        """Generate every graph and save it as an ``.npz`` file under ``self.path``."""
        # exist_ok avoids a FileExistsError when the folder is already there
        # (e.g. download() called again by hand).
        os.makedirs(self.path, exist_ok=True)

        def generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,\
                                  n_infected, semilla, indice, feature_cov_matrix=None):
            """Sample one directed graph from a contextual-SBM-like model.

            Args:
                n_nodes: number of nodes.
                n_communities: number of classes.
                n_features: dimension of the node features.
                edge_prob_matrix: base edge probability, indexed by
                    [class of source node, class of target node].
                node_feature_means: one mean vector per class.
                n_infected: number of random draws used to pick class-1 nodes.
                semilla, indice: the RNG is seeded with ``semilla*(indice+1)``.
                feature_cov_matrix: feature covariance; identity when None.

            Returns:
                Graph with all-ones features ``x``, sparse adjacency ``a`` and
                one-hot labels ``y``.
            """
            # Assign nodes to communities
            np.random.seed(semilla*(indice+1))
            # NOTE(review): indices are drawn with replacement, so duplicates can leave
            # fewer than n_infected class-1 nodes. Kept as is to preserve the random
            # stream (the notebook describes these graphs as equal to the non-flattened case).
            indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
            infected = set(indices)
            communities = np.array([int(node in infected) for node in range(n_nodes)])

            # Generate node features
            if feature_cov_matrix is None:
                feature_cov_matrix = np.eye(n_features)
            features = np.zeros((n_nodes, n_features))
            for k in range(n_communities):
                nodes_in_community = np.where(communities == k)[0]
                features[nodes_in_community] = np.random.multivariate_normal(node_feature_means[k], feature_cov_matrix,\
                                                                             len(nodes_in_community))

            # Compute community membership probabilities based on node features
            community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

            # Generate edges based on community membership probabilities.
            # NOTE(review): O(n_nodes**2) scalar draws; the draw order is kept so that
            # results stay identical for a given seed.
            adjacency_matrix = np.zeros((n_nodes, n_nodes))
            for i in range(n_nodes):
                community_i = communities[i]
                for j in range(n_nodes):
                    if i == j:
                        continue  # no self-loops
                    community_j = communities[j]
                    edge_prob = edge_prob_matrix[community_i, community_j] * community_membership_probs[i, community_j] * community_membership_probs[j, community_i]
                    adjacency_matrix[i, j] = np.random.binomial(1, edge_prob)

            labels = tf.keras.utils.to_categorical(communities)
            adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
            # Flattened features: the sampled features are discarded, x is all ones.
            return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

        n_features = 2
        n_classes = 2
        n_infected = self._infected_counts()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
                                        [0.8, 0.2],
                                        [0.3, 0.7]
                                    ])
        # Node feature means for each community
        node_feature_means = np.array([
                                        [2, 1],
                                        [1, 2]
                                    ])

        # One base seed per graph.
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Well-separated classes. Each graph is saved right after it is generated so
        # that only one graph is held in memory at a time.
        for j, n_nodes in enumerate(self.NODE_COUNTS):
            graph = generate_synthetic_graph_csbm(n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means, n_infected[j], semillas[j], 0, feature_cov_matrix=None)
            # The sparse adjacency is not an ndarray, so np.savez stores it as a pickled object.
            np.savez(self._graph_file(j), x=graph.x, a=graph.a, y=graph.y)

    def read(self):
        """Load every saved graph and return them as a list of ``Graph`` objects."""
        output = []

        for j in range(len(self.NODE_COUNTS)):
            # NOTE(review): allow_pickle=True unpickles file content; only load files
            # written by download().
            # The context manager closes the NpzFile once the arrays are read.
            with np.load(self._graph_file(j), allow_pickle=True) as data1:
                # [()] unwraps the 0-d object array holding the sparse adjacency
                # (data1['a'].item() is equivalent).
                output.append(
                    Graph(x=data1['x'], a=data1['a'][()], y=data1['y'])
                )

        return output
    
    


In [72]:
# Case 17 dataset: 10% infected, separated classes, flattened features.
dataset17 = synthetic_Dir_diffSize_NoBalanced_10percent_clasesSep_flattened()

# Caso 18: dirigido, diferentes tamaños, NO balanceado (10%), clases mezcladas, features aplanadas

Grafos iguales a los del caso 8, salvo que las features están aplanadas a 1

In [75]:
class synthetic_Dir_diffSize_NoBalanced_10percent_clasesMezcl_flattened(Dataset):
    """Directed synthetic graphs of increasing size, about 10% infected nodes, mixed classes.

    Both classes share the same edge probabilities and feature means. The
    stored node features are "flattened": every node gets an all-ones feature
    vector, while the sampled Gaussian features are used only to draw the
    edges. ``download`` generates the graphs and writes one ``.npz`` file per
    graph under ``path``; ``read`` loads those files back.
    """

    # Number of nodes of each graph; one graph is generated per entry.
    NODE_COUNTS = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes requested as infected (class 1).
    INFECTED_FRACTION = 10/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where the generated graphs of this dataset are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the requested number of infected nodes per graph (never fewer than 1)."""
        return [max(1, round(self.INFECTED_FRACTION * n)) for n in self.NODE_COUNTS]

    def _graph_file(self, index):
        """Return the path of the ``.npz`` file that stores graph number ``index``."""
        n_nodes = self.NODE_COUNTS[index]
        n_infected = self._infected_counts()[index]
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected}a{n_nodes}_clasesMezcl_2{index}_flattened.npz')

    def download(self):
        """Generate every graph and save it as an ``.npz`` file under ``self.path``."""
        # exist_ok avoids a FileExistsError when the folder is already there
        # (e.g. download() called again by hand).
        os.makedirs(self.path, exist_ok=True)

        def generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,\
                                  n_infected, semilla, indice, feature_cov_matrix=None):
            """Sample one directed graph from a contextual-SBM-like model.

            Args:
                n_nodes: number of nodes.
                n_communities: number of classes.
                n_features: dimension of the node features.
                edge_prob_matrix: base edge probability, indexed by
                    [class of source node, class of target node].
                node_feature_means: one mean vector per class.
                n_infected: number of random draws used to pick class-1 nodes.
                semilla, indice: the RNG is seeded with ``semilla*(indice+1)``.
                feature_cov_matrix: feature covariance; identity when None.

            Returns:
                Graph with all-ones features ``x``, sparse adjacency ``a`` and
                one-hot labels ``y``.
            """
            # Assign nodes to communities
            np.random.seed(semilla*(indice+1))
            # NOTE(review): indices are drawn with replacement, so duplicates can leave
            # fewer than n_infected class-1 nodes. Kept as is to preserve the random
            # stream (the notebook describes these graphs as equal to the non-flattened case).
            indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
            infected = set(indices)
            communities = np.array([int(node in infected) for node in range(n_nodes)])

            # Generate node features
            if feature_cov_matrix is None:
                feature_cov_matrix = np.eye(n_features)
            features = np.zeros((n_nodes, n_features))
            for k in range(n_communities):
                nodes_in_community = np.where(communities == k)[0]
                features[nodes_in_community] = np.random.multivariate_normal(node_feature_means[k], feature_cov_matrix,\
                                                                             len(nodes_in_community))

            # Compute community membership probabilities based on node features
            community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

            # Generate edges based on community membership probabilities.
            # NOTE(review): O(n_nodes**2) scalar draws; the draw order is kept so that
            # results stay identical for a given seed.
            adjacency_matrix = np.zeros((n_nodes, n_nodes))
            for i in range(n_nodes):
                community_i = communities[i]
                for j in range(n_nodes):
                    if i == j:
                        continue  # no self-loops
                    community_j = communities[j]
                    edge_prob = edge_prob_matrix[community_i, community_j] * community_membership_probs[i, community_j] * community_membership_probs[j, community_i]
                    adjacency_matrix[i, j] = np.random.binomial(1, edge_prob)

            labels = tf.keras.utils.to_categorical(communities)
            adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
            # Flattened features: the sampled features are discarded, x is all ones.
            return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

        n_features = 2
        n_classes = 2
        n_infected = self._infected_counts()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
                                    [0.5, 0.5],
                                    [0.5, 0.5]
                            ])
        # Node feature means for each community
        node_feature_means = np.array([
                                        [1, 1],
                                        [1, 1]
                                    ])

        # One base seed per graph.
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Classes closer together. Each graph is saved right after it is generated so
        # that only one graph is held in memory at a time.
        for j, n_nodes in enumerate(self.NODE_COUNTS):
            graph = generate_synthetic_graph_csbm(n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means, n_infected[j], semillas[j], 1, feature_cov_matrix=None)
            # The sparse adjacency is not an ndarray, so np.savez stores it as a pickled object.
            np.savez(self._graph_file(j), x=graph.x, a=graph.a, y=graph.y)

    def read(self):
        """Load every saved graph and return them as a list of ``Graph`` objects."""
        output = []

        for j in range(len(self.NODE_COUNTS)):
            # NOTE(review): allow_pickle=True unpickles file content; only load files
            # written by download().
            # The context manager closes the NpzFile once the arrays are read.
            with np.load(self._graph_file(j), allow_pickle=True) as data2:
                # [()] unwraps the 0-d object array holding the sparse adjacency
                # (data2['a'].item() is equivalent).
                output.append(
                    Graph(x=data2['x'], a=data2['a'][()], y=data2['y'])
                )

        return output
    
    


In [76]:
# Case 18 dataset: 10% infected, mixed classes, flattened features.
dataset18 = synthetic_Dir_diffSize_NoBalanced_10percent_clasesMezcl_flattened()

# Caso 19: dirigido, diferentes tamaños, NO balanceado (20%), clases separadas, features aplanadas

Grafos iguales a los del caso 9, salvo que las features están aplanadas a 1

In [79]:
class synthetic_Dir_diffSize_NoBalanced_20percent_clasesSep_flattened(Dataset):
    """Directed synthetic graphs of increasing size, about 20% infected nodes, separated classes.

    The stored node features are "flattened": every node gets an all-ones
    feature vector, while the sampled Gaussian features are used only to draw
    the edges. ``download`` generates the graphs and writes one ``.npz`` file
    per graph under ``path``; ``read`` loads those files back.
    """

    # Number of nodes of each graph; one graph is generated per entry.
    NODE_COUNTS = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes requested as infected (class 1).
    INFECTED_FRACTION = 20/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where the generated graphs of this dataset are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the requested number of infected nodes per graph (never fewer than 1)."""
        return [max(1, round(self.INFECTED_FRACTION * n)) for n in self.NODE_COUNTS]

    def _graph_file(self, index):
        """Return the path of the ``.npz`` file that stores graph number ``index``."""
        n_nodes = self.NODE_COUNTS[index]
        n_infected = self._infected_counts()[index]
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected}a{n_nodes}_clasesSep_0{index}_flattened.npz')

    def download(self):
        """Generate every graph and save it as an ``.npz`` file under ``self.path``."""
        # exist_ok avoids a FileExistsError when the folder is already there
        # (e.g. download() called again by hand).
        os.makedirs(self.path, exist_ok=True)

        def generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,\
                                  n_infected, semilla, indice, feature_cov_matrix=None):
            """Sample one directed graph from a contextual-SBM-like model.

            Args:
                n_nodes: number of nodes.
                n_communities: number of classes.
                n_features: dimension of the node features.
                edge_prob_matrix: base edge probability, indexed by
                    [class of source node, class of target node].
                node_feature_means: one mean vector per class.
                n_infected: number of random draws used to pick class-1 nodes.
                semilla, indice: the RNG is seeded with ``semilla*(indice+1)``.
                feature_cov_matrix: feature covariance; identity when None.

            Returns:
                Graph with all-ones features ``x``, sparse adjacency ``a`` and
                one-hot labels ``y``.
            """
            # Assign nodes to communities
            np.random.seed(semilla*(indice+1))
            # NOTE(review): indices are drawn with replacement, so duplicates can leave
            # fewer than n_infected class-1 nodes. Kept as is to preserve the random
            # stream (the notebook describes these graphs as equal to the non-flattened case).
            indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
            infected = set(indices)
            communities = np.array([int(node in infected) for node in range(n_nodes)])

            # Generate node features
            if feature_cov_matrix is None:
                feature_cov_matrix = np.eye(n_features)
            features = np.zeros((n_nodes, n_features))
            for k in range(n_communities):
                nodes_in_community = np.where(communities == k)[0]
                features[nodes_in_community] = np.random.multivariate_normal(node_feature_means[k], feature_cov_matrix,\
                                                                             len(nodes_in_community))

            # Compute community membership probabilities based on node features
            community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

            # Generate edges based on community membership probabilities.
            # NOTE(review): O(n_nodes**2) scalar draws; the draw order is kept so that
            # results stay identical for a given seed.
            adjacency_matrix = np.zeros((n_nodes, n_nodes))
            for i in range(n_nodes):
                community_i = communities[i]
                for j in range(n_nodes):
                    if i == j:
                        continue  # no self-loops
                    community_j = communities[j]
                    edge_prob = edge_prob_matrix[community_i, community_j] * community_membership_probs[i, community_j] * community_membership_probs[j, community_i]
                    adjacency_matrix[i, j] = np.random.binomial(1, edge_prob)

            labels = tf.keras.utils.to_categorical(communities)
            adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
            # Flattened features: the sampled features are discarded, x is all ones.
            return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

        n_features = 2
        n_classes = 2
        n_infected = self._infected_counts()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
                                        [0.8, 0.2],
                                        [0.3, 0.7]
                                    ])
        # Node feature means for each community
        node_feature_means = np.array([
                                        [2, 1],
                                        [1, 2]
                                    ])

        # One base seed per graph.
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Well-separated classes. Each graph is saved right after it is generated so
        # that only one graph is held in memory at a time.
        for j, n_nodes in enumerate(self.NODE_COUNTS):
            graph = generate_synthetic_graph_csbm(n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means, n_infected[j], semillas[j], 0, feature_cov_matrix=None)
            # The sparse adjacency is not an ndarray, so np.savez stores it as a pickled object.
            np.savez(self._graph_file(j), x=graph.x, a=graph.a, y=graph.y)

    def read(self):
        """Load every saved graph and return them as a list of ``Graph`` objects."""
        output = []

        for j in range(len(self.NODE_COUNTS)):
            # NOTE(review): allow_pickle=True unpickles file content; only load files
            # written by download().
            # The context manager closes the NpzFile once the arrays are read.
            with np.load(self._graph_file(j), allow_pickle=True) as data1:
                # [()] unwraps the 0-d object array holding the sparse adjacency
                # (data1['a'].item() is equivalent).
                output.append(
                    Graph(x=data1['x'], a=data1['a'][()], y=data1['y'])
                )

        return output
    
    


In [80]:
# Case 19 dataset: 20% infected, separated classes, flattened features.
dataset19 = synthetic_Dir_diffSize_NoBalanced_20percent_clasesSep_flattened()

# Caso 20: dirigido, diferentes tamaños, NO balanceado (20%), clases mezcladas, features aplanadas

Grafos iguales a los del caso 10, salvo que las features están aplanadas a 1

In [83]:
class synthetic_Dir_diffSize_NoBalanced_20percent_clasesMezcl_flattened(Dataset):
    """Directed synthetic graphs of increasing size, about 20% infected nodes, mixed classes.

    Both classes share the same edge probabilities and feature means. The
    stored node features are "flattened": every node gets an all-ones feature
    vector, while the sampled Gaussian features are used only to draw the
    edges. ``download`` generates the graphs and writes one ``.npz`` file per
    graph under ``path``; ``read`` loads those files back.
    """

    # Number of nodes of each graph; one graph is generated per entry.
    NODE_COUNTS = (25, 50, 100, 200, 400, 800, 1600, 3200, 6400, 12800)
    # Fraction of the nodes requested as infected (class 1).
    INFECTED_FRACTION = 20/100

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def path(self):
        """Folder where the generated graphs of this dataset are stored."""
        return os.path.join(DATASET_FOLDER, "syntheticGraphs", self.__class__.__name__)

    def _infected_counts(self):
        """Return the requested number of infected nodes per graph (never fewer than 1)."""
        return [max(1, round(self.INFECTED_FRACTION * n)) for n in self.NODE_COUNTS]

    def _graph_file(self, index):
        """Return the path of the ``.npz`` file that stores graph number ``index``."""
        n_nodes = self.NODE_COUNTS[index]
        n_infected = self._infected_counts()[index]
        return os.path.join(self.path, f'graph_Dir_NoBalanced_{n_infected}a{n_nodes}_clasesMezcl_2{index}_flattened.npz')

    def download(self):
        """Generate every graph and save it as an ``.npz`` file under ``self.path``."""
        # exist_ok avoids a FileExistsError when the folder is already there
        # (e.g. download() called again by hand).
        os.makedirs(self.path, exist_ok=True)

        def generate_synthetic_graph_csbm(n_nodes, n_communities, n_features, edge_prob_matrix, node_feature_means,\
                                  n_infected, semilla, indice, feature_cov_matrix=None):
            """Sample one directed graph from a contextual-SBM-like model.

            Args:
                n_nodes: number of nodes.
                n_communities: number of classes.
                n_features: dimension of the node features.
                edge_prob_matrix: base edge probability, indexed by
                    [class of source node, class of target node].
                node_feature_means: one mean vector per class.
                n_infected: number of random draws used to pick class-1 nodes.
                semilla, indice: the RNG is seeded with ``semilla*(indice+1)``.
                feature_cov_matrix: feature covariance; identity when None.

            Returns:
                Graph with all-ones features ``x``, sparse adjacency ``a`` and
                one-hot labels ``y``.
            """
            # Assign nodes to communities
            np.random.seed(semilla*(indice+1))
            # NOTE(review): indices are drawn with replacement, so duplicates can leave
            # fewer than n_infected class-1 nodes. Kept as is to preserve the random
            # stream (the notebook describes these graphs as equal to the non-flattened case).
            indices = [np.random.randint(0,n_nodes) for _ in range(n_infected)]
            infected = set(indices)
            communities = np.array([int(node in infected) for node in range(n_nodes)])

            # Generate node features
            if feature_cov_matrix is None:
                feature_cov_matrix = np.eye(n_features)
            features = np.zeros((n_nodes, n_features))
            for k in range(n_communities):
                nodes_in_community = np.where(communities == k)[0]
                features[nodes_in_community] = np.random.multivariate_normal(node_feature_means[k], feature_cov_matrix,\
                                                                             len(nodes_in_community))

            # Compute community membership probabilities based on node features
            community_membership_probs = softmax(features @ node_feature_means.T, axis=1)

            # Generate edges based on community membership probabilities.
            # NOTE(review): O(n_nodes**2) scalar draws; the draw order is kept so that
            # results stay identical for a given seed.
            adjacency_matrix = np.zeros((n_nodes, n_nodes))
            for i in range(n_nodes):
                community_i = communities[i]
                for j in range(n_nodes):
                    if i == j:
                        continue  # no self-loops
                    community_j = communities[j]
                    edge_prob = edge_prob_matrix[community_i, community_j] * community_membership_probs[i, community_j] * community_membership_probs[j, community_i]
                    adjacency_matrix[i, j] = np.random.binomial(1, edge_prob)

            labels = tf.keras.utils.to_categorical(communities)
            adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
            # Flattened features: the sampled features are discarded, x is all ones.
            return Graph(x=np.ones((n_nodes, n_features)), a=adjacency_matrix, y=labels)

        n_features = 2
        n_classes = 2
        n_infected = self._infected_counts()
        # Probability matrix for edges between communities
        edge_prob_matrix = np.array([
                                    [0.5, 0.5],
                                    [0.5, 0.5]
                            ])
        # Node feature means for each community
        node_feature_means = np.array([
                                        [1, 1],
                                        [1, 1]
                                    ])

        # One base seed per graph.
        semillas = [123, 234, 345, 456, 567, 678, 789, 321, 654, 987]

        # Classes closer together. Each graph is saved right after it is generated so
        # that only one graph is held in memory at a time.
        for j, n_nodes in enumerate(self.NODE_COUNTS):
            graph = generate_synthetic_graph_csbm(n_nodes, n_classes, n_features, edge_prob_matrix, node_feature_means, n_infected[j], semillas[j], 1, feature_cov_matrix=None)
            # The sparse adjacency is not an ndarray, so np.savez stores it as a pickled object.
            np.savez(self._graph_file(j), x=graph.x, a=graph.a, y=graph.y)

    def read(self):
        """Load every saved graph and return them as a list of ``Graph`` objects."""
        output = []

        for j in range(len(self.NODE_COUNTS)):
            # NOTE(review): allow_pickle=True unpickles file content; only load files
            # written by download().
            # The context manager closes the NpzFile once the arrays are read.
            with np.load(self._graph_file(j), allow_pickle=True) as data2:
                # [()] unwraps the 0-d object array holding the sparse adjacency
                # (data2['a'].item() is equivalent).
                output.append(
                    Graph(x=data2['x'], a=data2['a'][()], y=data2['y'])
                )

        return output
    
    


In [84]:
# Case 20 dataset: 20% infected, mixed classes, flattened features.
dataset20 = synthetic_Dir_diffSize_NoBalanced_20percent_clasesMezcl_flattened()