In [None]:
!pip install spektral

In [21]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout,Input
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from spektral.layers import GINConv,GCNConv #, GCSConv, GlobalAvgPool
from spektral.utils.sparse import sp_matrix_to_sp_tensor
from spektral.data import DisjointLoader, BatchLoader, Dataset, Graph
#from spektral.transforms.normalize_adj import NormalizeAdj
import gc
import spektral.datasets

### Ignorar las siguientes 2 celdas

Primeras pruebas, para ver si los métodos dentro de la clase tenían chance de funcionar ;)

In [None]:
# Convert every capture (CSV node features + ncol edge list) into one .npz file holding x, a and y.
captures = ["10","11","12","15","15-2","16","16-2","16-3","17","18","18-2","19","15-3"]

for i in captures:
    x_tmp = pd.read_csv(f'/mnt/features-prefix/capture201108{i}_features_prefix.csv', sep=",", header=0)
    # Integer index for every distinct label / node name found in this capture (sorted order).
    class_idx = {name: idx for idx, name in enumerate(sorted(x_tmp["label"].unique()))}
    node_idx = {name: idx for idx, name in enumerate(sorted(x_tmp["node"].unique()))}
    # Replace node names and labels by their integer index, both in the features and in the graph.
    x_tmp["node"] = x_tmp["node"].apply(lambda name: node_idx[name])
    x_tmp["label"] = x_tmp["label"].apply(lambda value: class_idx[value])
    # Sort once by node index so that row k of x and entry k of y describe node k.
    x_sorted = x_tmp.sort_values("node")
    x = tf.cast(x_sorted[x_tmp.columns.difference(["node","label"], sort=False)].to_numpy(), dtype=tf.dtypes.float32)

    a_tmp = pd.read_csv(f'/mnt/ncol-prefix/capture201108{i}_ncol_prefix.ncol', sep=" ", header=None, names=["source", "target", "weight"])
    a_tmp["source"] = a_tmp["source"].apply(lambda name: node_idx[name])
    a_tmp["target"] = a_tmp["target"].apply(lambda name: node_idx[name])
    # Selecting a single column already yields the 1-D arrays that coo_matrix expects.
    a_source = a_tmp["source"].to_numpy()
    a_target = a_tmp["target"].to_numpy()
    a_weight = a_tmp["weight"].to_numpy()
    a = sparse.coo_matrix((a_weight, (a_source, a_target)), shape=(x.shape[0], x.shape[0]))

    y = tf.cast(x_sorted["label"].to_numpy(), dtype=tf.dtypes.int64)

    filename = f'/mnt/grafos_npz2/graph_201108{i}.npz'
    np.savez(filename, x=x, a=a, y=y)

    # Free memory before the next capture. Only names that actually exist are listed:
    # the previous `feature_names` was never defined and made this statement raise NameError.
    del x_tmp, x_sorted, x, a_tmp, a_source, a_target, a_weight, a, y
    gc.collect()


In [None]:
# Load the previously saved .npz files back as Spektral Graph objects.
captures = ["10","11","12","15","15-2","16","16-2","18","18-2","15-3"] # for training
npz_paths = [f'/mnt/grafos_npz2/graph_201108{capture}.npz' for capture in captures]

output = []
for npz_path in npz_paths:
    data = np.load(npz_path, allow_pickle=True)
    # data['a'] is indexed with an empty tuple to unwrap the stored adjacency object.
    adjacency = data['a'][()]
    output.append(Graph(x=data['x'], a=adjacency, y=data['y']))


# Creamos clase CTU13

Según https://graphneural.network/creating-dataset/ para crear un dataset propio hay que crear una clase e incluir un método `download` y uno `read`: si al querer leer los datos, no los encuentra en el path correspondiente, entonces los descarga.
En nuestro caso, además, lo que hace la "descarga" es "acomodar" los datos a un formato propio de Spektral.

In [35]:
class CTU13(Dataset):
    """Spektral dataset holding one graph per capture file.

    ``download`` converts the per-capture CSV feature files and ncol edge lists
    into ``.npz`` files stored under ``self.path``; ``read`` loads those files
    back and returns them as a list of ``Graph`` objects.
    """

    # Capture suffixes (appended to "201108"). The order of this list is the
    # order of the graphs returned by ``read`` and therefore the dataset index.
    CAPTURES = ["10","11","12","15","15-2","16","16-2","16-3","17","18","18-2","19","15-3"]

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def download(self):
        """Build one ``graph_201108<capture>.npz`` file per capture under ``self.path``.

        Each file stores:
            x: node features (every CSV column except "node" and "label"), float32
            a: adjacency matrix as a scipy COO sparse matrix built from the ncol file
            y: integer label of each node, int64
        Rows of ``x`` and entries of ``y`` are ordered by the integer node index.
        """
        # makedirs also creates missing parent directories and tolerates an existing one.
        os.makedirs(self.path, exist_ok=True)

        for i in self.CAPTURES:
            # Read files with nodes features (csv file) and connections between nodes (ncol file)
            x_tmp = pd.read_csv(f'/mnt/features-prefix/capture201108{i}_features_prefix.csv', sep=",", header=0)
            a_tmp = pd.read_csv(f'/mnt/ncol-prefix/capture201108{i}_ncol_prefix.ncol', sep=" ", header=None, names=["source", "target", "weight"])

            # Create dictionaries that identify each node and label with an integer.
            # NOTE(review): class_idx is rebuilt from the labels present in each capture, so the
            # same label may get a different integer in captures with different label sets — confirm.
            class_idx = {name: idx for idx, name in enumerate(sorted(x_tmp["label"].unique()))}
            node_idx = {name: idx for idx, name in enumerate(sorted(x_tmp["node"].unique()))}

            # Change node names and label for their corresponding integer
            x_tmp["node"] = x_tmp["node"].apply(lambda name: node_idx[name])
            x_tmp["label"] = x_tmp["label"].apply(lambda value: class_idx[value])
            a_tmp["source"] = a_tmp["source"].apply(lambda name: node_idx[name])
            a_tmp["target"] = a_tmp["target"].apply(lambda name: node_idx[name])

            # Sort once so that features and labels share the same node order.
            x_sorted = x_tmp.sort_values("node")

            # Node features. astype returns a new array, so its result must be assigned.
            feature_cols = x_tmp.columns.difference(["node","label"], sort=False)
            x = x_sorted[feature_cols].to_numpy().astype(np.float32)

            # Source, target and weight as 1-D arrays to create a sparse matrix
            a_source = a_tmp["source"].to_numpy()
            a_target = a_tmp["target"].to_numpy()
            a_weight = a_tmp["weight"].to_numpy()
            # Adjacency matrix:
            a = sparse.coo_matrix((a_weight, (a_source, a_target)), shape=(x.shape[0], x.shape[0]))

            # Labels (the cast result is assigned for the same reason as above):
            y = x_sorted["label"].to_numpy().astype(np.int64)

            # Save in format npz
            filename = os.path.join(self.path, f'graph_201108{i}.npz')
            np.savez(filename, x=x, a=a, y=y)

            # Free memory
            del x_tmp, x_sorted, x, a_tmp, a_source, a_target, a_weight, a, y
            gc.collect()

    def read(self):
        """Load every capture's ``.npz`` file and return the list of ``Graph`` objects."""
        # We must return a list of Graph objects
        output = []

        for i in self.CAPTURES:
            # allow_pickle=True is needed to read back the adjacency object saved by download();
            # unpickling executes arbitrary code, so only load files this class wrote itself.
            data = np.load(os.path.join(self.path, f'graph_201108{i}.npz'), allow_pickle=True)
            output.append(
                Graph(x=data['x'], a=data['a'][()], y=data['y'])  # a=data['a'].item() also works
            )

        return output


# Cargamos el dataset

Separamos en train, validation, test

In [37]:
# Instantiate the custom dataset with default arguments (no kwargs are passed to Dataset).
dataset = CTU13()

In [38]:
# Capture number 9 (capture20110817, position 8 in the dataset) is held out for testing.
# It is selected with a list so the result is still a Dataset: an integer index
# returns a single Graph, which cannot be handed to a data loader.
TEST_IDX = 8
dataset_test = dataset[[TEST_IDX]]

# All other captures are shuffled reproducibly and used for training/validation.
np.random.seed(123)
train_val_idx = [idx for idx in range(len(dataset)) if idx != TEST_IDX]
dataset_tosplit = dataset[np.random.choice(train_val_idx, len(train_val_idx), replace=False)]

# Split in training (first 80%) and validation (remaining 20%).
split = int(0.8 * len(dataset_tosplit))
dataset_train, dataset_val = dataset_tosplit[:split], dataset_tosplit[split:]

In [39]:
# Re-seed and repeat the same draw used for the split, only to display the resulting
# order of capture indices (the first 80% go to training, the rest to validation).
np.random.seed(123)
np.random.choice([0,1,2,3,4,5,6,7,9,10,11,12], 12, replace=False)

array([ 5,  0,  4, 10,  9,  7, 11,  3,  1,  6, 12,  2])

In [40]:
# Display the training subset (the repr reports how many graphs it holds).
dataset_train

CTU13(n_graphs=9)

In [41]:
dataset_val # validation captures: 20110816-2, 20110815-3, 20110812

CTU13(n_graphs=3)

# Data loaders

In [42]:
# Data loaders: one DisjointLoader per subset, all node-level and sharing the batch size.
batch_size = 32
epochs = 2
loader_kwargs = dict(node_level=True, batch_size=batch_size)

# Only the training loader is bounded by a number of epochs and asked to shuffle.
loader_train = DisjointLoader(dataset_train, epochs=epochs, shuffle=True, **loader_kwargs)
loader_val = DisjointLoader(dataset_val, **loader_kwargs)
loader_test = DisjointLoader(dataset_test, **loader_kwargs)


# Copia del código de Harpo
Lo que sigue es copia directa (o sea, sin pensar) de lo que subió Harpo en https://github.com/harpomaxx/graph-representation-learning/blob/harpo-branch-pkts/code/python/notebooks/spektral-example.ipynb

Da error, lo próximo que voy a analizar es ver si ese error tiene que ver con que sean problemas diferentes (en el ejemplo de Harpo el objetivo es clasificar grafos, en el problema de CTU13 el objetivo es clasificar nodos).

El error dice "ValueError: Shapes (None, 106580) and (None, 605195) are incompatible". 

A tener en cuenta: capture20110816 tiene 106580 nodos, mientras que capture20110810 tiene 605195 nodos; son exactamente las dos dimensiones que aparecen en el error.

In [30]:
def create_gcn_model():
    """Build the Keras model: two GINConv + Dropout stages, mean pooling, softmax head.

    Reads ``dataset.n_node_features`` and ``dataset.n_labels`` from the notebook's
    global ``dataset`` to size the input and output layers.

    Returns:
        A ``Model`` taking ``[node features, sparse adjacency, segment indices]``
        and producing the softmax output of the final Dense layer.

    NOTE(review): ``tf.math.segment_mean`` averages node embeddings per segment
    index, so the head emits one row per segment rather than one per node, while
    the loaders in this notebook are created with ``node_level=True`` — confirm
    this is the intended head for node classification.
    """
    # Inputs: node feature matrix, sparse adjacency matrix and per-node segment index.
    node_features = Input(shape=(dataset.n_node_features,))
    adjacency = Input((None,), sparse=True)
    segment_ids = Input(shape=(), dtype=tf.int32)

    # Two identical stages: GINConv (32 units, ReLU) followed by dropout at rate 0.5.
    hidden = node_features
    for _ in range(2):
        hidden = GINConv(32, activation="relu")([hidden, adjacency])
        hidden = Dropout(0.5)(hidden)

    # Mean of the node embeddings within each segment, then the softmax classifier.
    pooled = tf.math.segment_mean(hidden, segment_ids)
    out = Dense(dataset.n_labels, activation="softmax")(pooled)

    return Model(inputs=[node_features, adjacency, segment_ids], outputs=out)




In [31]:
model = create_gcn_model()
# `learning_rate` is the current argument name; the old `lr` alias is deprecated and
# emits a warning every time the optimizer is constructed.
optimizer = Adam(learning_rate=0.01)
loss_fn = CategoricalCrossentropy()

  super(Adam, self).__init__(name, **kwargs)


In [32]:
# Compiled as a TensorFlow graph; the signature comes from loader_train and shapes
# are relaxed because graph sizes vary from batch to batch.
@tf.function(input_signature=loader_train.tf_signature(), experimental_relax_shapes=True)
def train_step(inputs, target):
    """Run one optimisation step on a batch and return ``(loss, accuracy)``.

    Args:
        inputs: model inputs for the batch, as yielded by ``loader_train``.
        target: labels for the batch.

    Returns:
        The total loss (``loss_fn`` plus the model's regularisation losses) and
        the mean categorical accuracy of the predictions on this batch.
    """
    with tf.GradientTape() as tape:
        # training=True enables training-only behaviour such as dropout.
        predictions = model(inputs, training=True)
        data_loss = loss_fn(target, predictions)
        loss = data_loss + sum(model.losses)

    # Back-propagate and update every trainable variable of the model.
    trainable = model.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    acc = tf.reduce_mean(categorical_accuracy(target, predictions))
    return loss, acc


In [33]:
def evaluate(loader):
    """Evaluate the global ``model`` over one epoch of ``loader``.

    Args:
        loader: a data loader exposing ``steps_per_epoch`` and yielding
            ``(inputs, target)`` batches.

    Returns:
        Array ``[loss, accuracy]`` averaged over the batches, each batch weighted
        by ``len(target)``; ``None`` if the loader reports no steps.
    """
    total_steps = loader.steps_per_epoch
    if total_steps <= 0:
        return None

    records = []
    for _ in range(total_steps):
        inputs, target = next(loader)
        pred = model(inputs, training=False)
        batch_loss = loss_fn(target, pred)
        batch_acc = tf.reduce_mean(categorical_accuracy(target, pred))
        # The last column keeps the batch size, used below as the averaging weight.
        records.append((batch_loss, batch_acc, len(target)))

    records = np.array(records)
    metrics, weights = records[:, :-1], records[:, -1]
    return np.average(metrics, 0, weights=weights)


In [34]:
# The epoch counter starts at -1 so the first completed epoch is reported as "Ep. 0".
# The step counter starts at 0: it is incremented BEFORE the end-of-epoch check, so it
# reaches steps_per_epoch after exactly that many batches. (Starting it at -1 made the
# first epoch one batch too long and left the last epoch unreported.)
epoch = -1
step = 0
results = []

# loader_train stops by itself after the configured number of epochs.
for batch in loader_train:
    step += 1

    # One optimisation step on the current batch.
    loss, acc = train_step(*batch)
    results.append((loss, acc))

    # End of an epoch: report the metrics and reset the per-epoch accumulators.
    if step == loader_train.steps_per_epoch:
        step = 0
        epoch += 1

        # NOTE(review): the per-epoch evaluation uses loader_test; loader_val is created
        # but never used here — confirm which subset is intended.
        results_te = evaluate(loader_test)

        # Mean training loss/accuracy over the epoch, then test loss/accuracy.
        print(
            "Ep. {} - Loss: {:.3f} - Acc: {:.3f} - Test loss: {:.3f} - Test acc: {:.3f}".format(
                epoch, *np.mean(results, 0), *results_te
            )
        )

        results = []


  np.random.shuffle(a)


ValueError: in user code:

    File "<ipython-input-32-5cb04d2d4fb9>", line 10, in train_step  *
        loss = loss_fn(target, predictions) + sum(model.losses)
    File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 139, in __call__  **
        losses = call_fn(y_true, y_pred)
    File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 243, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 1787, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/usr/local/lib/python3.8/dist-packages/keras/backend.py", line 5119, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 106580) and (None, 605195) are incompatible
