In [None]:
!pip install spektral

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout,Input
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from spektral.layers import GINConv,GCNConv #, GCSConv, GlobalAvgPool
from spektral.utils.sparse import sp_matrix_to_sp_tensor
from spektral.data import DisjointLoader, BatchLoader, Dataset, Graph
#from spektral.transforms.normalize_adj import NormalizeAdj
import gc
import spektral.datasets
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from spektral.models.gcn import GCN

# Creamos clase CTU13

Según https://graphneural.network/creating-dataset/ para crear un dataset propio hay que crear una clase e incluir un método `download` y uno `read`: si al querer leer los datos, no los encuentra en el path correspondiente, entonces los descarga.
En nuestro caso, además, lo que hace la "descarga" es "acomodar" los datos a un formato propio de Spektral.

### UPDATES

**UPDATE 1:** Agrego función que hace un one-hot encoding en los labels

**UPDATE 2:** Agrego links a los archivos con los que genero la clase `CTU13` para que los puedan descargar.

In [2]:
def _normalize(x, norm=None):
        """
        Copied from: https://github.com/danielegrattarola/spektral/blob/master/spektral/datasets/tudataset.py
        
        Apply one-hot encoding or z-score to a list of node features
        """
        if norm == "ohe":
            fnorm = OneHotEncoder(sparse_output=False, categories="auto")
        elif norm == "zscore":
            fnorm = StandardScaler()
        else:
            return x
        return fnorm.fit_transform(x)


In [3]:
class CTU13(Dataset):
    """
    Spektral dataset holding one graph per CTU-13 capture.

    ``download`` fetches, for every capture, a CSV file with the node features
    and labels and an ncol file with the weighted edges, and stores the result
    as ``graph_201108<capture>.npz`` inside ``self.path``. ``read`` loads those
    files back as ``Graph`` objects, in the order of ``_SOURCES``.
    """

    # Where the per-capture files are published; ``{}`` takes the share token.
    _URL_TEMPLATE = 'https://nube.ingenieria.uncuyo.edu.ar/s/{}/download'

    # One row per capture: (capture suffix, features CSV token, ncol token).
    # The row position is the index of the graph inside the dataset.
    _SOURCES = (
        ("10",   "ktAgJS22kBXcMsT", "B5EBDnAr4z55cc9"),
        ("11",   "qfM8pjZgDi9EFfd", "Pz4ba4jn3nCNgAp"),
        ("12",   "34PWCn9JmdDA9HE", "EbkwSBHyAkHmdHE"),
        ("15",   "YtcCmiNEyPiYB4Y", "ttyoxLc36s7ABCB"),
        ("15-2", "D2HdaEZYcFz5ryt", "R3b9fe25x6ncoaT"),
        ("16",   "cTY262moFLGjtmn", "wFZ72f9kL3XFki6"),
        ("16-2", "TbfPFbSKWqYqR7n", "7EcYp9ACPqkQqDs"),
        ("16-3", "s2GTjFz8rNxbs4z", "YcTZCARwKCY2jiB"),
        ("17",   "9dZPAENNACreDEK", "3cc8mcTZaEC9LGM"),
        ("18",   "bwR2Zrky49JjtgA", "NDgw4PwXAwQKgb2"),
        ("18-2", "CmYc9JyBsHwzaYD", "wY38ypkj7QSJYib"),
        ("19",   "TNSkGJcq2CPoFtM", "dEZYJ84z53ozZZo"),
        ("15-3", "XwZFrQYzMLNJxAY", "NKdZfBX6DG9nB8o"),
    )

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def download(self):
        """
        Fetch every capture and store it as an ``.npz`` file in ``self.path``.

        Each file contains ``x`` (node features: ID, OD, IDW, ODW),
        ``a`` (sparse weighted adjacency matrix) and ``y`` (one-hot labels).

        NOTE(review): a failure part-way through leaves ``self.path`` in place
        with only some of the files - confirm how the caller recovers.
        """
        # makedirs also creates missing parent folders, which os.mkdir does not
        os.makedirs(self.path, exist_ok=True)

        for capture, features_link, ncol_link in self._SOURCES:
            # Node features (csv file) and connections between nodes (ncol file)
            x_tmp = pd.read_csv(self._URL_TEMPLATE.format(features_link), sep=",", header=0)
            a_tmp = pd.read_csv(
                self._URL_TEMPLATE.format(ncol_link),
                sep=" ",
                header=None,
                names=["source", "target", "weight"],
            )

            # Dictionaries that identify each label and node with an integer.
            # NOTE(review): the label mapping is built per capture from the
            # labels present in that file, so a capture with a single label
            # value would produce a one-column y - confirm all captures
            # contain both classes.
            class_idx = {name: idx for idx, name in enumerate(sorted(x_tmp["label"].unique()))}
            node_idx = {name: idx for idx, name in enumerate(sorted(x_tmp["node"].unique()))}

            # Replace node names and labels by their integer ids
            x_tmp["node"] = x_tmp["node"].apply(lambda name: node_idx[name])
            x_tmp["label"] = x_tmp["label"].apply(lambda value: class_idx[value])
            a_tmp["source"] = a_tmp["source"].apply(lambda name: node_idx[name])
            a_tmp["target"] = a_tmp["target"].apply(lambda name: node_idx[name])

            # Rows sorted by node id so that row k describes node k
            x_sorted = x_tmp.sort_values("node")

            # Node features: every column except the node id and the label
            feature_cols = x_tmp.columns.difference(["node", "label"], sort=False)
            x = x_sorted[feature_cols].to_numpy().astype(np.float32)

            # Source, target and weight as flat arrays to build a sparse matrix
            a_source = a_tmp["source"].to_numpy()
            a_target = a_tmp["target"].to_numpy()
            a_weight = a_tmp["weight"].to_numpy()

            # Adjacency matrix
            a = sparse.csr_matrix(
                (a_weight, (a_source, a_target)),
                shape=(x.shape[0], x.shape[0]),
                dtype=np.float32,
            )

            # Labels, one-hot encoded
            y = x_sorted["label"].to_numpy().astype(np.float32)
            y = _normalize(y[:, None], "ohe")

            # Save in npz format
            filename = os.path.join(self.path, f'graph_201108{capture}.npz')
            np.savez(filename, x=x, a=a, y=y)

            # Free memory before processing the next capture
            del x_tmp, x_sorted, x, a_tmp, a_source, a_target, a_weight, a, y
            gc.collect()

    def read(self):
        """
        Load the stored captures.

        :return: list with one ``Graph`` per capture, in ``_SOURCES`` order.
        """
        output = []

        for capture, _, _ in self._SOURCES:
            filename = os.path.join(self.path, f'graph_201108{capture}.npz')
            # allow_pickle is required because the sparse adjacency matrix was
            # saved as a 0-d object array; unpickling can execute code, so only
            # files written by download() should be placed in self.path.
            # The context manager closes the archive once the arrays are read.
            with np.load(filename, allow_pickle=True) as data:
                output.append(
                    Graph(x=data['x'], a=data['a'][()], y=data['y'])  # a=data['a'].item() also works
                )

        return output
    
    


# Cargamos el dataset

Separamos en train, validation, test

In [4]:
# Instantiate the CTU13 dataset defined in this notebook
dataset = CTU13()

In [4]:
## SHUFFLING THE CAPTURES LEADS TO AN ERROR. LEFT COMMENTED OUT FOR NOW

## capture number 9 (capture20110817) is for testing
#dataset_test = dataset[8]

## other captures are for training
#np.random.seed(123)
#dataset_tosplit = dataset[np.random.choice([0,1,2,3,4,5,6,7,9,10,11,12], 12, replace=False)]

## split in training and validation
#split = int(0.8 * len(dataset_tosplit))
#dataset_train, dataset_val = dataset_tosplit[:split], dataset_tosplit[split:]

In [5]:
## CAPTURES IN THEIR ORIGINAL ORDER (NO SHUFFLING)

# Hold-out captures, taken as one-element slices
dataset_test = dataset[8:9]          # 9th capture (20110817): test set
dataset_test_Tati = dataset[10:11]   # 11th capture (20110818-2): external validation

# All remaining captures form the train/validation pool
dataset_tosplit = dataset[[0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12]]

# First 80% of the pool for training, the rest for validation
split = int(len(dataset_tosplit) * 0.8)
dataset_val = dataset_tosplit[split:]
dataset_train = dataset_tosplit[:split]

In [6]:
# Show the first graph of the dataset as a quick sanity check
dataset[0]

Graph(n_nodes=605195, n_node_features=4, n_edge_features=None, n_labels=2)

# Data loaders

In [6]:
# Loader configuration
batch_size = 1
epochs = 20

# Settings shared by every loader: node-level targets, fixed order
_loader_opts = dict(node_level=True, batch_size=batch_size, shuffle=False)

# Only the training loader is limited to a number of epochs
loader_train = DisjointLoader(dataset_train, epochs=epochs, **_loader_opts)
loader_val = DisjointLoader(dataset_val, **_loader_opts)
loader_test = DisjointLoader(dataset_test, **_loader_opts)
loader_test_Tati = DisjointLoader(dataset_test_Tati, **_loader_opts)

# Copia del código de Harpo
Lo que sigue es copia directa (o sea, sin pensar) de lo que subió Harpo en https://github.com/harpomaxx/graph-representation-learning/blob/harpo-branch-pkts/code/python/notebooks/spektral-example.ipynb

Da error, lo próximo que voy a analizar es ver si ese error tiene que ver con que sean problemas diferentes (en el ejemplo de Harpo el objetivo es clasificar grafos, en el problema de CTU13 el objetivo es clasificar nodos).

El error dice "ValueError: Shapes (None, 106580) and (None, 605195) are incompatible". 

A tener en cuenta: capture20110816 tiene 106580 nodos, mientras que capture20110810 tiene 605195 nodos

### UPDATES

**UPDATE:** el error sobre `shapes` al parecer se debía a que no le estaba pasando correctamente las etiquetas.
Esperaba un array de forma `[n_nodes,n_labels]`, donde en este caso `n_labels=2` y yo le estaba pasando `n_labels` como si fuera `n_nodes`.

Ahora se desprende otro error de una multiplicación de matrices cuando `batch_size=1`:
```
Node: 'gradient_tape/model/dense/MatMul/MatMul_1'
Matrix size-incompatible: In[0]: [1,32], In[1]: [605195,2]
	 [[{{node gradient_tape/model/dense/MatMul/MatMul_1}}]] [Op:__inference_train_step_1155]
```

Y otro error diferente cuando `batch_size=10`:
```
Node: 'categorical_crossentropy/softmax_cross_entropy_with_logits'
logits and labels must be broadcastable: logits_size=[9,2] labels_size=[2424993,2]
	 [[{{node categorical_crossentropy/softmax_cross_entropy_with_logits}}]] [Op:__inference_train_step_1155]
```

**UPDATE 2:**

El modelo `create_gcn_model` no anda (problemas con las dimensiones... tiene sentido porque la capa densa tiene dos neuronas, y lo que nosotros queremos es que prediga la clase para cada nodo. No sé cómo podría modificarlo para que eso dependa de cada grafo).

Por el contrario, lo que hice fue ignorar la función `create_gcn_model` y utilizar directamente el modelo `GCN` de spektral, y con eso al menos empezó a entrenar!!

También modifiqué la loss por una `BinaryCrossentropy`.

Ahora hay que ver qué está haciendo el modelo y qué tan bien anda...

In [7]:
## IGNORE THIS FUNCTION

def create_gcn_model():
    """
    Build a Keras model made of two GINConv + Dropout stages followed by a
    per-segment mean and a softmax Dense layer.

    The node representations are averaged with ``tf.math.segment_mean`` over
    the segment indices before the output layer, so the output has one row
    per segment rather than one row per node.

    :return: a ``Model`` taking ``[X, A, I]`` (node features, sparse adjacency
        matrix, segment indices) as inputs.
    """
    # Inputs: node features, sparse adjacency matrix and segment indices
    X_in = Input(shape=(dataset.n_node_features,))
    A_in = Input((None,), sparse=True)
    I_in = Input(shape=(), dtype=tf.int32)

    # Two identical stages: GINConv (32 units, ReLU) followed by 50% dropout.
    # The shape after each stage is printed as a debugging aid.
    hidden = X_in
    for _ in range(2):
        hidden = GINConv(32, activation="relu")([hidden, A_in])
        hidden = Dropout(0.5)(hidden)
        print(hidden.shape)

    # Mean of the node features within each segment
    pooled = tf.math.segment_mean(hidden, I_in)
    print(pooled.shape)

    # Output layer: one softmax unit per label
    out = Dense(dataset.n_labels, activation="softmax")(pooled)
    print(out.shape)

    return Model(inputs=[X_in, A_in, I_in], outputs=out)


In [7]:
# Optimizer and loss
optimizer = Adam(learning_rate=0.01)
loss_fn = BinaryCrossentropy(from_logits=False)  # alternative tried before: CategoricalCrossentropy()

# Spektral's ready-made GCN is used in place of create_gcn_model()
# NOTE(review): the losses logged during training are extremely large - confirm
# whether the node features and the adjacency matrix need to be normalized
# before being fed to this model.
model = GCN(n_labels=dataset.n_labels)

In [8]:
# Compiled as a TensorFlow graph: the signature comes from loader_train and
# shapes are relaxed because the graphs have different sizes
@tf.function(input_signature=loader_train.tf_signature(), experimental_relax_shapes=True)
def train_step(inputs, target):
    """
    Run one optimization step on a batch.

    :param inputs: model inputs as produced by ``loader_train``.
    :param target: labels for the batch.
    :return: ``(loss, acc)`` where ``loss`` includes the model's own
        regularization losses and ``acc`` is the mean categorical accuracy.
    """
    # Debug aid: prints the symbolic target tensor
    print("target:",str(target))

    # Forward pass recorded on the tape so it can be differentiated
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        print("pred:",str(predictions))
        data_loss = loss_fn(target, predictions)
        loss = data_loss + sum(model.losses)

    # Backward pass and weight update; the variables are fetched after the
    # forward pass, as in the original code
    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Mean accuracy over the batch
    acc = tf.reduce_mean(categorical_accuracy(target, predictions))

    return loss, acc


In [9]:
def evaluate(loader):
    """
    Evaluate the global ``model`` over one epoch of ``loader``.

    :param loader: object exposing ``steps_per_epoch`` and ``__next__``,
        yielding ``(inputs, target)`` batches.
    :return: array ``[loss, accuracy]`` averaged over the batches and weighted
        by ``len(target)``; ``None`` when the loader has no steps.
    """
    per_batch = []
    for _ in range(loader.steps_per_epoch):
        inputs, target = next(loader)
        pred = model(inputs, training=False)
        batch_loss = loss_fn(target, pred)
        batch_acc = tf.reduce_mean(categorical_accuracy(target, pred))
        # The last column keeps the batch size, used as the averaging weight
        per_batch.append((batch_loss, batch_acc, len(target)))

    if not per_batch:
        return None

    per_batch = np.array(per_batch)
    metrics, weights = per_batch[:, :-1], per_batch[:, -1]
    return np.average(metrics, 0, weights=weights)


In [10]:
# Epochs are reported 0-based: the counter starts at -1 and becomes 0 when the
# first epoch completes. The step counter starts at 0 (not -1) so that every
# epoch, including the first, spans exactly loader_train.steps_per_epoch
# batches and the last epoch is evaluated as well.
epoch = -1
step = 0

# Per-batch (loss, accuracy) pairs of the epoch in progress
results = []

# loader_train yields batches for all the configured epochs back to back
for batch in loader_train:
    step += 1

    # One optimization step on the current batch
    loss, acc = train_step(*batch)
    results.append((loss, acc))

    # End of an epoch: validate, report and reset the accumulators
    if step == loader_train.steps_per_epoch:
        step = 0
        epoch += 1

        # Loss and accuracy on the validation set
        results_te = evaluate(loader_val)

        # Epoch number, mean training loss/accuracy and validation loss/accuracy
        print(
            "Ep. {} - Loss: {:.3f} - Acc: {:.3f} - Val loss: {:.3f} - Val acc: {:.3f}".format(
                epoch, *np.mean(results, 0), *results_te
            )
        )

        # Start collecting results for the next epoch
        results = []


target: Tensor("target:0", shape=(None, 2), dtype=float64)
pred: Tensor("gcn/gcn_conv_1/Softmax:0", shape=(None, 2), dtype=float32)
target: Tensor("target:0", shape=(None, 2), dtype=float64)
pred: Tensor("gcn/gcn_conv_1/Softmax:0", shape=(None, 2), dtype=float32)
Ep. 0 - Loss: 39539162742784.000 - Acc: 0.826 - Val loss: 18190060853.425 - Val acc: 0.935
Ep. 1 - Loss: 18862703640576.000 - Acc: 0.921 - Val loss: 4849618959.761 - Val acc: 0.935
Ep. 2 - Loss: 20883594805248.000 - Acc: 0.914 - Val loss: 5185095309.049 - Val acc: 0.935
Ep. 3 - Loss: 2807744692224.000 - Acc: 0.921 - Val loss: 5304629845.518 - Val acc: 0.935
Ep. 4 - Loss: 17491372802048.000 - Acc: 0.924 - Val loss: 5221296716.466 - Val acc: 0.935
Ep. 5 - Loss: 1316098670592.000 - Acc: 0.923 - Val loss: 5664719626.187 - Val acc: 0.935
Ep. 6 - Loss: 19791110733824.000 - Acc: 0.923 - Val loss: 6471750873.097 - Val acc: 0.935
Ep. 7 - Loss: 2389053800448.000 - Acc: 0.923 - Val loss: 7143394094.935 - Val acc: 0.935
Ep. 8 - Loss: 4417