In [1]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout,Input
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import categorical_accuracy #AUC 
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from spektral.layers import GINConv,GCNConv #, GCSConv, GlobalAvgPool
from spektral.utils.sparse import sp_matrix_to_sp_tensor
from spektral.data import DisjointLoader, BatchLoader, Dataset, Graph
from spektral.transforms.normalize_adj import NormalizeAdj
from spektral.transforms.normalize_one import NormalizeOne
from spektral.transforms.normalize_sphere import NormalizeSphere
import gc
import spektral.datasets
from spektral.data import DisjointLoader, BatchLoader
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from spektral.models.gcn import GCN 

2023-05-04 02:32:29.313374: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import os
import pandas as pd
import numpy as np
from scipy import sparse
import gc
from spektral.data import Dataset, Graph

# Custom dataset class
class MyDataset(Dataset):

    def __init__(self, graph_feature_files, ncol_files, **kwargs):
        # Store the graph feature and ncol file lists
        self.graph_feature_files = graph_feature_files
        self.ncol_files = ncol_files
        super().__init__(**kwargs)

    def download(self):
        pass

    def read(self):
        # We must return a list of Graph objects
        output = []

        # Iterate through graph_feature_files and ncol_files
        for graph_feature_file, ncol_file in zip(self.graph_feature_files, self.ncol_files):
            # Read graph features
            x_tmp = pd.read_csv(graph_feature_file, sep=",", header=0)

            # Read graph topology
            a_tmp = pd.read_csv(ncol_file, sep=" ", header=None, names=["source", "target", "weight"])

            # Create dictionaries that identify each node and label with an integer
            class_idx = {name: idx for idx, name in enumerate(sorted(x_tmp["label"].unique()))}
            node_idx = {name: idx for idx, name in enumerate(sorted(x_tmp["node"].unique()))}

            # Change node names and label for their corresponding integer
            x_tmp["node"] = x_tmp["node"].apply(lambda name: node_idx[name])
            x_tmp["label"] = x_tmp["label"].apply(lambda value: class_idx[value])
            a_tmp["source"] = a_tmp["source"].apply(lambda name: node_idx[name])
            a_tmp["target"] = a_tmp["target"].apply(lambda name: node_idx[name])

            # Node features:
            x = x_tmp.sort_values("node")[x_tmp.columns.difference(["node","label"], sort=False)].to_numpy()
            x = x.astype(np.float32)

            # Create adjacency matrix from source, target, and weight
            a_source = a_tmp[["source"]].to_numpy().T
            a_source = np.reshape(a_source, a_source.shape[-1])
            a_target = a_tmp[["target"]].to_numpy().T
            a_target = np.reshape(a_target, a_target.shape[-1])
            a_weight = a_tmp[["weight"]].to_numpy().T
            a_weight = np.reshape(a_weight, a_weight.shape[-1])

            # Adjacency matrix:
            #a = sparse.coo_matrix((a_weight, (a_source, a_target)), shape=(x.shape[0], x.shape[0]))
            a = sparse.csr_matrix((a_weight, (a_source, a_target)), shape=(x.shape[0], x.shape[0]), dtype=np.float32)

            # Label:
            y = x_tmp.sort_values("node")["label"].to_numpy()
            y.astype(np.int64)

            # Create a Graph object and add it to the output list
            output.append(Graph(x=x, a=a, y=y))

            # Free memory
            del x_tmp, x, a_tmp, a_source, a_target, a_weight, a, y
            gc.collect()

        return output



In [3]:
#import os
#os.listdir("")
import glob
sorted(glob.glob("/mnt/CEPH/ctu13/features/*"))
sorted(glob.glob("/mnt/CEPH/ctu13/ncol/*"))


['/mnt/CEPH/ctu13/ncol/capture20110810.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110811.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110812.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110815-2.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110815-3.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110815.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110816-2.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110816-3.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110816.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110817.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110818-2.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture20110818.binetflow.labels-positive-weights.ncol',
 '/mnt/CEPH/ctu13/ncol/capture

In [4]:
# Create a list of graph feature files and ncol files
graph_feature_files = sorted(glob.glob("/mnt/CEPH/ctu13/features/*"))
ncol_files = sorted(glob.glob("/mnt/CEPH/ctu13/ncol/*"))

ncol_files
# Instantiate the dataset
dataset = MyDataset(graph_feature_files, ncol_files)

In [14]:
for j in range(13):
    suma = 0
    for i in range(dataset[j].n_nodes):
        if all(dataset[j].y[i] == [0.,1.]):
            suma+=1
    print("cap ",j+1,": infected:",suma, " - normal:",dataset[j].n_nodes-suma, " -- prop:",suma/dataset[j].n_nodes)

cap  1 : infected: 0  - normal: 605195  -- prop: 0.0
cap  2 : infected: 0  - normal: 440574  -- prop: 0.0
cap  3 : infected: 0  - normal: 430265  -- prop: 0.0
cap  4 : infected: 0  - normal: 41399  -- prop: 0.0
cap  5 : infected: 0  - normal: 313678  -- prop: 0.0
cap  6 : infected: 0  - normal: 184901  -- prop: 0.0
cap  7 : infected: 0  - normal: 37943  -- prop: 0.0
cap  8 : infected: 0  - normal: 381450  -- prop: 0.0
cap  9 : infected: 0  - normal: 106580  -- prop: 0.0
cap  10 : infected: 0  - normal: 365794  -- prop: 0.0
cap  11 : infected: 0  - normal: 41712  -- prop: 0.0
cap  12 : infected: 0  - normal: 196686  -- prop: 0.0
cap  13 : infected: 0  - normal: 93830  -- prop: 0.0


In [6]:
split = int(0.8 * len(dataset))
dataset_train, dataset_test = dataset[:split], dataset[split:]

In [7]:
dataset_train[0].a

<605195x605195 sparse matrix of type '<class 'numpy.float32'>'
	with 1234211 stored elements in Compressed Sparse Row format>

In [8]:
batch_size = 1
loader_train = DisjointLoader(dataset_train, node_level= True, batch_size=batch_size, epochs=200, shuffle=False, )
loader_test = DisjointLoader(dataset_test, node_level = True, batch_size=batch_size)

In [9]:
def create_gcn_model():
    # Define input placeholders for node features, adjacency matrix, and segment indices
    X_in = Input(shape=(dataset.n_node_features,))
    A_in = Input((None,), sparse=True)
    I_in = Input(shape=(), dtype=tf.int32)

    # Apply the first GINConv layer with 32 units and ReLU activation
    X_1 = GINConv(32, activation="relu")([X_in, A_in])
    # Apply dropout with a rate of 0.5
    X_1 = Dropout(0.5)(X_1)

    # Apply the second GINConv layer with 32 units and ReLU activation
    X_2 = GINConv(32, activation="relu")([X_1, A_in])
    # Apply dropout with a rate of 0.5
    X_2 = Dropout(0.5)(X_2)

    # Aggregate the node features using the segment_mean function and the segment indices
    X_3 = tf.math.segment_mean(X_2, I_in)
    # Apply a dense output layer with the number of labels and softmax activation
    out = Dense(dataset.n_labels, activation="softmax")(X_3)

    # Create and return the model with the defined inputs and outputs
    model = Model(inputs=[X_in, A_in, I_in], outputs=out)
    return model

In [10]:
#model = create_gcn_model()
model = GCN(n_labels=dataset.n_labels)
optimizer = Adam(lr=0.01)
loss_fn = BinaryCrossentropy()

2023-05-04 02:32:49.860070: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-04 02:32:49.865269: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-04 02:32:49.868927: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [11]:
# Decorate the function with @tf.function to compile as a TensorFlow graph
# Use the input_signature from loader_train and relax shapes for varying graph sizes
#@tf.function(input_signature=loader_train.tf_signature(), experimental_relax_shapes=True)
def train_step(inputs, target):
    # Create a GradientTape context to record operations for automatic differentiation
    with tf.GradientTape() as tape:
        # Compute model predictions with the inputs, set training=True for training-specific behaviors
        predictions = model(inputs, training=True)
        # Calculate the loss using the provided loss_fn and add the model's regularization losses
        loss = loss_fn(target, predictions) + sum(model.losses)

    # Compute gradients of the loss with respect to the model's trainable variables
    gradients = tape.gradient(loss, model.trainable_variables)
    # Apply the gradients to the model's variables using the optimizer's apply_gradients method
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Compute the accuracy using the categorical_accuracy function from TensorFlow
    # Calculate the mean accuracy using tf.reduce_mean
    acc = tf.reduce_mean(categorical_accuracy(target, predictions))

    # Return the loss and accuracy as output
    return loss, acc

In [12]:
def evaluate(loader):
    output = []
    step = 0
    while step < loader.steps_per_epoch:
        step += 1
        inputs, target = loader.__next__()
        pred = model(inputs, training=False)
        outs = (
            loss_fn(target, pred),
            tf.reduce_mean(categorical_accuracy(target, pred)),
            len(target),  # Keep track of batch size
        )
        output.append(outs)
        if step == loader.steps_per_epoch:
            output = np.array(output)
            return np.average(output[:, :-1], 0, weights=output[:, -1])

In [13]:
# Initialize the epoch and step counters to -1
# Create an empty list for storing training results
epoch = step = -1
results = []

# Iterate through the batches in the loader_train data loader
for batch in loader_train:
    #print(batch)
    # Increment the step counter
    step += 1

    # Execute the train_step function with the current batch
    # Obtain the loss and accuracy
    loss, acc = train_step(*batch)

    # Append the loss and accuracy to the results list
    results.append((loss, acc))

    # Check if the current step is equal to the number of steps per epoch (loader_train.steps_per_epoch)
    if step == loader_train.steps_per_epoch:
        # Reset the step counter to 0
        # Increment the epoch counter
        step = 0
        epoch += 1

        # Evaluate the model on the test set using the evaluate function (which should be defined beforehand)
        # Store the test results in results_te
        results_te = evaluate(loader_test)

        # Print the epoch number, mean training loss and accuracy, and test loss and accuracy
        print(
            "Ep. {} - Loss: {:.3f} - Acc: {:.3f} - Test loss: {:.3f} - Test acc: {:.3f}".format(
                epoch, *np.mean(results, 0), *results_te
            )
        )

        # Reset the results list to start collecting results for the next epoch
        results = []

2023-05-04 02:33:01.952204: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.33TiB (rounded to 1465043952128)requested by op MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-05-04 02:33:01.952278: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2023-05-04 02:33:01.952301: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 10, Chunks in use: 10. 2.5KiB allocated for chunks. 2.5KiB in use in bin. 56B client-requested in use in bin.
2023-05-04 02:33:01.952320: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 1, Chunks in use: 1. 512B allocated for chunks. 512B in use in bin. 448B client-requested in use in bin.
2023-05-04 02:33:01.952339: I tensorflow/tsl/framework/bfc_allocator

ResourceExhaustedError: Exception encountered when calling layer 'gcn_conv_1' (type GCNConv).

{{function_node __wrapped__MatMul_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[605195,605195] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:MatMul]

Call arguments received by layer 'gcn_conv_1' (type GCNConv):
  • inputs=['tf.Tensor(shape=(605195, 16), dtype=float32)', 'SparseTensor(indices=tf.Tensor(\n[[     0 252661]\n [     1  97219]\n [     2  97219]\n ...\n [605191 605194]\n [605192 605194]\n [605193 605194]], shape=(1234211, 2), dtype=int64), values=tf.Tensor([2.239e+03 6.000e+00 2.000e+00 ... 6.000e+00 6.000e+00 1.200e+01], shape=(1234211,), dtype=float32), dense_shape=tf.Tensor([605195 605195], shape=(2,), dtype=int64))']
  • mask=None