In [1]:
import glob
import os
import shutil
from os import path as osp
from urllib.error import URLError

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import tensorflow as tf
#from tensorflow.keras.layers import Dense, Dropout,Input
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

#from spektral.layers import GINConv,GCNConv #, GCSConv, GlobalAvgPool
from spektral.data import Dataset, Graph, DisjointLoader
from spektral.datasets.utils import download_file
from spektral.utils import io, sparse
from spektral.models.gcn import GCN


####################

## Exploración de los datos de PROTEINS

(Ignorar hasta el siguiente título)

In [19]:
name = "PROTEINS"
path = "/root/spektral/datasets/TUDataset/PROTEINS"
fname_template = osp.join(path, "{}_{{}}.txt".format(name))
available = [
            f.split(os.sep)[-1][len(name) + 1 : -4]  # Remove leading name
            for f in glob.glob(fname_template.format("*"))
]

In [20]:
available

['graph_indicator', 'node_attributes', 'A', 'node_labels', 'graph_labels']

In [21]:
# Batch index
node_batch_index = (
            io.load_txt(fname_template.format("graph_indicator")).astype(int) - 1
)
n_nodes = np.bincount(node_batch_index)
n_nodes_cum = np.concatenate(([0], np.cumsum(n_nodes)[:-1]))

In [22]:
# Read edge lists
edges = io.load_txt(fname_template.format("A"), delimiter=",").astype(int) - 1
# Remove duplicates and self-loops from edges
_, mask = np.unique(edges, axis=0, return_index=True)
mask = mask[edges[mask, 0] != edges[mask, 1]]
edges = edges[mask]
# Split edges into separate edge lists
edge_batch_idx = node_batch_index[edges[:, 0]]
n_edges = np.bincount(edge_batch_idx)
n_edges_cum = np.cumsum(n_edges[:-1])
el_list = np.split(edges - n_nodes_cum[edge_batch_idx, None], n_edges_cum)


In [23]:
def _normalize(x, norm=None):
    """
    Apply one-hot encoding or z-score to a list of node features
    """
    if norm == "ohe":
        fnorm = OneHotEncoder(sparse=False, categories="auto")
    elif norm == "zscore":
        fnorm = StandardScaler()
    else:
        return x
    return fnorm.fit_transform(x)


In [24]:
# Node features
x_list = []
if "node_attributes" in available:
    x_attr = io.load_txt(fname_template.format("node_attributes"), delimiter=",")
    if x_attr.ndim == 1:
        x_attr = x_attr[:, None]
    x_list.append(x_attr)
#if "node_labels" in available:
#    x_labs = io.load_txt(fname_template.format("node_labels"))
#    if x_labs.ndim == 1:
#        x_labs = x_labs[:, None]
#    x_labs = np.concatenate([_normalize(xl_[:, None], "ohe") for xl_ in x_labs.T], -1)
#    x_list.append(x_labs)
if len(x_list) > 0:
    x_list = np.concatenate(x_list, -1)
    x_list = np.split(x_list, n_nodes_cum[1:])
else:
    print(
            "WARNING: this dataset doesn't have node attributes."
            "Consider creating manual features before using it with a "
            "Loader."
        )
    x_list = [None] * len(n_nodes)


In [25]:
len(x_list)

1113

In [26]:
# Edge features
e_list = []
if "edge_attributes" in available:
    e_attr = io.load_txt(fname_template.format("edge_attributes"))
    if e_attr.ndim == 1:
        e_attr = e_attr[:, None]
    e_attr = e_attr[mask]
    e_list.append(e_attr)
if "edge_labels" in available:
    e_labs = io.load_txt(fname_template.format("edge_labels"))
    if e_labs.ndim == 1:
        e_labs = e_labs[:, None]
    e_labs = e_labs[mask]
    e_labs = np.concatenate(
        [_normalize(el_[:, None], "ohe") for el_ in e_labs.T], -1
    )
    e_list.append(e_labs)
if len(e_list) > 0:
    e_available = True
    e_list = np.concatenate(e_list, -1)
    e_list = np.split(e_list, n_edges_cum)
else:
    e_available = False
    e_list = [None] * len(n_nodes)

# Create sparse adjacency matrices and re-sort edge attributes in lexicographic
# order
a_e_list = [
    sparse.edge_index_to_matrix(
        edge_index=el,
        edge_weight=np.ones(el.shape[0]),
        edge_features=e,
        shape=(n, n),
    )
    for el, e, n in zip(el_list, e_list, n_nodes)
]
if e_available:
    a_list, e_list = list(zip(*a_e_list))
else:
    a_list = a_e_list


In [27]:
len(a_list)

1113

In [15]:
# Labels
if "graph_attributes" in available:
    labels = io.load_txt(fname_template.format("graph_attributes"))
elif "graph_labels" in available:
    labels = io.load_txt(fname_template.format("graph_labels"))
    labels = _normalize(labels[:, None], "ohe")
else:
    raise ValueError("No labels available for dataset {}".format(self.name))




In [17]:
len(labels), labels.shape

(1113, (1113, 2))

In [28]:
# Labels
labels = []
if "node_labels" in available:
    x_labs = io.load_txt(fname_template.format("node_labels"))
    if x_labs.ndim == 1:
        x_labs = x_labs[:, None]
    x_labs = np.concatenate([_normalize(xl_[:, None], "ohe") for xl_ in x_labs.T], -1)
    labels.append(x_labs)
if len(labels) > 0:
    labels = np.concatenate(labels, -1)
    labels = np.split(labels, n_nodes_cum[1:])
    
len(labels)



1113

In [29]:
labels[0].shape

(42, 3)

In [12]:
if "node_labels" in available:
    x_labs = io.load_txt(fname_template.format("node_labels"))
    if x_labs.ndim == 1:
        x_labs = x_labs[:, None]
    x_labs = np.concatenate([_normalize(xl_[:, None], "ohe") for xl_ in x_labs.T], -1)




In [13]:
x_labs.shape

(43471, 3)

In [14]:
if "node_labels" in available:
    labels = io.load_txt(fname_template.format("node_labels"))
    labels = _normalize(labels[:, None], "ohe")

labels.shape



(43471, 3)

In [35]:
final=[Graph(x=x, a=a, e=e, y=y) for x, a, e, y in zip(x_list, a_list, e_list, labels)]

In [42]:
final[0].y.shape

(42, 3)

# Adaptación de PROTEINS a clasificación de nodos

Defino una clase nueva `MisProteinas` que tiene una clase por nodo, en lugar de una clase por grafo.

In [2]:
def _normalize(x, norm=None):
    """
    Apply one-hot encoding or z-score to a list of node features
    """
    if norm == "ohe":
        fnorm = OneHotEncoder(sparse=False, categories="auto")
    elif norm == "zscore":
        fnorm = StandardScaler()
    else:
        return x
    return fnorm.fit_transform(x)


In [3]:
class MisProteinas(Dataset):
    """
    Adaptado de https://github.com/danielegrattarola/spektral/blob/master/spektral/datasets/tudataset.py
    
    The Benchmark Data Sets for Graph Kernels from TU Dortmund
    ([link](https://chrsmrrs.github.io/datasets/docs/datasets/)).
    Node features are computed by concatenating the following features for
    each node:
    - node attributes, if available;
    - node labels, if available, one-hot encoded.
    Some datasets might not have node features at all. In this case, attempting
    to use the dataset with a Loader will result in a crash. You can create
    node features using some of the transforms available in `spektral.transforms`
    or you can define your own features by accessing the individual samples in
    the `graph` attribute of the dataset (which is a list of `Graph` objects).
    Edge features are computed by concatenating the following features for
    each node:
    - edge attributes, if available;
    - edge labels, if available, one-hot encoded.
    Graph labels are provided for each dataset.
    Specific details about each individual dataset can be found in
    `~/spektral/datasets/TUDataset/<dataset name>/README.md`, after the dataset
    has been downloaded locally (datasets are downloaded automatically upon
    calling `TUDataset('<dataset name>')` the first time).
    **Arguments**
    - `name`: str, name of the dataset to load (see `TUD.available_datasets`).
    - `clean`: if `True`, rload a version of the dataset with no isomorphic
               graphs.
    """

    url = "https://www.chrsmrrs.com/graphkerneldatasets"
    url_clean = (
        "https://raw.githubusercontent.com/nd7141/graph_datasets/master/datasets"
    )

    def __init__(self, name, clean=False, **kwargs):
        if name not in self.available_datasets():
            raise ValueError(
                "Unknown dataset {}. See {}.available_datasets() for a complete list of"
                "available datasets.".format(name, self.__class__.__name__)
            )
        self.name = name
        self.clean = clean
        super().__init__(**kwargs)

    @property
    def path(self):
        return osp.join(super().path, self.name + ("_clean" if self.clean else ""))

    def download(self):
        print(
            "Downloading {} dataset{}.".format(
                self.name, " (clean)" if self.clean else ""
            )
        )
        url = "{}/{}.zip".format(self.url_clean if self.clean else self.url, self.name)
        download_file(url, self.path, self.name + ".zip")

        # Datasets are zipped in a folder: unpack them
        parent = self.path
        subfolder = osp.join(self.path, self.name)
        for filename in os.listdir(subfolder):
            shutil.move(osp.join(subfolder, filename), osp.join(parent, filename))
        os.rmdir(subfolder)

    def read(self):
        fname_template = osp.join(self.path, "{}_{{}}.txt".format(self.name))
        available = [
            f.split(os.sep)[-1][len(self.name) + 1 : -4]  # Remove leading name
            for f in glob.glob(fname_template.format("*"))
        ]

        # Batch index
        node_batch_index = (
            io.load_txt(fname_template.format("graph_indicator")).astype(int) - 1
        )
        n_nodes = np.bincount(node_batch_index)
        n_nodes_cum = np.concatenate(([0], np.cumsum(n_nodes)[:-1]))

        # Read edge lists
        edges = io.load_txt(fname_template.format("A"), delimiter=",").astype(int) - 1
        # Remove duplicates and self-loops from edges
        _, mask = np.unique(edges, axis=0, return_index=True)
        mask = mask[edges[mask, 0] != edges[mask, 1]]
        edges = edges[mask]
        # Split edges into separate edge lists
        edge_batch_idx = node_batch_index[edges[:, 0]]
        n_edges = np.bincount(edge_batch_idx)
        n_edges_cum = np.cumsum(n_edges[:-1])
        el_list = np.split(edges - n_nodes_cum[edge_batch_idx, None], n_edges_cum)

        # Node features
        x_list = []
        if "node_attributes" in available:
            x_attr = io.load_txt(
                fname_template.format("node_attributes"), delimiter=","
            )
            if x_attr.ndim == 1:
                x_attr = x_attr[:, None]
            x_list.append(x_attr)
        #if "node_labels" in available:
        #    x_labs = io.load_txt(fname_template.format("node_labels"))
        #    if x_labs.ndim == 1:
        #        x_labs = x_labs[:, None]
        #    x_labs = np.concatenate(
        #        [_normalize(xl_[:, None], "ohe") for xl_ in x_labs.T], -1
        #    )
        #    x_list.append(x_labs)
        if len(x_list) > 0:
            x_list = np.concatenate(x_list, -1)
            x_list = np.split(x_list, n_nodes_cum[1:])
        else:
            print(
                "WARNING: this dataset doesn't have node attributes."
                "Consider creating manual features before using it with a "
                "Loader."
            )
            x_list = [None] * len(n_nodes)

        # Edge features
        e_list = []
        if "edge_attributes" in available:
            e_attr = io.load_txt(fname_template.format("edge_attributes"))
            if e_attr.ndim == 1:
                e_attr = e_attr[:, None]
            e_attr = e_attr[mask]
            e_list.append(e_attr)
        if "edge_labels" in available:
            e_labs = io.load_txt(fname_template.format("edge_labels"))
            if e_labs.ndim == 1:
                e_labs = e_labs[:, None]
            e_labs = e_labs[mask]
            e_labs = np.concatenate(
                [_normalize(el_[:, None], "ohe") for el_ in e_labs.T], -1
            )
            e_list.append(e_labs)
        if len(e_list) > 0:
            e_available = True
            e_list = np.concatenate(e_list, -1)
            e_list = np.split(e_list, n_edges_cum)
        else:
            e_available = False
            e_list = [None] * len(n_nodes)

        # Create sparse adjacency matrices and re-sort edge attributes in lexicographic
        # order
        a_e_list = [
            sparse.edge_index_to_matrix(
                edge_index=el,
                edge_weight=np.ones(el.shape[0]),
                edge_features=e,
                shape=(n, n),
            )
            for el, e, n in zip(el_list, e_list, n_nodes)
        ]
        if e_available:
            a_list, e_list = list(zip(*a_e_list))
        else:
            a_list = a_e_list

        # Labels
        labels = []
        if "node_labels" in available:
            x_labs = io.load_txt(fname_template.format("node_labels"))
            if x_labs.ndim == 1:
                x_labs = x_labs[:, None]
            x_labs = np.concatenate([_normalize(xl_[:, None], "ohe") for xl_ in x_labs.T], -1)
            labels.append(x_labs)
        if len(labels) > 0:
            labels = np.concatenate(labels, -1)
            labels = np.split(labels, n_nodes_cum[1:])
        #if "graph_attributes" in available:
        #    labels = io.load_txt(fname_template.format("graph_attributes"))
        #elif "graph_labels" in available:
        #    labels = io.load_txt(fname_template.format("graph_labels"))
        #    labels = _normalize(labels[:, None], "ohe")
        else:
            raise ValueError("No labels available for dataset {}".format(self.name))

        # Convert to Graph
        print("Successfully loaded {}.".format(self.name))
        return [
            Graph(x=x, a=a, e=e, y=y)
            for x, a, e, y in zip(x_list, a_list, e_list, labels)
        ]

    @staticmethod
    def available_datasets():
        url = "https://chrsmrrs.github.io/datasets/docs/datasets/"
        try:
            tables = pd.read_html(url)
            names = []
            for table in tables:
                names.extend(table.Name[1:].values.tolist())
            return names
        except URLError:
            # No internet, don't panic
            print("Could not read URL {}".format(url))
            return []


In [4]:
proteinas=MisProteinas("PROTEINS")

Successfully loaded PROTEINS.




In [6]:
proteinas[0].y.shape

(42, 3)

In [5]:
split = int(0.8 * len(proteinas))
dataset_tosplit, dataset_test = proteinas[:split], proteinas[split:]

split = int(0.8 * len(dataset_tosplit))
dataset_train, dataset_val = dataset_tosplit[:split], dataset_tosplit[split:]

In [6]:

batch_size = 32
loader_train = DisjointLoader(dataset_train, node_level=True,batch_size=batch_size, epochs=10, shuffle=False)
loader_val = DisjointLoader(dataset_val, node_level=True,batch_size=batch_size,shuffle=False)
loader_test = DisjointLoader(dataset_test, node_level=True,batch_size=batch_size,shuffle=False)


In [7]:
model = GCN(n_labels=proteinas.n_labels)
optimizer = Adam(learning_rate=0.01)
loss_fn = CategoricalCrossentropy()

In [8]:
# Decorate the function with @tf.function to compile as a TensorFlow graph
# Use the input_signature from loader_train and relax shapes for varying graph sizes
@tf.function(input_signature=loader_train.tf_signature(), experimental_relax_shapes=True)
def train_step(inputs, target):
    print("target:",str(target))
    # Create a GradientTape context to record operations for automatic differentiation
    with tf.GradientTape() as tape:
        # Compute model predictions with the inputs, set training=True for training-specific behaviors
        predictions = model(inputs, training=True)
        print("pred:",str(predictions))
        #predictions = tf.argmax(predictions1,axis=1)
        # Calculate the loss using the provided loss_fn and add the model's regularization losses
        loss = loss_fn(target, predictions) + sum(model.losses)

    # Compute gradients of the loss with respect to the model's trainable variables
    gradients = tape.gradient(loss, model.trainable_variables)
    # Apply the gradients to the model's variables using the optimizer's apply_gradients method
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Compute the accuracy using the categorical_accuracy function from TensorFlow
    # Calculate the mean accuracy using tf.reduce_mean
    acc = tf.reduce_mean(categorical_accuracy(target, predictions))

    # Return the loss and accuracy as output
    return loss, acc


In [9]:
def evaluate(loader):
    output = []
    step = 0
    while step < loader.steps_per_epoch:
        step += 1
        inputs, target = loader.__next__()
        pred = model(inputs, training=False)
        outs = (
            loss_fn(target, pred),
            tf.reduce_mean(categorical_accuracy(target, pred)),
            len(target),  # Keep track of batch size
        )
        output.append(outs)
        if step == loader.steps_per_epoch:
            output = np.array(output)
            return np.average(output[:, :-1], 0, weights=output[:, -1])


In [10]:
# Initialize the epoch and step counters to -1
# Create an empty list for storing training results
epoch = step = -1
results = []

# Iterate through the batches in the loader_train data loader
for batch in loader_train:
    # Increment the step counter
    step += 1

    # Execute the train_step function with the current batch
    # Obtain the loss and accuracy
    loss, acc = train_step(*batch)

    # Append the loss and accuracy to the results list
    results.append((loss, acc))

    # Check if the current step is equal to the number of steps per epoch (loader_train.steps_per_epoch)
    if step == loader_train.steps_per_epoch:
        # Reset the step counter to 0
        # Increment the epoch counter
        step = 0
        epoch += 1

        # Evaluate the model on the test set using the evaluate function (which should be defined beforehand)
        # Store the test results in results_te
        results_te = evaluate(loader_val) # CAMBIO A loader_val

        # Print the epoch number, mean training loss and accuracy, and test loss and accuracy
        print(
            "Ep. {} - Loss: {:.3f} - Acc: {:.3f} - Val loss: {:.3f} - Val acc: {:.3f}".format(
                epoch, *np.mean(results, 0), *results_te
            )
        )

        # Reset the results list to start collecting results for the next epoch
        results = []


target: Tensor("target:0", shape=(None, 3), dtype=float64)
pred: Tensor("gcn/gcn_conv_1/Softmax:0", shape=(None, 3), dtype=float32)
target: Tensor("target:0", shape=(None, 3), dtype=float64)
pred: Tensor("gcn/gcn_conv_1/Softmax:0", shape=(None, 3), dtype=float32)
Ep. 0 - Loss: 6.310 - Acc: 0.454 - Val loss: 1.047 - Val acc: 0.447
Ep. 1 - Loss: 1.818 - Acc: 0.473 - Val loss: 0.886 - Val acc: 0.539
Ep. 2 - Loss: 1.216 - Acc: 0.474 - Val loss: 0.807 - Val acc: 0.446
Ep. 3 - Loss: 1.035 - Acc: 0.476 - Val loss: 0.776 - Val acc: 0.539
Ep. 4 - Loss: 0.951 - Acc: 0.481 - Val loss: 0.793 - Val acc: 0.446
Ep. 5 - Loss: 0.929 - Acc: 0.490 - Val loss: 0.779 - Val acc: 0.447
Ep. 6 - Loss: 0.883 - Acc: 0.497 - Val loss: 0.787 - Val acc: 0.446
Ep. 7 - Loss: 0.888 - Acc: 0.498 - Val loss: 0.793 - Val acc: 0.446
Ep. 8 - Loss: 0.864 - Acc: 0.490 - Val loss: 0.798 - Val acc: 0.447


In [13]:
loader_test.steps_per_epoch

7

In [11]:
#Predict
for _ in range(loader_test.steps_per_epoch):
    inputs,target = loader_test.__next__()
    y_prediction = model(inputs, training=False)
    y_prediction = np.argmax(y_prediction, axis = 1)
    y_test=np.argmax(target, axis=1)
    #Create confusion matrix and normalizes it over predicted (columns)
    result = tf.math.confusion_matrix(y_test, y_prediction, num_classes=3)
    print(result)

tf.Tensor(
[[309   0   0]
 [347   0   0]
 [  0   0   0]], shape=(3, 3), dtype=int32)
tf.Tensor(
[[292   0   0]
 [346   0   0]
 [ 33   0   0]], shape=(3, 3), dtype=int32)
tf.Tensor(
[[205   0   0]
 [330   0   0]
 [ 18   0   0]], shape=(3, 3), dtype=int32)
tf.Tensor(
[[378   0   0]
 [484   0   0]
 [  0   0   0]], shape=(3, 3), dtype=int32)
tf.Tensor(
[[252   0   0]
 [427   0   0]
 [ 15   0   0]], shape=(3, 3), dtype=int32)
tf.Tensor(
[[230   0   0]
 [321   0   0]
 [ 78   0   0]], shape=(3, 3), dtype=int32)
tf.Tensor(
[[171   0   0]
 [247   0   0]
 [ 78   0   0]], shape=(3, 3), dtype=int32)


In [12]:
suma = []
sumatemp = 0
contador = 0
for i in range(len(dataset_test)):
    sumatemp += dataset_test[i].n_nodes
    contador += 1
    if contador == 32:
        suma.append(sumatemp)
        sumatemp = 0
        contador = 0
    elif i == len(dataset_test)-1:
        suma.append(sumatemp)
        
print(suma)

[656, 671, 553, 862, 694, 629, 496]


## SOBRE LA PREDICCIÓN

Para predecir hay que ejecutar `model(inputs,training=False)`, donde en un paso previo se define `inputs` como `inputs,target = loader_test.__next__()`.

Había intentado predecir con `model.predict(loader_test)`, pero esto NO REALIZA LA PREDICCIÓN. Probablemente se quedaba en un loop infinito o algo así, tratando de "desempaquetar" `loader_test`

En el caso de las proteínas, al hacer `model.predict` lo corté en "653683/Unknown - 7336s 11ms/step", mientras que para CTU13 se quedaba sin memoria mucho antes, no pudiendo calcular ninguna predicción

```
#Predict
y_prediction = model.predict(loader_test)
y_prediction = np.argmax(y_prediction, axis = 1)
y_test=np.argmax(dataset_test[0].y, axis=1)
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_test, y_prediction , normalize='pred')
print(result)
```