### Loading the libraries

In [1]:
import io
import math
import os
import warnings
from pathlib import Path

import compress_pickle as pickle
import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold
from spektral.data import Dataset, DisjointLoader, Graph
from spektral.layers import GCSConv, GlobalAvgPool
from spektral.transforms.normalize_adj import NormalizeAdj
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Disable warnings
warnings.filterwarnings("ignore")

2023-02-07 10:37:20.664831: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-07 10:37:20.724566: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-07 10:37:21.040746: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-07 10:37:21.040780: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

### Setting the model hyperparameters

In [2]:
################################################################################
# Config
################################################################################
learning_rate = 1e-3  # Learning rate
epochs = 400  # Number of training epochs
es_patience = 10  # Patience for early stopping
batch_size = 32  # Batch size
seed = 42  # Fixing random seed for reproducibility
################################################################################

In [3]:
# Set random seed
np.random.seed(seed)

### Loading the dataset in spektral data format

In [4]:
# Don't change this
remake_dataset = True

# Select the dataset to use out of the following: p, ds, cs
dataset_id = "p"
data_dir = Path(f'../data/communication_networks/{dataset_id}')

# Load the datasets
users_df = pd.read_csv(data_dir / 'metadata/users.csv')
questions_df = pd.read_csv(data_dir / 'metadata/questions.csv')
answers_df = pd.read_csv(data_dir / 'metadata/answers.csv')
comments_df = pd.read_csv(data_dir / 'metadata/comments.csv')

In [5]:
# Load the embeddings of the nodes
def get_embeddings(node):
    node_type = [0.0, 0.0, 0.0, 0.0]
    if node[0:2] == 'q_':
        embd = questions_df[questions_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[0] = 1.0
    elif node[0:2] == 'a_':
        embd = answers_df[answers_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[1] = 1.0
    elif node[0:2] == 'c_':
        embd = comments_df[comments_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[2] = 1.0
    elif node[0:2] == 'u_':
        embd = users_df[users_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[3] = 1.0

        # Some users have no records in the users table
        # Use the embeddings of '' as a placeholder
        if len(embd) == 0:
            embd = users_df[users_df['Id'] == 8]['embeddings'].values

    # Change the following line to use different node representations
    #return np.concatenate((json.loads(embd[0]), node_type))
    #return json.loads(embd[0])
    return node_type


# Create a custom dataset for spektral
class CustomDataset(Dataset):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def read(self):

        def make_graphs():
            graphs = []
            labels = []

            for counter, graph_fname in enumerate((data_dir / 'graphs').glob('*.csv')):

                try:
                    nodes = set()
                    edges = set()
                    q_id = int(graph_fname.parts[-1].split('.')[0][1:])

                    if math.isnan(questions_df[questions_df['Id'] == q_id]['AcceptedAnswerId'].values[0]):
                        label = [1, 0]
                        labels.append(0)
                    else:
                        labels.append(1)
                        label = [0, 1]

                    with io.open(graph_fname, 'r') as f:
                        for line in f:
                            a, b = line.strip().split(',')
                            nodes.add(a)
                            nodes.add(b)
                            edges.add((a, b))

                    nodes = list(nodes)
                    encoded_nodes = dict()
                    for i in range(len(nodes)):
                        encoded_nodes[nodes[i]] = i

                    encoded_edges = []
                    for e in edges:
                        encoded_edges.append((encoded_nodes[e[0]], encoded_nodes[e[1]]))

                    node_features = []
                    for node in nodes:
                        node_features.append(get_embeddings(node))

                    node_features = np.array(node_features)

                    nodes = [encoded_nodes[n] for n in nodes]

                    G = nx.Graph()
                    G.add_nodes_from(nodes)
                    G.add_edges_from(edges)

                    spektral_graph = Graph(x=node_features, a=nx.adjacency_matrix(G, nodelist=nodes), y=label)

                except Exception as e:
                    print(e)
                    # print(traceback.format_exc())
                    continue
                else:
                    graphs.append(spektral_graph)
                finally:
                    if counter > 1000:
                        # break
                        pass

            p1labels = sum(labels) / len(labels)
            p0labels = 1 - p1labels

            print(f"0: {p0labels:.2f}; 1: {p1labels:.2f}")

            return graphs

        return make_graphs()


# If the remake_dataset flag is set to True, the dataset is created from scratch and saved in a pickle file otherwise it is loaded from the pickle file
if remake_dataset:
    data = CustomDataset(transforms=NormalizeAdj())
    with io.open(data_dir / 'spektral/data.pkl', 'wb') as f:
        pickle.dump(data, f)
else:
    with io.open(data_dir / 'spektral/data.pkl', 'rb') as f:
        data = pickle.load(f)

0: 0.48; 1: 0.52


### Train and evaluate the model

In [6]:
# Convert the one-hot encoded labels to integers
determine_label = lambda x: 0 if x == [1, 0] else 1

In [7]:
################################################################################
# Build a graph convolutional neural network model
################################################################################
class Net(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = GCSConv(32, activation="relu")
        self.conv2 = GCSConv(32, activation="relu")
        self.conv3 = GCSConv(32, activation="relu")
        self.global_pool = GlobalAvgPool()
        self.dense = Dense(data.n_labels, activation="softmax")

    def call(self, inputs):
        x, a, i = inputs
        x = self.conv1([x, a])
        x = self.conv2([x, a])
        x = self.conv3([x, a])
        output = self.global_pool([x, i])
        output = self.dense(output)

        return output


# Evaluate the model
def evaluate(loader):
    y_real = []
    y_hat = []

    output = []
    step = 0
    while step < loader.steps_per_epoch:
        step += 1
        inputs, target = loader.__next__()
        pred = model(inputs, training=False)

        y_real += [determine_label(list(i)) for i in target]
        y_hat += np.argmax(pred.numpy(), axis=1).tolist()

        outs = (
            loss_fn(target, pred),
            tf.reduce_mean(categorical_accuracy(target, pred)),
            len(target),  # Keep track of batch size
        )
        output.append(outs)
        if step == loader.steps_per_epoch:
            output = np.array(output)
            return np.average(output[:, :-1], 0, weights=output[:, -1]), (
                recall_score(y_real, y_hat), precision_score(y_real, y_hat), f1_score(y_real, y_hat))

In [8]:
# Create the X and y arrays
X = data

y = [determine_label(i.y) for i in data]


# Print the class ratio
def print_class_ratio(y, ds_name="full"):
    l1 = sum(y) / len(y)
    l0 = 1 - l1
    print(f"Dataset: {ds_name}; 0: {l0:.2f}; 1: {l1:.2f}")


print_class_ratio(y)

results = []

# Use stratified k-fold cross validation to evaluate the model on the dataset
skf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for i, (train_index, val_index) in enumerate(skf.split(X, y)):

    print(f"Fold {i}:")

    y_train = [determine_label(i.y) for i in data[train_index]]
    print_class_ratio(y_train, ds_name="train")

    y_val = [determine_label(i.y) for i in data[val_index]]
    print_class_ratio(y_val, ds_name="val")

    loader_tr = DisjointLoader(data[train_index], batch_size=batch_size, epochs=epochs)
    loader_va = DisjointLoader(data[val_index], batch_size=batch_size)

    model = Net()
    optimizer = Adam(learning_rate=learning_rate)
    loss_fn = CategoricalCrossentropy()


    ################################################################################
    # Fit model on training set
    ################################################################################
    @tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True)
    def train_step(inputs, target):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            loss = loss_fn(target, predictions) + sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        acc = tf.reduce_mean(categorical_accuracy(target, predictions))
        return loss, acc


    epoch = step = 0
    best_val_loss = np.inf
    best_weights = None
    patience = es_patience
    for batch in loader_tr:
        step += 1

        train_step(*batch)

        if step == loader_tr.steps_per_epoch:
            step = 0
            epoch += 1

            # Compute validation loss and accuracy
            loss_acc, other_metrics = evaluate(loader_va)

            val_loss, val_acc = loss_acc
            val_rec, val_prec, val_f1 = other_metrics

            # Check if loss improved for early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_val_acc = val_acc
                best_val_rec = val_rec
                best_val_prec = val_prec
                best_val_f1 = val_f1
                patience = es_patience
            else:
                patience -= 1
                if patience == 0:
                    print(f"Early stopping after {epoch} epochs with best val_acc: {best_val_acc:.3f};"
                          f" best val_rec: {best_val_rec:.3f}; best val_prec: {best_val_prec:.3f}; best val_f1: {best_val_f1:.3f}")
                    results.append((best_val_acc, best_val_rec, best_val_prec, best_val_f1))
                    break

    print(100 * "==")

Dataset: full; 0: 0.48; 1: 0.52
Fold 0:
Dataset: train; 0: 0.48; 1: 0.52
Dataset: val; 0: 0.48; 1: 0.52


2023-02-07 10:38:13.157176: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-02-07 10:38:13.157192: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: bfg5
2023-02-07 10:38:13.157196: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: bfg5
2023-02-07 10:38:13.157257: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.108.3
2023-02-07 10:38:13.157269: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 510.108.3
2023-02-07 10:38:13.157272: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 510.108.3
2023-02-07 10:38:13.157432: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep 

Early stopping after 17 epochs with best val_acc: 0.580; best val_rec: 0.681; best val_prec: 0.579; best val_f1: 0.626
Fold 1:
Dataset: train; 0: 0.48; 1: 0.52
Dataset: val; 0: 0.49; 1: 0.51
Early stopping after 21 epochs with best val_acc: 0.571; best val_rec: 0.663; best val_prec: 0.572; best val_f1: 0.614
Fold 2:
Dataset: train; 0: 0.48; 1: 0.52
Dataset: val; 0: 0.49; 1: 0.51
Early stopping after 17 epochs with best val_acc: 0.570; best val_rec: 0.602; best val_prec: 0.580; best val_f1: 0.591
Fold 3:
Dataset: train; 0: 0.48; 1: 0.52
Dataset: val; 0: 0.48; 1: 0.52
Early stopping after 24 epochs with best val_acc: 0.564; best val_rec: 0.658; best val_prec: 0.566; best val_f1: 0.609
Fold 4:
Dataset: train; 0: 0.48; 1: 0.52
Dataset: val; 0: 0.48; 1: 0.52
Early stopping after 31 epochs with best val_acc: 0.589; best val_rec: 0.719; best val_prec: 0.582; best val_f1: 0.643


### Saving the experiment results

In [9]:
# Save the results in a dataframe
results_dic = {"dataset": dataset_id, "acc": [i[0] for i in results], "rec": [i[1] for i in results],
               "prec": [i[2] for i in results], "f1": [i[3] for i in results], "method": "GCN",
               "fold": [i for i in range(1, 6)]}

results_df = pd.DataFrame(results_dic)
results_df

Unnamed: 0,dataset,acc,rec,prec,f1,method,fold
0,p,0.579924,0.680851,0.578581,0.625564,GCN,1
1,p,0.571067,0.663391,0.572034,0.614334,GCN,2
2,p,0.570224,0.601966,0.579653,0.590599,GCN,3
3,p,0.564135,0.657658,0.566291,0.608564,GCN,4
4,p,0.589451,0.719083,0.582228,0.643459,GCN,5


In [10]:
# Save the results into the clipboard
results_df.transpose().to_clipboard()