### Loading the libraries

In [1]:
import io
import json
import math
import warnings
from pathlib import Path

import compress_pickle as pickle
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold
from spektral.data import Dataset, Graph
from spektral.transforms.normalize_adj import NormalizeAdj

# Disable warnings
warnings.filterwarnings("ignore")

2023-02-07 10:34:06.436896: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-07 10:34:06.570541: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-07 10:34:07.104990: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-07 10:34:07.105040: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

### Setting the model hyperparameters

In [2]:
# Set random seed for reproducibility
seed = 42
np.random.seed(seed)

### Loading the dataset in spektral data format

In [3]:
# Don't change this
remake_dataset = True

# Select the dataset to use out of the following: p, ds, cs
dataset_id = "p"
data_dir = Path(f'../data/communication_networks/{dataset_id}')

# Load the datasets
users_df = pd.read_csv(data_dir / 'metadata/users.csv')
questions_df = pd.read_csv(data_dir / 'metadata/questions.csv')
answers_df = pd.read_csv(data_dir / 'metadata/answers.csv')
comments_df = pd.read_csv(data_dir / 'metadata/comments.csv')

In [4]:
# Load the embeddings of the nodes
def get_embeddings(node):
    node_type = [0.0, 0.0, 0.0, 0.0]
    if node[0:2] == 'q_':
        embd = questions_df[questions_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[0] = 1.0
    elif node[0:2] == 'a_':
        embd = answers_df[answers_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[1] = 1.0
    elif node[0:2] == 'c_':
        embd = comments_df[comments_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[2] = 1.0
    elif node[0:2] == 'u_':
        embd = users_df[users_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[3] = 1.0

        # Some users have no records in the users table
        # Use the embeddings of '' as a placeholder
        if len(embd) == 0:
            embd = users_df[users_df['Id'] == 8]['embeddings'].values

    return json.loads(embd[0])


# Create a custom dataset for spektral
class CustomDataset(Dataset):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def read(self):

        def make_graphs():
            graphs = []
            labels = []

            for counter, graph_fname in enumerate((data_dir / 'graphs').glob('*.csv')):

                try:
                    nodes = set()
                    edges = set()
                    q_id = int(graph_fname.parts[-1].split('.')[0][1:])

                    if math.isnan(questions_df[questions_df['Id'] == q_id]['AcceptedAnswerId'].values[0]):
                        label = [1, 0]
                        labels.append(0)
                    else:
                        labels.append(1)
                        label = [0, 1]

                    with io.open(graph_fname, 'r') as f:
                        for line in f:
                            a, b = line.strip().split(',')
                            nodes.add(a)
                            nodes.add(b)
                            edges.add((a, b))

                    nodes = list(nodes)
                    encoded_nodes = dict()
                    for i in range(len(nodes)):
                        encoded_nodes[nodes[i]] = i

                    encoded_edges = []
                    for e in edges:
                        encoded_edges.append((encoded_nodes[e[0]], encoded_nodes[e[1]]))

                    node_features = []
                    for node in nodes:
                        node_features.append(get_embeddings(node))

                    node_features = np.array(node_features)

                    nodes = [encoded_nodes[n] for n in nodes]

                    G = nx.Graph()
                    G.add_nodes_from(nodes)
                    G.add_edges_from(edges)

                    spektral_graph = Graph(x=node_features, a=nx.adjacency_matrix(G, nodelist=nodes), y=label)

                except Exception as e:
                    print(e)
                    # print(traceback.format_exc())
                    continue
                else:
                    graphs.append(spektral_graph)
                finally:
                    if counter > 1000:
                        # break
                        pass

            p1labels = sum(labels) / len(labels)
            p0labels = 1 - p1labels

            print(f"0: {p0labels}; 1: {p1labels}")

            return graphs

        return make_graphs()


# If the remake_dataset flag is set to True, the dataset is created from scratch and saved in a pickle file otherwise it is loaded from the pickle file
if remake_dataset:
    data = CustomDataset(transforms=NormalizeAdj())
    with io.open(data_dir / 'spektral/data.pkl', 'wb') as f:
        pickle.dump(data, f)
else:
    with io.open(data_dir / 'spektral/data.pkl', 'rb') as f:
        data = pickle.load(f)

0: 0.48485615456002695; 1: 0.515143845439973


### Train and evaluate the model

In [5]:
# Create the X and y arrays
X = data

determine_label = lambda x: 0 if x == [1, 0] else 1
y = [determine_label(i.y) for i in data]


# Print the class ratio
def print_class_ratio(y, ds_name="full"):
    l1 = sum(y) / len(y)
    l0 = 1 - l1
    print(f"Dataset: {ds_name}; 0: {l0:.2f}; 1: {l1:.2f}")


print_class_ratio(y)

results = []

# Use stratified k-fold cross validation to evaluate the model on the dataset
skf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for i, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")

    X_train = [i.x[0] for i in data[train_index]]
    y_train = [determine_label(i.y) for i in data[train_index]]
    print_class_ratio(y_train, ds_name="train")

    X_val = [i.x[0] for i in data[val_index]]
    y_val = [determine_label(i.y) for i in data[val_index]]
    print_class_ratio(y_val, ds_name="val")

    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)

    acc = clf.score(X_val, y_val)

    y_hat = clf.predict(X_val)

    report = classification_report(y_val, y_hat)
    print(report)

    rec = recall_score(y_val, y_hat)
    prec = precision_score(y_val, y_hat)
    f1 = f1_score(y_val, y_hat)

    results.append((acc, rec, prec, f1))

    print(f"Accuracy on validation: {acc}")

    print(100 * "==")

Dataset: full; 0: 0.48; 1: 0.52
Fold 0:
Dataset: train; 0: 0.48; 1: 0.52
Dataset: val; 0: 0.48; 1: 0.52
              precision    recall  f1-score   support

           0       0.51      0.53      0.52      1149
           1       0.54      0.52      0.53      1222

    accuracy                           0.52      2371
   macro avg       0.52      0.52      0.52      2371
weighted avg       0.52      0.52      0.52      2371

Accuracy on validation: 0.5221425558835934
Fold 1:
Dataset: train; 0: 0.48; 1: 0.52
Dataset: val; 0: 0.49; 1: 0.51
              precision    recall  f1-score   support

           0       0.53      0.54      0.54      1150
           1       0.56      0.56      0.56      1221

    accuracy                           0.55      2371
   macro avg       0.55      0.55      0.55      2371
weighted avg       0.55      0.55      0.55      2371

Accuracy on validation: 0.5474483340362716
Fold 2:
Dataset: train; 0: 0.48; 1: 0.52
Dataset: val; 0: 0.49; 1: 0.51
            

### Saving the experiment results

In [6]:
# Save the results in a dataframe
results_dic = {"dataset": dataset_id, "acc": [i[0] for i in results], "rec": [i[1] for i in results],
               "prec": [i[2] for i in results], "f1": [i[3] for i in results], "method": "LogReg",
               "fold": [i for i in range(1, 6)]}

results_df = pd.DataFrame(results_dic)
results_df

Unnamed: 0,dataset,acc,rec,prec,f1,method,fold
0,p,0.522143,0.515548,0.538002,0.526536,LogReg,1
1,p,0.547448,0.556921,0.561056,0.558981,LogReg,2
2,p,0.530156,0.545455,0.543673,0.544563,LogReg,3
3,p,0.518987,0.464373,0.538462,0.498681,LogReg,4
4,p,0.522785,0.515971,0.538462,0.526976,LogReg,5


In [7]:
# Save the results into the clipboard
results_df.transpose().to_clipboard()