### Loading the libraries

In [1]:
import io
import json
import math
import warnings
from pathlib import Path

import compress_pickle as pickle
import networkx as nx
import numpy as np
import pandas as pd
from datasets import Dataset as HFDataset
from datasets import load_metric
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitTrainer, SetFitModel
from setfit import sample_dataset
from sklearn.model_selection import StratifiedKFold
from spektral.data import Graph, Dataset
from spektral.transforms.normalize_adj import NormalizeAdj

# Disable warnings
warnings.filterwarnings("ignore")

2023-02-07 10:12:33.135893: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-07 10:12:33.205808: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-07 10:12:33.544938: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-07 10:12:33.544989: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

### Setting the model hyperparameters

In [2]:
# Set the random seed for reproducibility
seed = 42
np.random.seed(seed)

# Select the dataset to use out of the following: p, ds, cs
dataset_id = "cs"
data_dir = Path(f'../data/communication_networks/{dataset_id}')

# Don't change this
remake_dataset = True

# Maximum sequence length for the sentence transformer model
max_length = 256

# Number of samples per class for the few-shot learning
num_shots = 20

### Loading the dataset in spektral data format

In [3]:
# Load the datasets
users_df = pd.read_csv(data_dir / 'metadata/users.csv')
questions_df = pd.read_csv(data_dir / 'metadata/questions.csv')
answers_df = pd.read_csv(data_dir / 'metadata/answers.csv')
comments_df = pd.read_csv(data_dir / 'metadata/comments.csv')

In [4]:
# Load the embeddings of the nodes
def get_embeddings(node):
    node_type = [0.0, 0.0, 0.0, 0.0]
    if node[0:2] == 'q_':
        embd = questions_df[questions_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[0] = 1.0
    elif node[0:2] == 'a_':
        embd = answers_df[answers_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[1] = 1.0
    elif node[0:2] == 'c_':
        embd = comments_df[comments_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[2] = 1.0
    elif node[0:2] == 'u_':
        embd = users_df[users_df['Id'] == int(node[2:])]['embeddings'].values
        node_type[3] = 1.0

        # Some users have no records in the users table
        # Use the embeddings of '' as a placeholder
        if len(embd) == 0:
            embd = users_df[users_df['Id'] == 8]['embeddings'].values

    return json.loads(embd[0])


# Create a custom dataset for spektral
class CustomDataset(Dataset):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    q_ids = []

    def read(self):

        def make_graphs():
            graphs = []
            labels = []

            for counter, graph_fname in enumerate((data_dir / 'graphs').glob('*.csv')):

                try:
                    nodes = set()
                    edges = set()
                    q_id = int(graph_fname.parts[-1].split('.')[0][1:])

                    self.q_ids.append(q_id)

                    if math.isnan(questions_df[questions_df['Id'] == q_id]['AcceptedAnswerId'].values[0]):
                        label = [1, 0]
                        labels.append(0)
                    else:
                        labels.append(1)
                        label = [0, 1]

                    with io.open(graph_fname, 'r') as f:
                        for line in f:
                            a, b = line.strip().split(',')
                            nodes.add(a)
                            nodes.add(b)
                            edges.add((a, b))

                    nodes = list(nodes)
                    encoded_nodes = dict()
                    for i in range(len(nodes)):
                        encoded_nodes[nodes[i]] = i

                    encoded_edges = []
                    for e in edges:
                        encoded_edges.append((encoded_nodes[e[0]], encoded_nodes[e[1]]))

                    node_features = []
                    for node in nodes:
                        node_features.append(get_embeddings(node))

                    node_features = np.array(node_features)

                    nodes = [encoded_nodes[n] for n in nodes]

                    G = nx.Graph()
                    G.add_nodes_from(nodes)
                    G.add_edges_from(edges)

                    spektral_graph = Graph(x=node_features, a=nx.adjacency_matrix(G, nodelist=nodes), y=label)

                except Exception as e:
                    print(e)
                    # print(traceback.format_exc())
                    continue
                else:
                    graphs.append(spektral_graph)
                finally:
                    if counter > 1000:
                        # break
                        pass

            p1labels = sum(labels) / len(labels)
            p0labels = 1 - p1labels

            print(f"0: {p0labels:.2f}; 1: {p1labels:.2f}")

            return graphs

        return make_graphs()


# If the remake_dataset flag is set to True, the dataset is created from scratch and saved in a pickle file otherwise it is loaded from the pickle file
if remake_dataset:
    data = CustomDataset(transforms=NormalizeAdj())
    with io.open(data_dir / f'spektral/data_fshot_{dataset_id}.pkl', 'wb') as f:
        pickle.dump(data, f)
else:
    with io.open(data_dir / f'spektral/data_fshot_{dataset_id}.pkl', 'rb') as f:
        data = pickle.load(f)

0: 0.54; 1: 0.46


### Train and evaluate the model

In [5]:
# Compute the evaluation metrics
def compute_metrics(y_hat, y_real):
    metric0 = load_metric("accuracy")
    metric1 = load_metric("precision")
    metric2 = load_metric("recall")
    metric3 = load_metric("f1")

    predictions, labels = y_real, y_hat

    return list(metric0.compute(predictions=predictions, references=labels).values())[0], \
        list(metric1.compute(predictions=predictions, references=labels).values())[0], \
        list(metric2.compute(predictions=predictions, references=labels).values())[0], \
        list(metric3.compute(predictions=predictions, references=labels).values())[0]

In [6]:
# Create the X and y arrays
X = np.array(data.q_ids)

determine_label = lambda x: 0 if x == [1, 0] else 1
y = [determine_label(i.y) for i in data]

print(len(X), len(y))

# Print the class ratio
def print_class_ratio(y, ds_name="full"):
    l1 = sum(y) / len(y)
    l0 = 1 - l1
    print(f"Dataset: {ds_name}; 0: {l0:.2f}; 1: {l1:.2f}")


print_class_ratio(y)

results = []

# Use stratified k-fold cross validation to evaluate the model on the dataset
skf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for i, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")

    X_train = questions_df[questions_df['Id'].isin(X[train_index])]
    X_train["text"] = X_train["text1"].values + ". " + X_train["text2"].values
    X_train["label"] = [determine_label(i.y) for i in data[train_index]]
    print_class_ratio(X_train["label"], ds_name="train")

    X_val = questions_df[questions_df['Id'].isin(X[val_index])]
    X_val["text"] = X_val["text1"].values + ". " + X_val["text2"].values
    X_val["label"] = [determine_label(i.y) for i in data[val_index]]
    print_class_ratio(X_val["label"], ds_name="val")

    # Try to load the model from the cache first then download it from the HuggingFace
    try:
        model = SetFitModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2",
                                            cache_dir="/tmp/", local_files_only=True)
    except:
        model = SetFitModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2",
                                            cache_dir="/tmp/", local_files_only=False)

    model.max_seq_length = max_length

    train_dataset = sample_dataset(HFDataset.from_pandas(X_train[["text", "label"]]),
                                   label_column="label", num_samples=num_shots, seed=seed)

    # Create trainer
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=HFDataset.from_pandas(X_val[["text", "label"]]),
        loss_class=CosineSimilarityLoss,
        #metric="accuracy",
        metric=compute_metrics,
        batch_size=16,
        num_iterations=20,  # The number of text pairs to generate for contrastive learning
        num_epochs=1,  # The number of epochs to use for contrastive learning
        column_mapping={"text": "text", "label": "label"},  # Map dataset columns to text/label expected by trainer
    )

    # Train and evaluate
    trainer.train()
    metrics = trainer.evaluate()
    print(metrics)

    acc, rec, prec, f1 = metrics

    results.append((acc, rec, prec, f1))

    print(f"Accuracy on validation: {acc}")

    print(100 * "==")

39794 39794
Dataset: full; 0: 0.54; 1: 0.46
Fold 0:
Dataset: train; 0: 0.54; 1: 0.46
Dataset: val; 0: 0.54; 1: 0.46


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


  0%|          | 0/32 [00:00<?, ?ba/s]

  0%|          | 0/32 [00:00<?, ?ba/s]

Applying column mapping to training dataset
***** Running training *****
  Num examples = 1600
  Num epochs = 1
  Total optimization steps = 100
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


(0.4900113079532605, 0.5606476399560922, 0.4538991335258831, 0.5016574585635359)
Accuracy on validation: 0.4900113079532605
Fold 1:
Dataset: train; 0: 0.54; 1: 0.46
Dataset: val; 0: 0.54; 1: 0.46


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


  0%|          | 0/32 [00:00<?, ?ba/s]

  0%|          | 0/32 [00:00<?, ?ba/s]

Applying column mapping to training dataset
***** Running training *****
  Num examples = 1600
  Num epochs = 1
  Total optimization steps = 100
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


(0.5003141098127906, 0.4777716794731065, 0.4563564875491481, 0.4668186083925459)
Accuracy on validation: 0.5003141098127906
Fold 2:
Dataset: train; 0: 0.54; 1: 0.46
Dataset: val; 0: 0.54; 1: 0.46


  0%|          | 0/32 [00:00<?, ?ba/s]

  0%|          | 0/32 [00:00<?, ?ba/s]

Applying column mapping to training dataset
***** Running training *****
  Num examples = 1600
  Num epochs = 1
  Total optimization steps = 100
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


(0.5048372911169745, 0.4964324917672887, 0.46206896551724136, 0.4786347400449795)
Accuracy on validation: 0.5048372911169745
Fold 3:
Dataset: train; 0: 0.54; 1: 0.46
Dataset: val; 0: 0.54; 1: 0.46


  0%|          | 0/32 [00:00<?, ?ba/s]

  0%|          | 0/32 [00:00<?, ?ba/s]

Applying column mapping to training dataset
***** Running training *****
  Num examples = 1600
  Num epochs = 1
  Total optimization steps = 100
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


(0.5059680864430205, 0.5148188803512623, 0.4643564356435644, 0.48828735033836546)
Accuracy on validation: 0.5059680864430205
Fold 4:
Dataset: train; 0: 0.54; 1: 0.46
Dataset: val; 0: 0.54; 1: 0.46


  0%|          | 0/32 [00:00<?, ?ba/s]

  0%|          | 0/32 [00:00<?, ?ba/s]

Applying column mapping to training dataset
***** Running training *****
  Num examples = 1600
  Num epochs = 1
  Total optimization steps = 100
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


(0.5095501382256848, 0.4260225089212188, 0.46135552913198574, 0.4429855858427287)
Accuracy on validation: 0.5095501382256848


### Saving the experiment results

In [7]:
# Save the results in a dataframe
results_dic = {"dataset": dataset_id, "acc": [i[0] for i in results], "rec": [i[1] for i in results],
               "prec": [i[2] for i in results], "f1": [i[3] for i in results], "method": f"FewShot({num_shots})",
               "fold": [i for i in range(1, 6)]}

results_df = pd.DataFrame(results_dic)
results_df

Unnamed: 0,dataset,acc,rec,prec,f1,method,fold
0,cs,0.490011,0.560648,0.453899,0.501657,FewShot(20),1
1,cs,0.500314,0.477772,0.456356,0.466819,FewShot(20),2
2,cs,0.504837,0.496432,0.462069,0.478635,FewShot(20),3
3,cs,0.505968,0.514819,0.464356,0.488287,FewShot(20),4
4,cs,0.50955,0.426023,0.461356,0.442986,FewShot(20),5


In [8]:
# Save the results into the clipboard
results_df.transpose().to_clipboard()