In [10]:
import pandas as pd
import numpy as np

In [11]:
from Config.config import CONFIG
CONFIG = CONFIG("Reddit")

In [12]:
from DyGLib.utils.DataLoader import get_link_prediction_data

In [13]:
node_raw_features, edge_raw_features, full_data, train_data, val_data, test_data = get_link_prediction_data(
                            val_ratio=CONFIG.train.val_ratio, 
                            test_ratio=CONFIG.train.test_ratio, 
                            node_dim=CONFIG.model.node_dim)


graph_df_train = pd.DataFrame({"u": train_data.src_node_ids, "i":train_data.dst_node_ids})
graph_df_val = pd.DataFrame({"u": val_data.src_node_ids, "i":val_data.dst_node_ids})
graph_df_test = pd.DataFrame({"u": test_data.src_node_ids, "i":test_data.dst_node_ids})
graph_df = full_data.dataset if full_data.dataset is not None else pd.DataFrame()

The dataset has 672447 interactions, involving 10984 different nodes
The training dataset has 389989 interactions, involving 9574 different nodes
The validation dataset has 100867 interactions, involving 9839 different nodes
The test dataset has 100867 interactions, involving 9615 different nodes
The new node validation dataset has 19446 interactions, involving 3491 different nodes
The new node test dataset has 21470 interactions, involving 3515 different nodes
1098 nodes were used for the inductive testing, i.e. are never seen during training


In [14]:
nodes_train = np.unique(np.concat([graph_df_train.u.to_numpy(), graph_df_train.u.to_numpy()]))
nodes_val = np.unique(np.concat([graph_df_val.u.to_numpy(), graph_df_val.u.to_numpy()]))
nodes_test = np.unique(np.concat([graph_df_test.u.to_numpy(), graph_df_test.u.to_numpy()]))
nodes_total = np.unique(np.concat([graph_df.u.to_numpy(), graph_df.u.to_numpy()]))

In [15]:
print(f"Num. nodes (train): {nodes_train.shape[0]}")
print(f"Num. nodes (val): {nodes_val.shape[0]}")
print(f"Num. nodes (test): {nodes_test.shape[0]}")
print(f"Num. nodes (total): {nodes_total.shape[0]}")

Num. nodes (train): 8706
Num. nodes (val): 8879
Num. nodes (test): 8656
Num. nodes (total): 10000


In [16]:
print(f"Num. edges (train): {graph_df_train.shape[0]}")
print(f"Num. edges (val): {graph_df_val.shape[0]}")
print(f"Num. edges (test): {graph_df_test.shape[0]}")
print(f"Num. edges (total): {graph_df.shape[0]}")

Num. edges (train): 389989
Num. edges (val): 100867
Num. edges (test): 100867
Num. edges (total): 672447


In [17]:
avg_deg_train = graph_df_train[['u', 'i']].groupby('i').count().mean().values[0]
avg_deg_val = pd.concat([graph_df_train, graph_df_val])[['u', 'i']].groupby('i').count().mean().values[0]
avg_deg_test = pd.concat([graph_df_train, graph_df_val, graph_df_test])[['u', 'i']].groupby('i').count().mean().values[0]
avg_deg_total = graph_df[['u', 'i']].groupby('i').count().mean().values[0]

print(f"Avg. degree (train): {avg_deg_train}")
print(f"Avg. degree (val): {avg_deg_val}")
print(f"Avg. degree (test): {avg_deg_test}")
print(f"Avg. degree (total): {avg_deg_total}")

Avg. degree (train): 449.29608294930875
Avg. degree (val): 501.38508682328904
Avg. degree (test): 601.344512195122
Avg. degree (total): 683.3810975609756


In [18]:
text = f"""
\\parbox[t]{{2mm}}{{\multirow{{4}}{{*}}{{\\rotatebox[origin=c]{{90}}{{Reddit}}}}}} & Train & {nodes_train.shape[0]} & {graph_df_train.shape[0]} & {round(avg_deg_train,3)} \\\\
& Validation & {nodes_val.shape[0]} & {graph_df_val.shape[0]} & {round(avg_deg_val,3)} \\\\
& Test & {nodes_test.shape[0]} & {graph_df_test.shape[0]} & {round(avg_deg_test,3)} \\\\
& Total & {nodes_total.shape[0]} & {graph_df.shape[0]} & {round(avg_deg_total,3)} \\\\
\\midrule
"""

print(text)


\parbox[t]{2mm}{\multirow{4}{*}{\rotatebox[origin=c]{90}{Reddit}}} & Train & 8706 & 389989 & 449.296 \\
& Validation & 8879 & 100867 & 501.385 \\
& Test & 8656 & 100867 & 601.345 \\
& Total & 10000 & 672447 & 683.381 \\
\midrule

