In [1]:
import pandas as pd
import numpy as np

In [2]:
from Config.config import CONFIG
CONFIG = CONFIG("MOOC")

In [3]:
from DyGLib.utils.DataLoader import get_link_prediction_data

In [4]:
node_raw_features, edge_raw_features, full_data, train_data, val_data, test_data = get_link_prediction_data(
                            val_ratio=CONFIG.train.val_ratio, 
                            test_ratio=CONFIG.train.test_ratio, 
                            node_dim=CONFIG.model.node_dim)


graph_df_train = pd.DataFrame({"u": train_data.src_node_ids, "i":train_data.dst_node_ids})
graph_df_val = pd.DataFrame({"u": val_data.src_node_ids, "i":val_data.dst_node_ids})
graph_df_test = pd.DataFrame({"u": test_data.src_node_ids, "i":test_data.dst_node_ids})
graph_df = full_data.dataset if full_data.dataset is not None else pd.DataFrame()

The dataset has 411749 interactions, involving 7144 different nodes
The training dataset has 227485 interactions, involving 6015 different nodes
The validation dataset has 61762 interactions, involving 2599 different nodes
The test dataset has 61763 interactions, involving 2412 different nodes
The new node validation dataset has 25592 interactions, involving 2333 different nodes
The new node test dataset has 29179 interactions, involving 2181 different nodes
714 nodes were used for the inductive testing, i.e. are never seen during training


In [5]:
nodes_train = np.unique(np.concat([graph_df_train.u.to_numpy(), graph_df_train.u.to_numpy()]))
nodes_val = np.unique(np.concat([graph_df_val.u.to_numpy(), graph_df_val.u.to_numpy()]))
nodes_test = np.unique(np.concat([graph_df_test.u.to_numpy(), graph_df_test.u.to_numpy()]))
nodes_total = np.unique(np.concat([graph_df.u.to_numpy(), graph_df.u.to_numpy()]))

In [6]:
print(f"Num. nodes (train): {nodes_train.shape[0]}")
print(f"Num. nodes (val): {nodes_val.shape[0]}")
print(f"Num. nodes (test): {nodes_test.shape[0]}")
print(f"Num. nodes (total): {nodes_total.shape[0]}")

Num. nodes (train): 5952
Num. nodes (val): 2514
Num. nodes (test): 2315
Num. nodes (total): 7047


In [7]:
print(f"Num. edges (train): {graph_df_train.shape[0]}")
print(f"Num. edges (val): {graph_df_val.shape[0]}")
print(f"Num. edges (test): {graph_df_test.shape[0]}")
print(f"Num. edges (total): {graph_df.shape[0]}")

Num. edges (train): 227485
Num. edges (val): 61762
Num. edges (test): 61763
Num. edges (total): 411749


In [8]:
avg_deg_train = graph_df_train[['u', 'i']].groupby('i').count().mean().values[0]
avg_deg_val = pd.concat([graph_df_train, graph_df_val])[['u', 'i']].groupby('i').count().mean().values[0]
avg_deg_test = pd.concat([graph_df_train, graph_df_val, graph_df_test])[['u', 'i']].groupby('i').count().mean().values[0]
avg_deg_total = graph_df[['u', 'i']].groupby('i').count().mean().values[0]

print(f"Avg. degree (train): {avg_deg_train}")
print(f"Avg. degree (val): {avg_deg_val}")
print(f"Avg. degree (test): {avg_deg_test}")
print(f"Avg. degree (total): {avg_deg_total}")

Avg. degree (train): 3610.873015873016
Avg. degree (val): 3402.9058823529413
Avg. degree (test): 3618.659793814433
Avg. degree (total): 4244.835051546392


In [9]:
text = f"""
\\parbox[t]{{2mm}}{{\multirow{{4}}{{*}}{{\\rotatebox[origin=c]{{90}}{{MOOC}}}}}} & Train & {nodes_train.shape[0]} & {graph_df_train.shape[0]} & {round(avg_deg_train,3)} \\\\
& Validation & {nodes_val.shape[0]} & {graph_df_val.shape[0]} & {round(avg_deg_val,3)} \\\\
& Test & {nodes_test.shape[0]} & {graph_df_test.shape[0]} & {round(avg_deg_test,3)} \\\\
& Total & {nodes_total.shape[0]} & {graph_df.shape[0]} & {round(avg_deg_total,3)} \\\\
\\midrule
"""

print(text)


\parbox[t]{2mm}{\multirow{4}{*}{\rotatebox[origin=c]{90}{MOOC}}} & Train & 5952 & 227485 & 3610.873 \\
& Validation & 2514 & 61762 & 3402.906 \\
& Test & 2315 & 61763 & 3618.66 \\
& Total & 7047 & 411749 & 4244.835 \\
\midrule

