In [36]:
import os

import numpy as np
import scipy.sparse as ssp
from tqdm import tqdm

In [37]:
# Location of the raw DGraphFin archive. Set the DGRAPHFIN_PATH environment
# variable to point at a local copy; the fallback is the machine-specific
# path this notebook was originally run with.
data_path = os.environ.get(
    "DGRAPHFIN_PATH",
    "/Users/jl102430/Documents/study/anomaly_detection/data/dynamic/DGraph/DGraphFin/dgraphfin.npz",
)

# Not referenced in the cells visible here; kept for any later cell that uses it.
output_prefix = "../dataset/dgl_format_1"
# %%
# np.load on an .npz returns a lazy archive: arrays are read when indexed below.
data = np.load(data_path)

# %%
# Node-level arrays.
X = data["x"]
y = data["y"]

# Edge-level arrays (one row / entry per edge).
edge_index = data["edge_index"]
edge_type = data["edge_type"]
edge_timestamp = data["edge_timestamp"]

# Split arrays. Their lengths differ from both the node and the edge count,
# so they hold integer indices rather than boolean flags.
train_mask = data["train_mask"]
valid_mask = data["valid_mask"]
test_mask = data["test_mask"]

# Fail early if the archive is internally inconsistent.
assert X.shape[0] == y.shape[0], "x and y must describe the same number of nodes"
assert (
    edge_index.shape[0] == edge_type.shape[0] == edge_timestamp.shape[0]
), "edge_index, edge_type and edge_timestamp must describe the same number of edges"

print(
    f"""
X shape: {X.shape},
y shape: {y.shape}

edge_index shape: {edge_index.shape}
edge_type shape: {edge_type.shape}
edge_timestamp shape: {edge_timestamp.shape}

train_mask shape: {train_mask.shape}
valid_mask shape: {valid_mask.shape}
test_mask shape: {test_mask.shape}
"""
)



X shape: (3700550, 17),
y shape: (3700550,)

edge_index shape: (4300999, 2)
edge_type shape: (4300999,)
edge_timestamp shape: (4300999,)

train_mask shape: (857899,)
valid_mask shape: (183862,)
test_mask shape: (183840,)



In [42]:
def reindex_graph_dataset(
    edge_index, node_feature, node_label, edge_type, edge_timestamp, mask, name
):
    """Select a subset of edges, compact the node ids, and export the result.

    Steps:
      1. Pick the edges addressed by ``mask`` and shift their timestamps by -1
         (presumably to turn 1-based timestamps into 0-based ones -- confirm).
      2. Sort the selected edges by timestamp (stable, so edges sharing a
         timestamp keep their relative order).
      3. Relabel every node that appears in a selected edge with a dense id in
         ``0 .. n_nodes - 1`` (ascending order of the original id) and gather
         the matching feature rows and labels.
      4. Build one ``(n_nodes, n_nodes)`` sparse adjacency matrix per distinct
         timestamp; duplicate edges inside a snapshot are summed.
      5. Print a shape summary and write the edge list to
         ``dgrah_processed_{name}_data.txt`` in the current working directory
         (the "dgrah" spelling is kept so existing consumers still find the
         file).

    Args:
        edge_index: ``(n_edges, 2)`` integer array of ``(src, dst)`` node ids.
        node_feature: array indexed by original node id along axis 0.
        node_label: array indexed by original node id along axis 0.
        edge_type: per-edge array aligned with ``edge_index``.
        edge_timestamp: per-edge array aligned with ``edge_index``.
        mask: index (or boolean) array applied to axis 0 of the three edge
            arrays.
            NOTE(review): the notebook passes ``train_mask`` / ``valid_mask``
            here; if those hold node ids rather than edge rows (as the dataset
            naming suggests), this selects edges by node id -- confirm this is
            intended.
        name: tag inserted into the output file name.

    Returns:
        Tuple ``(reindex_edge_index, masked_node_feature, masked_node_label,
        masked_edge_type, masked_edge_timestamp, net, node2id)`` where ``net``
        is a 1-D object array of ``scipy.sparse.csc_matrix`` snapshots ordered
        by timestamp and ``node2id`` maps original node id -> dense id.
    """
    masked_edge_index = edge_index[mask]
    masked_edge_type = edge_type[mask]
    masked_edge_timestamp = edge_timestamp[mask] - 1

    # A stable sort makes the order of same-timestamp edges deterministic.
    sorted_index = np.argsort(masked_edge_timestamp, kind="stable")
    masked_edge_index = masked_edge_index[sorted_index]
    masked_edge_type = masked_edge_type[sorted_index]
    masked_edge_timestamp = masked_edge_timestamp[sorted_index]

    # Sorted unique original ids; position in this array is the new dense id.
    node_list = np.unique(masked_edge_index)
    node2id = {n: i for i, n in enumerate(node_list)}

    # Every endpoint is present in node_list, so a binary search returns its
    # exact position -- the vectorised equivalent of looking up node2id.
    reindex_edge_index = np.searchsorted(node_list, masked_edge_index).astype(
        masked_edge_index.dtype
    )

    masked_node_feature = node_feature[node_list]
    masked_node_label = node_label[node_list]

    num_nodes = masked_node_feature.shape[0]
    num_edges = reindex_edge_index.shape[0]

    # Edges are sorted by timestamp, so each snapshot is one contiguous slice
    # starting at the first occurrence of its timestamp.
    unique_ts, first_pos = np.unique(masked_edge_timestamp, return_index=True)
    end_pos = np.append(first_pos[1:], num_edges)

    # Explicit object array: keeps shape (n_snapshots,) for any snapshot count.
    net = np.empty(len(unique_ts), dtype=object)
    for k in tqdm(range(len(unique_ts))):
        net_edge_index = reindex_edge_index[first_pos[k] : end_pos[k]]
        net[k] = ssp.csc_matrix(
            (
                np.ones(net_edge_index.shape[0]),
                (net_edge_index[:, 0], net_edge_index[:, 1]),
            ),
            shape=(num_nodes, num_nodes),
        )

    print(
        f"""
          reindex_edge_index: {reindex_edge_index.shape}
          masked_edge_type: {masked_edge_type.shape}
          masked_edge_timestamp: {masked_edge_timestamp.shape}
          unique nodes: {node_list.shape}
          masked_node_feature: {masked_node_feature.shape}
          masked_node_label: {masked_node_label.shape}
          net: {net.shape}
          """
    )

    # Edge-list export: two '%' header lines, then "src dst 1 timestamp" rows.
    with open(f"dgrah_processed_{name}_data.txt", "w") as f:
        f.write("% asym positive\n")
        f.write(f"% {num_edges} {num_nodes} {num_nodes}\n")
        for (src, dst), ts in tqdm(
            zip(reindex_edge_index, masked_edge_timestamp), total=num_edges
        ):
            f.write(f"{src} {dst} 1 {ts}\n")

    return (
        reindex_edge_index,
        masked_node_feature,
        masked_node_label,
        masked_edge_type,
        masked_edge_timestamp,
        net,
        node2id,
    )

In [43]:
# Re-index the edges selected by train_mask. Besides returning the arrays
# below, the call prints a shape summary and writes
# dgrah_processed_train_data.txt to the current working directory.
(train_edge_index, train_node_feature, train_node_label,
 train_edge_type, train_edge_timestamp,
 train_net, train_node2id) = reindex_graph_dataset(
    edge_index=edge_index,
    node_feature=X,
    node_label=y,
    edge_type=edge_type,
    edge_timestamp=edge_timestamp,
    mask=train_mask,
    name="train",
)

100%|██████████| 821/821 [00:16<00:00, 48.95it/s]



          reindex_edge_index: (857899, 2)
          masked_edge_type: (857899,)
          masked_edge_timestamp: (857899,)
          unique nodes: (1367190,)
          masked_node_feature: (1367190, 17)
          masked_node_label: (1367190,)
          net: (821,)
          


857899it [00:03, 230401.49it/s]


In [44]:
# Re-index the edges selected by valid_mask. Besides returning the arrays
# below, the call prints a shape summary and writes
# dgrah_processed_valid_data.txt to the current working directory.
(valid_edge_index, valid_node_feature, valid_node_label,
 valid_edge_type, valid_edge_timestamp,
 valid_net, valid_node2id) = reindex_graph_dataset(
    edge_index=edge_index,
    node_feature=X,
    node_label=y,
    edge_type=edge_type,
    edge_timestamp=edge_timestamp,
    mask=valid_mask,
    name="valid",
)

100%|██████████| 821/821 [00:03<00:00, 251.90it/s]



          reindex_edge_index: (183862, 2)
          masked_edge_type: (183862,)
          masked_edge_timestamp: (183862,)
          unique nodes: (348259,)
          masked_node_feature: (348259, 17)
          masked_node_label: (348259,)
          net: (821,)
          


183862it [00:00, 241241.29it/s]
