In [22]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.express as px
import json

import torch
from torch_geometric.utils import dense_to_sparse, to_dense_adj
import dgl
from dgl import save_graphs, load_graphs


# Load Data

In [23]:
data_path = "/Users/jl102430/Documents/study/anomaly_detection/data/dynamic/DGraph/DGraphFin/dgraphfin.npz"


In [24]:
data = np.load(data_path)
data

<numpy.lib.npyio.NpzFile at 0x7fdafdcbfa50>

In [25]:
X = data['x']
y = data['y']

edge_index = data['edge_index']
edge_type = data['edge_type']
edge_timestamp = data['edge_timestamp']

train_mask = data['train_mask']
valid_mask = data['valid_mask']
test_mask = data['test_mask']


print(f"""
X shape: {X.shape},
y shape: {y.shape}

edge_index shape: {edge_index.shape}
edge_type shape: {edge_type.shape}
edge_timestamp shape: {edge_timestamp.shape}

train_mask shape: {train_mask.shape}
valid_mask shape: {valid_mask.shape}
test_mask shape: {test_mask.shape}
""")

KeyboardInterrupt: 

In [None]:
edge_index = pd.DataFrame(edge_index, columns=[f"src_id", "dst_id"])
edge_index["edge_type"] = edge_type
edge_index["edge_timestamp"] = edge_timestamp

edge_index = edge_index.sort_values("edge_timestamp")

edge_index

Unnamed: 0,src_id,dst_id,edge_type,edge_timestamp
2556952,2229506,1955006,9,1
2094731,1260099,2257207,9,1
35206,2014056,164472,10,1
3160061,1354445,1295739,9,1
3785647,1711413,599839,10,1
...,...,...,...,...
265866,2685080,3113012,4,821
3290883,3018259,3683536,4,821
1895847,697121,311084,5,821
3688352,2117711,3684148,6,821


In [None]:
def resolve_node_type(df):  # update df in-place
    node_type_feat_idx = 0
    _type_map = {t: i for i, t in enumerate(df[f'feat_{node_type_feat_idx}'].unique())}
    df['node_type'] = df[f'feat_{node_type_feat_idx}'].apply(lambda x: _type_map[x])
    return _type_map

node_feature = pd.DataFrame(X, columns=[f"feat_{i}" for i in range(X.shape[1])])
node_feature['y'] = y
node_feature = node_feature.reset_index().rename(columns={'index': 'node_id'})
node_type_map = resolve_node_type(node_feature)

node_feature

Unnamed: 0,node_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y,node_type
0,0,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,0
1,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3,1
2,2,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,0
3,3,1.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3,2
4,4,1.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3700545,3700545,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,1
3700546,3700546,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,1
3700547,3700547,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,1
3700548,3700548,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,1


In [None]:
node_feature[node_feature['node_id'] == 3683606]

Unnamed: 0,node_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y,node_type
3683606,3683606,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,1


In [None]:
full_data = edge_index.merge(
    node_feature[["node_id", "node_type"]], left_on=["src_id"], right_on=["node_id"]
).rename(columns={"node_type": "src_type"}).merge(
    node_feature[["node_id", "node_type"]], left_on=["dst_id"], right_on=["node_id"]
).rename(columns={"node_type": "dst_type"}).drop(['node_id_x', 'node_id_y'], axis=1)

full_data

Unnamed: 0,src_id,dst_id,edge_type,edge_timestamp,src_type,dst_type
0,2229506,1955006,9,1,1,0
1,2106252,1955006,9,11,2,0
2,159746,1955006,3,571,0,0
3,82861,1955006,5,654,0,0
4,3596335,1955006,9,752,2,0
...,...,...,...,...,...,...
4300994,241780,3683606,1,821,2,1
4300995,3227415,3684274,5,821,0,2
4300996,3684051,673732,4,821,2,2
4300997,3673288,967661,4,821,2,2


In [None]:
(full_data[full_data.index.isin(train_mask)].groupby(["edge_timestamp", "src_type", "edge_type", "dst_type"])[
    ["src_id", "dst_id"]
].count()
# .agg({
#     "src_id": lambda x: list(x),
#     "dst_id": lambda x: list(x)
# })
.reset_index())

Unnamed: 0,edge_timestamp,src_type,edge_type,dst_type,src_id,dst_id
0,1,0,9,0,30,30
1,1,0,9,1,19,19
2,1,0,9,2,93,93
3,1,0,10,0,63,63
4,1,0,10,1,42,42
...,...,...,...,...,...,...
46700,821,2,9,1,8,8
46701,821,2,10,0,2,2
46702,821,2,10,1,12,12
46703,821,2,10,2,3,3


In [None]:
full_data

Unnamed: 0,src_id,dst_id,edge_type,edge_timestamp,src_type,dst_type
0,2229506,1955006,9,1,1,0
1,2106252,1955006,9,11,2,0
2,159746,1955006,3,571,0,0
3,82861,1955006,5,654,0,0
4,3596335,1955006,9,752,2,0
...,...,...,...,...,...,...
4300994,241780,3683606,1,821,2,1
4300995,3227415,3684274,5,821,0,2
4300996,3684051,673732,4,821,2,2
4300997,3673288,967661,4,821,2,2


In [None]:
(full_data[full_data.index.isin(train_mask)].groupby(["edge_timestamp"
                                                      #, "src_type", "edge_type", "dst_type"
                                                      ])[
    ["src_id", "dst_id"]
].max()
# .agg({
#     "src_id": lambda x: list(x),
#     "dst_id": lambda x: list(x)
# })
.reset_index())

Unnamed: 0,edge_timestamp,src_id,dst_id
0,1,2337848,3699211
1,2,2338882,3700440
2,3,2337815,3699731
3,4,2338104,3696691
4,5,2338938,3700456
...,...,...,...
816,817,3679945,3686071
817,818,3681149,3686174
818,819,3682250,3697299
819,820,3683454,3686161


In [None]:
tmp = full_data[full_data.index.isin(train_mask)].groupby(
    'edge_timestamp'
).agg({
    "src_id": lambda x: list(x),
    "dst_id": lambda x: list(x)
}).reset_index()

tmp['node_list'] = tmp['src_id'] + tmp['dst_id']
tmp['node_list'].apply(lambda x: len(set(x)))

0      2654
1      2739
2      2841
3      2811
4      2744
       ... 
816    1146
817    1227
818    1087
819    1147
820    1231
Name: node_list, Length: 821, dtype: int64

In [None]:
tmp = (
    full_data.groupby(["edge_timestamp", "src_type", "edge_type", "dst_type"])[
        ["src_id", "dst_id"]
    ]
    .agg({"src_id": lambda x: list(x), "dst_id": lambda x: list(x)})
    .reset_index()
    .sort_values("edge_timestamp")
)


In [None]:
def resolve_node_list(df):
    _all_nodes = []
    cnt = 0
    for i in tqdm((df["src_id"] + df["dst_id"])):
        _all_nodes = _all_nodes + i
        if cnt % 10000 == 0:
            _all_nodes = list(set(_all_nodes))
        cnt += 1
    return set(_all_nodes)


def resolve_node_mapping_by_types(_node_feature):
    _mapping = {}
    for ntype in _node_feature["node_type"].unique():
        index2node = (
            _node_feature[_node_feature.node_type == ntype]
            .reset_index(drop=True)["node_id"]
            .to_dict()
        )
        node2index = {v: k for k, v in index2node.items()}
        _mapping[ntype] = node2index
    return _mapping


def apply_node_reindex_by_map(_map):
    def apply_reindex(_type, _node_id):
        return _map[_type][_node_id]

    return apply_reindex


def create_dgl_graph(graph_data_dict, num_nodes_dict, node_features):
    g = dgl.heterograph(graph_data_dict, num_nodes_dict=num_nodes_dict)

    for ntype in node_features["node_type"].unique():
        g.nodes[f"v_{ntype}"].data["features"] = torch.tensor(
            node_features[node_features["node_type"] == ntype][
                [f"feat_{i}" for i in range(1, 17)]
            ].values
        )
    return g


def resolve_lables_by_types(node_features):
    node_labels = {}
    for ntype in node_features["node_type"].unique():
        node_labels[f"v_{ntype}"] = torch.tensor(
            node_features[node_features["node_type"] == ntype]["y"].values
        )
    return node_labels

In [None]:
def construct_dgl_dataset(mask, name, save=False):
    """
    Train/Val/Test needs to be re-indexed
    """
    if mask is not None:
        _data = full_data[full_data.index.isin(mask)]
    else:
        _data = full_data

    _tmp_data = (
        _data.groupby(["edge_timestamp"])[["src_id", "dst_id"]]
        .agg({"src_id": lambda x: list(x), "dst_id": lambda x: list(x)})
        .reset_index()
        .sort_values("edge_timestamp")
    )

    print("Reindex all the nodes..")
    node_list = resolve_node_list(_tmp_data)
    _node_feature = node_feature[node_feature["node_id"].isin(node_list)]

    node2id = resolve_node_mapping_by_types(_node_feature)
    node2id_apply = apply_node_reindex_by_map(node2id)

    # node_idx_map = {nid: i for i, nid in enumerate(node_list)}+

    _data["src_id"] = _data.apply(
        lambda x: node2id_apply(x["src_type"], x["src_id"]), axis=1
    )
    _data["dst_id"] = _data.apply(
        lambda x: node2id_apply(x["dst_type"], x["dst_id"]), axis=1
    )
    _node_feature["node_id"] = _node_feature.apply(
        lambda x: node2id_apply(x["node_type"], x["node_id"]), axis=1
    )

    # resolve labels
    node_labels = resolve_lables_by_types(_node_feature)

    print("Agg..")
    graph_data = (
        _data.groupby(["edge_timestamp", "src_type", "edge_type", "dst_type"])[
            ["src_id", "dst_id"]
        ]
        .agg({"src_id": lambda x: list(x), "dst_id": lambda x: list(x)})
        .reset_index()
        .sort_values("edge_timestamp")
    )
    num_nodes_dict = {}
    for _i, (_t, _n) in (
        _node_feature.groupby("node_type")[["node_id"]].count().reset_index().iterrows()
    ):
        num_nodes_dict[f"v_{_t}"] = _n

    g_list = []
    graph_data_dict = {}
    current_ts = -1
    for idx, (
        edge_timestamp,
        src_type,
        edge_type,
        dst_type,
        src_list,
        dst_list,
    ) in tqdm(graph_data.iterrows()):
        # Start a new graph construction
        if (edge_timestamp > current_ts) and (current_ts != -1):
            g = create_dgl_graph(
                graph_data_dict,
                num_nodes_dict=num_nodes_dict,
                node_features=_node_feature,
            )
            g_list.append(g)
            graph_data_dict = {}

        graph_data_dict[(f"v_{src_type}", f"e_{edge_type}", f"v_{dst_type}")] = (
            torch.tensor(src_list),
            torch.tensor(dst_list),
        )
        current_ts = edge_timestamp

    if len(graph_data_dict.keys()) > 0:
        g = create_dgl_graph(
            graph_data_dict, num_nodes_dict=num_nodes_dict, node_features=_node_feature
        )
        g_list.append(g)
    
    if save:
        output_prefix = "../dataset/dgl_format_1"
        print(f'Save to {output_prefix}')
        save_graphs(f"{output_prefix}/dgraph_{name}_dgl.bin", g_list, node_labels)
    return g_list, _data, _node_feature, graph_data_dict



In [None]:

train_graphs, train_data, train_feature, train_node_labels = construct_dgl_dataset(train_mask, name='train', save=True)

valid_graphs, valid_data, valid_feature, valid_node_labels = construct_dgl_dataset(valid_mask, name='valid', save=True)

test_graphs, test_data, test_feature, test_node_labels = construct_dgl_dataset(test_mask, name='test', save=True)
# len(train_graphs)
# len(train_graphs)

# len(train_graphs)

Reindex all the nodes..


100%|██████████| 821/821 [00:14<00:00, 55.20it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Agg..


81it [00:01, 48.81it/s]


Reindex all the nodes..


100%|██████████| 821/821 [00:02<00:00, 365.82it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Agg..


114it [00:00, 283.06it/s]


Reindex all the nodes..


100%|██████████| 821/821 [00:02<00:00, 317.26it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Agg..


88it [00:00, 317.01it/s]
