In [106]:
import numpy as np
import scipy.sparse as ssp
from tqdm import tqdm

In [3]:
data_path = "../detection/data/acc_email.npy"

In [25]:
net = np.load(data_path, allow_pickle=True)
net.shape

(20,)

In [27]:
net[0]

<2029x2029 sparse matrix of type '<class 'numpy.float64'>'
	with 416 stored elements in Compressed Sparse Row format>

In [21]:
f = np.load("../detection/data/email0.01.npz")

train_pos_id, train_neg_id, test_pos_id, test_neg_id = (
    f["train_pos_id"],
    f["train_neg_id"],
    f["test_pos_id"],
    f["test_neg_id"],
)

train_pos, train_neg, test_pos, test_neg = (
    f["train_pos"],
    f["train_neg"],
    f["test_pos"],
    f["test_neg"],
)

In [19]:
train_pos_id.shape, train_neg_id.shape, test_pos_id.shape, test_neg_id.shape

((1546,), (1546,), (1227,), (10,))

In [24]:
train_pos.shape, train_neg.shape, test_pos.shape, test_neg.shape

((2, 1546), (2, 1546), (2, 1227), (2, 10))

In [29]:
train_pos

array([[ 330,  998,  477, ..., 1679, 1679, 1585],
       [ 546, 1257, 1668, ..., 1684, 1791, 1679]])

In [30]:
train_pos[0]

array([ 330,  998,  477, ..., 1679, 1679, 1585])

In [31]:
train_pos[1]

array([ 546, 1257, 1668, ..., 1684, 1791, 1679])

In [35]:
np.unique(train_pos_id), np.unique(test_pos_id)

(array([ 5,  6,  7,  8,  9, 10, 11, 12, 13]), array([15, 16, 17, 18, 19]))

In [37]:
np.unique(train_neg_id), np.unique(test_neg_id)

(array([ 5,  6,  7,  8,  9, 10, 11, 12, 13]), array([15, 16, 17, 18, 19]))

# DGraph Preprocessing

In [94]:
window_size = 5

In [40]:
dgraph_data = np.load("/Users/jl102430/Documents/study/anomaly_detection/data/dynamic/DGraph/DGraphFin/dgraphfin.npz")

X = dgraph_data["x"]
y = dgraph_data["y"]

edge_index = dgraph_data["edge_index"]
edge_type = dgraph_data["edge_type"]
edge_timestamp = dgraph_data["edge_timestamp"]

train_mask = dgraph_data["train_mask"]
valid_mask = dgraph_data["valid_mask"]
test_mask = dgraph_data["test_mask"]

print(
    f"""
X shape: {X.shape},
y shape: {y.shape}

edge_index shape: {edge_index.shape}
edge_type shape: {edge_type.shape}
edge_timestamp shape: {edge_timestamp.shape}

train_mask shape: {train_mask.shape}
valid_mask shape: {valid_mask.shape}
test_mask shape: {test_mask.shape}
"""
)


X shape: (3700550, 17),
y shape: (3700550,)

edge_index shape: (4300999, 2)
edge_type shape: (4300999,)
edge_timestamp shape: (4300999,)

train_mask shape: (857899,)
valid_mask shape: (183862,)
test_mask shape: (183840,)



In [140]:
def reindex_graph_dataset(
    edge_index, node_feature, node_label, edge_type, edge_timestamp, mask
):
    masked_edge_index = edge_index[mask]
    masked_edge_type = edge_type[mask]
    masked_edge_timestamp = edge_timestamp[mask] - 1

    sorted_index = np.argsort(masked_edge_timestamp)
    masked_edge_index = masked_edge_index[sorted_index]
    masked_edge_type = masked_edge_type[sorted_index]
    masked_edge_timestamp = masked_edge_timestamp[sorted_index]

    node_list = np.unique(masked_edge_index.flatten())

    reindex_edge_index = np.empty_like(masked_edge_index)
    node2id = {n: i for i, n in enumerate(node_list)}

    reindex_edge_index[:, 0] = np.array(
        list(map(lambda x: node2id[x], masked_edge_index[:, 0]))
    )
    reindex_edge_index[:, 1] = np.array(
        list(map(lambda x: node2id[x], masked_edge_index[:, 1]))
    )

    masked_node_feature = node_feature[node_list]
    masked_node_label = node_label[node_list]

    net = []
    for ts in tqdm(np.unique(masked_edge_timestamp)):
        ts_mask = masked_edge_timestamp == ts
        net_edge_index = reindex_edge_index[ts_mask]
        net_edge_type = masked_edge_type[ts_mask]

        _net = ssp.csc_matrix(
            (
                np.ones(net_edge_index.shape[0]),
                (net_edge_index[:, 0], net_edge_index[:, 1]),
            ),
            shape=(masked_node_feature.shape[0], masked_node_feature.shape[0]),
        )
        net.append(_net)
    net = np.array(net)

    print(
        f"""
          reindex_edge_index: {reindex_edge_index.shape}
          masked_edge_type: {masked_edge_type.shape}
          masked_edge_timestamp: {masked_edge_timestamp.shape}
          unique nodes: {np.unique(reindex_edge_index.flatten()).shape}
          masked_node_feature: {masked_node_feature.shape}
          masked_node_label: {masked_node_label.shape}
          net: {net.shape}
          """
    )

    return (
        reindex_edge_index,
        masked_node_feature,
        masked_node_label,
        masked_edge_type,
        masked_edge_timestamp,
        net,
        node2id,
    )

In [141]:
(
    train_edge_index,  #train_pos
    train_node_feature,
    train_node_label,
    train_edge_type,
    train_edge_timestamp, # train_pos_id
    train_net,
    train_node2id,
) = reindex_graph_dataset(edge_index, X, y, edge_type, edge_timestamp, train_mask)

100%|██████████| 821/821 [00:16<00:00, 48.66it/s]



          reindex_edge_index: (857899, 2)
          masked_edge_type: (857899,)
          masked_edge_timestamp: (857899,)
          unique nodes: (1367190,)
          masked_node_feature: (1367190, 17)
          masked_node_label: (1367190,)
          net: (821,)
          


In [159]:
import random
def sample_neg(net, pos):
    def sample_neg_ts(_net, pos):
        neg = []
        num_node = _net.shape[0]
        pbar = tqdm(total=100)
        while len(neg) < pos.shape[0]:
            i = random.randint(0, num_node-1)
            j = random.randint(i, num_node-1)
            if i < j and _net[i, j] == 0.0:
                neg.append([i, j])
                # print(len(neg) / pos.shape[0])
        pbar.close()
        return neg

    negs = []
    for ts in range(net.shape[0]):
        # print(f'Timestamp {ts}')
        negs.append(sample_neg_ts(net[ts], pos))
    
    return np.concatenate(negs, axis=1)
        

sample_neg(train_net, train_edge_index)

  0%|          | 0/100 [00:00<?, ?it/s]

857899


  0%|          | 0/100 [02:56<?, ?it/s]


In [143]:
train_edge_timestamp

array([  0,   0,   0, ..., 820, 820, 820])

In [145]:
train_edge_index, train_net[0][621549, 811894]

(array([[ 651984,  487447],
        [ 641818,  553387],
        [ 621549,  811894],
        ...,
        [ 139479,  333886],
        [1362734, 1333421],
        [1361817,  105163]]),
 1.0)

In [115]:
# for ts in range(821):
#     if train_net[ts][175553, 375980] == 1.0:
#         print(ts)
#         break
# train_edge_index[0], train_net[ts][175553, 375980]

108


(array([175553, 375980]), 1.0)