In [1]:
# %%
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.express as px
import json

import torch
from torch_geometric.utils import dense_to_sparse, to_dense_adj
import dgl
from dgl import save_graphs, load_graphs

In [2]:
data_path = "/Users/jl102430/Documents/study/anomaly_detection/data/dynamic/DGraph/DGraphFin/dgraphfin.npz"

In [3]:
data = np.load(data_path)
data

# %%
X = data["x"]
y = data["y"]

edge_index = data["edge_index"]
edge_type = data["edge_type"]
edge_timestamp = data["edge_timestamp"]

train_mask = data["train_mask"]
valid_mask = data["valid_mask"]
test_mask = data["test_mask"]


print(
    f"""
X shape: {X.shape},
y shape: {y.shape}

edge_index shape: {edge_index.shape}
edge_type shape: {edge_type.shape}
edge_timestamp shape: {edge_timestamp.shape}

train_mask shape: {train_mask.shape}
valid_mask shape: {valid_mask.shape}
test_mask shape: {test_mask.shape}
"""
)


X shape: (3700550, 17),
y shape: (3700550,)

edge_index shape: (4300999, 2)
edge_type shape: (4300999,)
edge_timestamp shape: (4300999,)

train_mask shape: (857899,)
valid_mask shape: (183862,)
test_mask shape: (183840,)



In [4]:
edge_timestamp[edge_timestamp <= 7].shape

(32454,)

In [5]:
# train_X, train_y = X[train_mask], y[train_mask]
edge_index[train_mask].shape, edge_timestamp[train_mask].shape, edge_type[train_mask].shape

((857899, 2), (857899,), (857899,))

In [6]:
train_edge_index = pd.DataFrame(edge_index[train_mask], columns=['src_id', 'dst_id'])
train_edge_index['timestamp'] = edge_timestamp[train_mask]
train_edge_index['edge_type'] = edge_type[train_mask]
train_edge_index = train_edge_index.sort_values('timestamp').reset_index(drop=True)

train_edge_index

Unnamed: 0,src_id,dst_id,timestamp,edge_type
0,1810566,1361425,1,10
1,1783155,1544039,1,11
2,1728394,2239849,1,10
3,1886055,683274,1,10
4,2203323,773310,1,11
...,...,...,...,...
857894,3683404,3490494,821,2
857895,3683543,2721874,821,8
857896,394482,936197,821,5
857897,3683498,3595997,821,5


In [7]:
valid_edge_index = pd.DataFrame(edge_index[valid_mask], columns=['src_id', 'dst_id'])
valid_edge_index['timestamp'] = edge_timestamp[valid_mask]
valid_edge_index['edge_type'] = edge_type[valid_mask]
valid_edge_index = valid_edge_index.sort_values('timestamp').reset_index(drop=True)

valid_edge_index

Unnamed: 0,src_id,dst_id,timestamp,edge_type
0,1415795,543208,1,10
1,1736265,1592619,1,10
2,2195490,2276340,1,9
3,1884566,1879177,1,10
4,577700,24468,1,9
...,...,...,...,...
183857,185828,1669105,821,5
183858,3683854,3145323,821,5
183859,971252,3684448,821,4
183860,3684515,301434,821,5


In [8]:
test_edge_index = pd.DataFrame(edge_index[test_mask], columns=['src_id', 'dst_id'])
test_edge_index['timestamp'] = edge_timestamp[test_mask]
test_edge_index['edge_type'] = edge_type[test_mask]
test_edge_index = test_edge_index.sort_values('timestamp').reset_index(drop=True)

test_edge_index

Unnamed: 0,src_id,dst_id,timestamp,edge_type
0,682425,1496933,1,9
1,1911080,2199706,1,9
2,5388,1223207,1,9
3,666234,1265083,1,11
4,204916,400718,1,10
...,...,...,...,...
183835,3683908,408379,821,5
183836,3677127,914834,821,5
183837,2975154,1029181,821,5
183838,606050,2370665,821,4


In [49]:
freq_cnt = train_edge_index['src_id'].value_counts().reset_index().merge(
    train_edge_index['dst_id'].value_counts().reset_index(),
    on='index',
    how='outer'
).fillna(0)

freq_cnt['node_freq'] = freq_cnt.src_id + freq_cnt.dst_id

# px.bar(freq_cnt.groupby('node_freq')[['index']].count().reset_index(), x='node_freq', y='index').show()

freq_cnt.groupby('node_freq')[['index']].count().reset_index()

Unnamed: 0,node_freq,index
0,1.0,1086509
1,2.0,229368
2,3.0,41841
3,4.0,7200
4,5.0,1400
5,6.0,347
6,7.0,151
7,8.0,74
8,9.0,59
9,10.0,47


In [24]:
train_node_list = sorted(list(set(train_edge_index.src_id.values).union(
    set(train_edge_index.dst_id)
)))

In [10]:
# node may appear again in later timestamps, large gap
train_edge_index[train_edge_index['timestamp'] <= 6].groupby('src_id').count().reset_index().groupby('dst_id').count()

Unnamed: 0_level_0,src_id,timestamp,edge_type
dst_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5494,5494,5494
2,42,42,42


In [12]:
node_feature = pd.DataFrame(X, columns=[f'feat_{i}' for i in range(17)])
node_feature['y'] = y

node_feature = node_feature.reset_index()
node_feature

Unnamed: 0,index,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y
0,0,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
1,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3
2,2,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
3,3,1.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3
4,4,1.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3700545,3700545,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
3700546,3700546,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
3700547,3700547,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
3700548,3700548,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2


In [74]:
# train_node_feature = X[train_node_list]
# train_node_feature

# Select Node Type

In [None]:
# setting node type
node_type_map = {
    0: 'A',
    1: 'B',
    -1: 'C'
}
node_types = node_feature['feat_0'].apply(lambda x: node_type_map[int(x)] ).reset_index(name='node_type')


node_feature = node_feature.drop('feat_0', axis=1).merge(
    node_types,
    on='index',
    how='left'
)

node_types

Unnamed: 0,index,node_type
0,0,A
1,1,C
2,2,A
3,3,B
4,4,B
...,...,...
3700545,3700545,C
3700546,3700546,C
3700547,3700547,C
3700548,3700548,C


In [None]:
node_feature

Unnamed: 0,index,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y,node_type
0,0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,A
1,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3,C
2,2,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,A
3,3,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3,B
4,4,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3700545,3700545,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,C
3700546,3700546,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,C
3700547,3700547,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,C
3700548,3700548,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,C


In [None]:
node_feature.y.values

array([2, 3, 2, ..., 2, 2, 2])

# Observation: Majority of the node only appear 1 times, some 2 times, very few more than 3 times

In [67]:
train_node_feature = freq_cnt.merge(node_feature, on="index")

px.bar(
    train_node_feature[train_node_feature.y==0].groupby(["node_freq", "y"])[
        ["index"]
    ].count().reset_index(),
    x='node_freq',
    y='index',
    facet_col='y',
    text='node_freq'
).show()

px.bar(
    train_node_feature[train_node_feature.y==1].groupby(["node_freq", "y"])[
        ["index"]
    ].count().reset_index(),
    x='node_freq',
    y='index',
    facet_col='y'
).show()

px.bar(
    train_node_feature[train_node_feature.y==2].groupby(["node_freq", "y"])[
        ["index"]
    ].count().reset_index(),
    x='node_freq',
    y='index',
    facet_col='y'
).show()

px.bar(
    train_node_feature[train_node_feature.y==3].groupby(["node_freq", "y"])[
        ["index"]
    ].count().reset_index(),
    x='node_freq',
    y='index',
    facet_col='y'
).show()

# Observation: Node Features are static throughout all the time windows

In [69]:
train_node_feature

Unnamed: 0,index,src_id,dst_id,node_freq,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,...,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y,node_type
0,1260136,6.0,0.0,6.0,3.0,0.485,0.485,1.000000,0.9,0.913,...,1.000000,3.0,-1.000000,-1.000000,0.092784,0.555556,0.030928,0.333333,0,A
1,2953670,5.0,0.0,5.0,0.0,0.560,0.485,0.866071,0.5,0.873,...,0.970218,0.0,0.133929,0.029782,0.051546,1.200000,-1.000000,-1.000000,2,B
2,2615443,5.0,0.0,5.0,3.0,0.445,0.400,0.898876,1.3,1.325,...,0.993208,4.0,0.101124,0.006792,0.162500,1.000000,0.044944,0.307692,2,B
3,3065739,5.0,0.0,5.0,3.0,-1.000,-1.000,-1.000000,-1.0,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,3,B
4,515654,5.0,0.0,5.0,2.0,-1.000,-1.000,-1.000000,-1.0,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1367185,2308818,0.0,1.0,1.0,5.0,-1.000,-1.000,-1.000000,-1.0,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,2,B
1367186,1253132,0.0,1.0,1.0,2.0,0.295,0.280,0.949153,0.4,0.761,...,0.996058,2.0,0.050847,0.003942,0.071429,0.500000,0.033898,0.500000,3,A
1367187,1252054,0.0,1.0,1.0,3.0,-1.000,-1.000,-1.000000,-1.0,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0,B
1367188,2298583,0.0,1.0,1.0,5.0,-1.000,-1.000,-1.000000,-1.0,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,2,B


In [72]:
feature_cols = [x for x in node_feature.columns if 'feat' in x]
# feature_cols

**Quick test with running classification on node features directly**

In [83]:
from sklearn.linear_model import LogisticRegression

clf_X = train_node_feature[train_node_feature['y'].isin([0,1])][feature_cols].values
clf_y = train_node_feature[train_node_feature['y'].isin([0,1])]['y'].values

clf = LogisticRegression(random_state=0).fit(clf_X, clf_y)

valid_freq_cnt = valid_edge_index['src_id'].value_counts().reset_index().merge(
    valid_edge_index['dst_id'].value_counts().reset_index(),
    on='index',
    how='outer'
).fillna(0)

valid_freq_cnt['node_freq'] = valid_freq_cnt.src_id + valid_freq_cnt.dst_id

# px.bar(valid_freq_cnt.groupby('node_freq')[['index']].count().reset_index(), x='node_freq', y='index').show()

# valid_freq_cnt.groupby('node_freq')[['index']].count().reset_index()

valid_node_feature = valid_freq_cnt.merge(node_feature, on="index")
# valid_node_feature

val_X = valid_node_feature[valid_node_feature['y'].isin([0,1])][feature_cols].values
val_y = valid_node_feature[valid_node_feature['y'].isin([0,1])]['y'].values

pred_val = clf.predict(val_X)

from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(
    val_y, pred_val
)
auc = metrics.auc(fpr, tpr)

# AP
precision, recall, thresholds = metrics.precision_recall_curve(
    val_y, pred_val
)
ap = metrics.auc(recall, precision)

auc, ap


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



(0.5, 0.5042017400645986)

In [114]:
px.bar(train_node_feature.groupby(['y'])[['index']].count())

In [120]:
# Frequency of Anomaly nodes
px.bar(train_node_feature[train_node_feature['y'] == 1].groupby(['node_freq'])[['index']].count())

# Observation: Node feature evolves only within the time window, going over the time window may reset the input node features when message passing

# Observation: Training data includes a lot noise mixed with normal and abnormal which could impact SVDD's approach

In [None]:
window_cnt = train_edge_index.merge(
    node_feature[["index", "y"]].rename(columns={"index": "src_id", "y": "src_y"}),
    on="src_id",
).merge(
    node_feature[["index", "y"]].rename(columns={"index": "dst_id", "y": "dst_y"}),
    on="dst_id",
)

window_cnt['is_anomaly'] = window_cnt[["src_y", "dst_y"]].apply(
    lambda x: True if x["src_y"] == 1 or x["dst_y"] == 1 else False, axis=1
)

In [121]:
# TODO: get the total cnt for pct of anomaly nodes in each timestamp
window_cnt.groupby(['timestamp'])

Unnamed: 0,src_id,dst_id,timestamp,edge_type,src_y,dst_y,is_anomaly
0,1810566,1361425,1,10,0,0,False
1,1810566,1435961,10,10,0,2,False
2,1783155,1544039,1,11,2,3,False
3,554407,1544039,16,11,0,3,False
4,1728394,2239849,1,10,2,2,False
...,...,...,...,...,...,...,...
857894,322104,3684392,821,5,3,2,False
857895,3683404,3490494,821,2,2,2,False
857896,3683543,2721874,821,8,2,0,False
857897,3683498,3595997,821,5,2,2,False


In [None]:
px.line(window_cnt.groupby(['timestamp'])[['is_anomaly']].count().reset_index(), x='timestamp', y='is_anomaly')

# Observation: Majority of anomaly nodes having edge type X?