In [89]:
import json
import os

import numpy as np
import pandas as pd
import plotly.express as px
import torch
from torch_geometric.utils import dense_to_sparse, to_dense_adj
from tqdm import tqdm

# Load Data

In [2]:
# Location of the DGraphFin .npz archive. Set the DGRAPHFIN_NPZ environment variable to
# point at a local copy; the fallback is the original, machine-specific absolute path.
data_path = os.environ.get(
    "DGRAPHFIN_NPZ",
    "/Users/jl102430/Documents/study/anomaly_detection/data/dynamic/DGraph/DGraphFin/dgraphfin.npz",
)


In [3]:
# Open the archive; the result is an NpzFile (see the repr below) whose arrays are
# pulled out by key in the next cell.
data = np.load(data_path)
data

<numpy.lib.npyio.NpzFile at 0x7fad8e3c2f90>

In [4]:
# Node-level arrays: one feature row and one label per node (same first dimension).
X = data['x']
y = data['y']

# Edge-level arrays: endpoints, type and timestamp of each edge (same first dimension).
edge_index = data['edge_index']
edge_type = data['edge_type']
edge_timestamp = data['edge_timestamp']

# Split arrays. Their lengths match neither the node count nor the edge count, and
# get_data() uses them to index the edge arrays, so they presumably hold integer indices.
# NOTE(review): in the public DGraphFin release these appear to be node ids - confirm
# that selecting edge rows with them is intended.
train_mask = data['train_mask']
valid_mask = data['valid_mask']
test_mask = data['test_mask']


# Sanity check: print the shape of every array that was loaded.
print(f"""
X shape: {X.shape},
y shape: {y.shape}

edge_index shape: {edge_index.shape}
edge_type shape: {edge_type.shape}
edge_timestamp shape: {edge_timestamp.shape}

train_mask shape: {train_mask.shape}
valid_mask shape: {valid_mask.shape}
test_mask shape: {test_mask.shape}
""")


X shape: (3700550, 17),
y shape: (3700550,)

edge_index shape: (4300999, 2)
edge_type shape: (4300999,)
edge_timestamp shape: (4300999,)

train_mask shape: (857899,)
valid_mask shape: (183862,)
test_mask shape: (183840,)



# Prepare Edge Index

In [5]:
def get_data(mask):
    """Build the edge table for one split, ordered by timestamp.

    ``mask`` is applied as an index to the notebook-level ``edge_index``,
    ``edge_type`` and ``edge_timestamp`` arrays.

    Returns a DataFrame with columns ``src_id``, ``dst_id``, ``edge_type``,
    ``edge_timestamp`` and ``trace_id`` (``edge_timestamp - 1``), sorted by
    ``edge_timestamp``. The index keeps the row positions from before the sort.

    NOTE(review): the split arrays may contain node ids rather than edge row
    numbers - confirm that indexing the edge arrays with them is intended.
    """
    extra_columns = {
        'edge_type': edge_type[mask],
        'edge_timestamp': edge_timestamp[mask],
    }
    selected = pd.DataFrame(
        edge_index[mask], columns=['src_id', 'dst_id']
    ).assign(**extra_columns)
    # Shift the timestamp down by one to obtain the trace id.
    selected = selected.assign(trace_id=selected['edge_timestamp'] - 1)

    return selected.sort_values(by='edge_timestamp')

# Materialise one timestamp-sorted edge table per split.
train_edges = get_data(train_mask)
test_edges = get_data(test_mask)
valid_edges = get_data(valid_mask)

# Display the train split.
train_edges 

Unnamed: 0,src_id,dst_id,edge_type,edge_timestamp,trace_id
559204,1810566,1361425,10,1,0
273657,1783155,1544039,11,1,0
775612,1728394,2239849,10,1,0
240611,1886055,683274,10,1,0
348220,2203323,773310,11,1,0
...,...,...,...,...,...
482142,3683404,3490494,2,821,820
427975,3683543,2721874,8,821,820
457712,394482,936197,5,821,820
13498,3683498,3595997,5,821,820


In [6]:
# Test-split edges, sorted by edge_timestamp (trace_id = edge_timestamp - 1).
test_edges

Unnamed: 0,src_id,dst_id,edge_type,edge_timestamp,trace_id
110814,682425,1496933,9,1,0
2588,1911080,2199706,9,1,0
25920,5388,1223207,9,1,0
159566,666234,1265083,11,1,0
79902,204916,400718,10,1,0
...,...,...,...,...,...
93365,3683908,408379,5,821,820
80122,3677127,914834,5,821,820
57482,2975154,1029181,5,821,820
144417,606050,2370665,4,821,820


In [7]:
# Validation-split edges, sorted by edge_timestamp (trace_id = edge_timestamp - 1).
valid_edges

Unnamed: 0,src_id,dst_id,edge_type,edge_timestamp,trace_id
50585,1415795,543208,10,1,0
177262,1736265,1592619,10,1,0
173483,2195490,2276340,9,1,0
166082,1884566,1879177,10,1,0
27370,577700,24468,9,1,0
...,...,...,...,...,...
11284,185828,1669105,5,821,820
40701,3683854,3145323,5,821,820
121197,971252,3684448,4,821,820
127976,3684515,301434,5,821,820


In [8]:
# One bar chart per split: number of edge rows in each trace_id.
# The three charts differ only in the frame and the title prefix, so they share one loop.
for split_label, split_edges in (
    ('Train', train_edges),
    ('Test', test_edges),
    ('Valid', valid_edges),
):
    # count() on src_id gives the number of edge rows per trace.
    edges_per_trace = split_edges.groupby('trace_id')['src_id'].count().reset_index()
    px.bar(
        edges_per_trace,
        x='trace_id',
        y='src_id',
        # The y column is a row count, not a node id - label it accordingly.
        labels={'src_id': 'edge count'},
        title=f'{split_label} Data - Graph Size by Timestamp (trace_id)'
    ).show()


In [9]:
# Largest and smallest endpoint ids per split, in the order
# (train max, train min, valid max, valid min, test max, test min).
tuple(
    bound
    for endpoints in (
        train_edges[['src_id', 'dst_id']],
        valid_edges[['src_id', 'dst_id']],
        test_edges[['src_id', 'dst_id']],
    )
    for bound in (endpoints.max(), endpoints.min())
)

(src_id    3699082
 dst_id    3700549
 dtype: int64,
 src_id    38
 dst_id     2
 dtype: int64,
 src_id    3698951
 dst_id    3700536
 dtype: int64,
 src_id     3
 dst_id    45
 dtype: int64,
 src_id    3699043
 dst_id    3700527
 dtype: int64,
 src_id    18
 dst_id    28
 dtype: int64)

# Prepare Node Feature

In [10]:
# Global node table: one row per node with its features, its label `y`, and a
# `node_id` column equal to the node's row position in X.
node_feature = (
    pd.DataFrame(X, columns=[f"feat_{i}" for i in range(X.shape[1])])
    .assign(y=y)
    .rename_axis('node_id')
    .reset_index()
)

node_feature

Unnamed: 0,node_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y
0,0,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
1,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3
2,2,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
3,3,1.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3
4,4,1.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3700545,3700545,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
3700546,3700546,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
3700547,3700547,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2
3700548,3700548,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2


# Prepare Train/Valid/Test Dataset

In [37]:
def expand_edges(_edges):
    """Attach the label of both endpoints to every edge.

    Looks up ``y`` in the notebook-level ``node_feature`` table, first for
    ``src_id`` and then for ``dst_id``. Both lookups are left joins, so every
    edge row is kept. The source label ends up in ``y_x`` and the destination
    label in ``y_y`` (pandas' default merge suffixes).
    """
    labels = node_feature[['node_id', 'y']]

    with_src_label = _edges.merge(
        labels, left_on='src_id', right_on='node_id', how='left'
    )
    with_both_labels = with_src_label.merge(
        labels, left_on='dst_id', right_on='node_id', how='left'
    )

    # The join keys duplicate src_id / dst_id, so discard them.
    return with_both_labels.drop(columns=['node_id_x', 'node_id_y'])

# Add source / destination labels (y_x / y_y) to the edges of every split.
train_edges_expand = expand_edges(train_edges)
valid_edges_expand = expand_edges(valid_edges)
test_edges_expand = expand_edges(test_edges)



In [39]:
# Train edges with endpoint labels: y_x is the source node's y, y_y the destination's.
train_edges_expand

Unnamed: 0,src_id,dst_id,edge_type,edge_timestamp,trace_id,y_x,y_y
0,1810566,1361425,10,1,0,0,0
1,1783155,1544039,11,1,0,2,3
2,1728394,2239849,10,1,0,2,2
3,1886055,683274,10,1,0,0,2
4,2203323,773310,11,1,0,3,3
...,...,...,...,...,...,...,...
857894,3683404,3490494,2,821,820,2,2
857895,3683543,2721874,8,821,820,2,0
857896,394482,936197,5,821,820,0,3
857897,3683498,3595997,5,821,820,2,2


In [38]:
# Validation edges with endpoint labels: y_x is the source node's y, y_y the destination's.
valid_edges_expand

Unnamed: 0,src_id,dst_id,edge_type,edge_timestamp,trace_id,y_x,y_y
0,1415795,543208,10,1,0,2,3
1,1736265,1592619,10,1,0,0,2
2,2195490,2276340,9,1,0,3,2
3,1884566,1879177,10,1,0,2,3
4,577700,24468,9,1,0,0,0
...,...,...,...,...,...,...,...
183857,185828,1669105,5,821,820,0,0
183858,3683854,3145323,5,821,820,2,2
183859,971252,3684448,4,821,820,0,2
183860,3684515,301434,5,821,820,3,2


In [12]:
# tx_ = train_edges_expand.groupby(['trace_id', 'y_x'])['src_id'].count().reset_index()
# tx_[tx_['y_x'] == 1].sort_values('src_id', ascending=False)

Unnamed: 0,trace_id,y_x,src_id
21,5,1,15
877,220,1,15
77,19,1,14
113,28,1,13
45,11,1,13
...,...,...,...
2087,534,1,1
2094,536,1,1
2138,548,1,1
2146,550,1,1


In [13]:
# train_edges_no_abnormal = train_edges.merge(
#     node_feature[node_feature.y!=1][['node_id', 'y']],
#     left_on='src_id',
#     right_on='node_id',
#     how='inner'
# ).merge(
#     node_feature[node_feature.y!=1][['node_id', 'y']],
#     left_on='dst_id',
#     right_on='node_id',
#     how='inner'
# ).drop(['node_id_x', 'node_id_y'], axis=1)
# train_edges_no_abnormal

In [40]:
def resolve_node_list(_edges_expand):
    """Collect, per trace, every node id that appears as an edge endpoint.

    Parameters
    ----------
    _edges_expand : pandas.DataFrame
        Edge table with at least ``trace_id``, ``src_id`` and ``dst_id`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per ``trace_id`` with a ``node_list`` column holding all source
        ids followed by all destination ids of that trace (duplicates kept).
    """
    grouped = _edges_expand.groupby('trace_id')
    src_lists = grouped['src_id'].apply(list)
    dst_lists = grouped['dst_id'].apply(list)

    # Both Series are indexed by trace_id, so the element-wise list concatenation
    # pairs the lists of the same trace. Combining them *before* reset_index()
    # avoids aligning a trace_id-indexed Series against a fresh 0..n-1 index,
    # which mis-pairs rows whenever trace ids are not exactly 0..n-1.
    return (src_lists + dst_lists).rename('node_list').reset_index()

# One row per trace with the list of node ids touched by that trace's edges.
train_node_list = resolve_node_list(train_edges_expand)
valid_node_list = resolve_node_list(valid_edges_expand)
test_node_list = resolve_node_list(test_edges_expand)
train_node_list

Unnamed: 0,trace_id,node_list
0,0,"[1810566, 1783155, 1728394, 1886055, 2203323, ..."
1,1,"[1129783, 1696847, 1101582, 2168082, 1030829, ..."
2,2,"[1616312, 701554, 1961404, 2049740, 2013065, 2..."
3,3,"[1793668, 1599852, 181698, 677594, 1962654, 21..."
4,4,"[1942810, 1726572, 833359, 1022321, 1767936, 3..."
...,...,...
816,816,"[1793909, 3679416, 1238490, 1504195, 3679426, ..."
817,817,"[3426684, 1298485, 479583, 2350693, 3210801, 2..."
818,818,"[3681531, 1405090, 3681860, 3681745, 3681968, ..."
819,819,"[974361, 1956140, 3682268, 29841, 1155769, 253..."


In [41]:
# Per-trace node id lists for the validation split.
valid_node_list

Unnamed: 0,trace_id,node_list
0,0,"[1415795, 1736265, 2195490, 1884566, 577700, 2..."
1,1,"[1986817, 1689969, 1404129, 2239796, 659998, 1..."
2,2,"[1817741, 1107930, 1855598, 1949802, 2267943, ..."
3,3,"[2116361, 2253723, 2188589, 1291589, 1619269, ..."
4,4,"[1644307, 2138032, 2270494, 805907, 1649299, 2..."
...,...,...
816,816,"[1217920, 3180159, 3678565, 114226, 3679060, 3..."
817,817,"[2523113, 1622415, 3677173, 3680293, 3378422, ..."
818,818,"[3681299, 3634144, 3677051, 184518, 3678566, 4..."
819,819,"[3683211, 3682791, 3670601, 249366, 1603777, 1..."


In [42]:
# Turn off pandas' chained-assignment (SettingWithCopy) check for the rest of the session.
pd.options.mode.chained_assignment = None


In [44]:
def resolve_node_features(_node_list, node_table=None):
    """Build the per-trace node feature table with trace-local node ids.

    For every row of ``_node_list`` the nodes named in ``node_list`` are looked
    up in the node table, tagged with the row's ``trace_id`` and numbered
    ``0..k-1`` in node-table order; that number is ``new_node_id``.

    Parameters
    ----------
    _node_list : pandas.DataFrame
        Frame with ``trace_id`` and ``node_list`` columns, one row per trace
        (as produced by ``resolve_node_list``).
    node_table : pandas.DataFrame, optional
        Node table with a ``node_id`` column. Defaults to the notebook-level
        ``node_feature`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``new_node_id``, the node table's columns, then ``trace_id``.
        The index equals ``new_node_id`` and therefore repeats across traces.
        An empty ``_node_list`` yields an empty frame with those columns.
    """
    if node_table is None:
        node_table = node_feature

    _node_feature_list = []
    # Walk trace ids and their node lists together rather than re-filtering
    # _node_list once per trace.
    trace_rows = zip(_node_list['trace_id'], _node_list['node_list'])
    for trace_idx, trace_nodes in tqdm(trace_rows, total=len(_node_list)):
        _node_feature = (
            node_table[node_table['node_id'].isin(trace_nodes)]
            # assign() returns a new frame, so no write into a filtered slice.
            .assign(trace_id=trace_idx)
            .reset_index(drop=True)
            .reset_index()
        )
        _node_feature_list.append(_node_feature)

    if not _node_feature_list:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['new_node_id', *node_table.columns, 'trace_id'])

    return pd.concat(_node_feature_list).rename(columns={"index": "new_node_id"})

# Build the per-trace node tables for each split (about a minute per split in the
# recorded run below).
train_node_feature = resolve_node_features(train_node_list)
valid_node_feature = resolve_node_features(valid_node_list)
test_node_feature = resolve_node_features(test_node_list)

100%|██████████| 821/821 [01:01<00:00, 13.41it/s]
100%|██████████| 821/821 [01:03<00:00, 12.94it/s]
100%|██████████| 821/821 [01:02<00:00, 13.13it/s]


In [45]:
# Per-trace node table for the train split; new_node_id restarts at 0 within each trace_id.
train_node_feature


Unnamed: 0,new_node_id,node_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,...,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y,trace_id
0,0,424,1.0,8.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0
1,1,1093,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0
2,2,1094,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0
3,3,1279,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0
4,4,3090,1.0,5.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,1807,3688221,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,820
1808,1808,3689491,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,820
1809,1809,3693274,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,0,820
1810,1810,3695889,-1.0,-1.0,0.74,0.705,0.952703,0.3,0.247,0.005,...,0.967611,0.0,0.047297,0.032389,0.021277,0.333333,-1.0,-1.0,2,820


In [47]:
# Per-trace node table for the validation split; new_node_id restarts at 0 within each trace_id.
valid_node_feature

Unnamed: 0,new_node_id,node_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,...,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y,trace_id
0,0,1080,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,0
1,1,7712,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0
2,2,8194,1.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3,0
3,3,9671,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,0
4,4,24468,0.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,427,3685595,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,820
428,428,3685737,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,820
429,429,3686006,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,820
430,430,3686326,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,820


In [48]:
def resolve_new_node_id(_edges, _node_feature):
    """Rewrite edge endpoints from global node ids to trace-local ids.

    Each endpoint is matched on ``(trace_id, node id)`` against
    ``_node_feature`` and replaced by that row's ``new_node_id``. Both matches
    are inner joins, so an edge whose endpoint has no match is dropped.

    The returned frame keeps the remaining edge columns and ends with the
    remapped ``src_id`` and ``dst_id`` columns.
    """
    id_lookup = _node_feature[["trace_id", "node_id", "new_node_id"]]

    remapped = _edges
    # Same join for both endpoints; only the key column and the output name differ.
    for endpoint_col, local_col in (("src_id", "new_src_id"), ("dst_id", "new_dst_id")):
        remapped = remapped.merge(
            id_lookup,
            left_on=["trace_id", endpoint_col],
            right_on=["trace_id", "node_id"],
            how="inner",
        )
        remapped = remapped.rename(columns={"new_node_id": local_col})
        remapped = remapped.drop(columns="node_id")

    # Swap the global endpoint columns for the trace-local ones.
    remapped = remapped.drop(columns=["src_id", "dst_id"])
    return remapped.rename(columns={"new_src_id": "src_id", "new_dst_id": "dst_id"})

# Re-express every split's edges with trace-local endpoint ids.
new_train_edges = resolve_new_node_id(train_edges_expand, train_node_feature)
new_valid_edges = resolve_new_node_id(valid_edges_expand, valid_node_feature)
new_test_edges = resolve_new_node_id(test_edges_expand, test_node_feature)

new_train_edges

Unnamed: 0,edge_type,edge_timestamp,trace_id,y_x,y_y,src_id,dst_id
0,10,1,0,0,0,1242,835
1,11,1,0,2,3,1215,968
2,10,1,0,2,2,1148,1658
3,10,1,0,0,2,1331,411
4,11,1,0,3,3,1628,473
...,...,...,...,...,...,...,...
857894,5,821,820,3,2,124,1719
857895,2,821,820,2,2,1422,1286
857896,8,821,820,2,0,1454,970
857897,5,821,820,2,2,1443,1329


In [49]:
# Validation edges after remapping; src_id / dst_id are now trace-local ids.
new_valid_edges

Unnamed: 0,edge_type,edge_timestamp,trace_id,y_x,y_y,src_id,dst_id
0,10,1,0,2,3,178,58
1,10,1,0,0,2,244,203
2,9,1,0,3,2,357,376
3,10,1,0,2,3,291,288
4,9,1,0,0,0,63,4
...,...,...,...,...,...,...,...
183857,5,821,820,0,0,14,146
183858,5,821,820,2,2,366,276
183859,4,821,820,0,2,79,415
183860,5,821,820,3,2,421,27


In [50]:
# Test edges after remapping; src_id / dst_id are now trace-local ids.
new_test_edges

Unnamed: 0,edge_type,edge_timestamp,trace_id,y_x,y_y,src_id,dst_id
0,9,1,0,3,2,83,190
1,9,1,0,2,2,275,356
2,9,1,0,0,0,0,150
3,11,1,0,0,2,81,158
4,10,1,0,0,2,25,46
...,...,...,...,...,...,...,...
183835,5,821,820,2,2,337,33
183836,5,821,820,2,3,295,76
183837,5,821,820,3,0,235,82
183838,4,821,820,0,2,52,186


In [None]:
# Node Types


# Output Datasets

In [52]:
# Output Edge Index
# Write each split's edge table, minus edge_timestamp, to its own CSV file.
for edges_to_write, csv_path in (
    (new_train_edges, '../dataset/train_edge_index.csv'),
    (new_valid_edges, '../dataset/valid_edge_index.csv'),
    (new_test_edges, '../dataset/test_edge_index.csv'),
):
    edges_to_write.drop(columns='edge_timestamp').to_csv(csv_path, index=False)

In [63]:
def min_max_normalise(df):
    """Scale every column of ``df`` to the [0, 1] range, column by column.

    A column whose minimum equals its maximum has no range to scale by; it is
    mapped to 0 rather than producing NaN from a 0/0 division.
    """
    lower = df.min()
    span = df.max() - lower
    # Divide constant columns by 1 so that (value - min) / span == 0.
    span = span.where(span != 0, 1)
    return (df - lower) / span


def normalise(df, skip_id=[0]):
    """Min-max scale the ``feat_<i>`` columns of ``df`` and re-attach the id columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``feat_<i>`` feature columns plus ``trace_id``, ``node_id``
        and ``y``.
    skip_id : sequence of int
        Feature indices to leave out of the result. The default list is only
        read, never modified.

    Returns
    -------
    pandas.DataFrame
        The scaled feature columns in ascending ``i`` order, followed by
        ``trace_id``, ``node_id`` and ``y`` copied unchanged from ``df``.
    """
    prefix = "feat_"
    # Take the feature indices from the frame itself rather than from a
    # notebook-level array, so the function works on any frame passed in.
    feature_ids = sorted(
        int(col[len(prefix):])
        for col in df.columns
        if isinstance(col, str) and col.startswith(prefix) and col[len(prefix):].isdigit()
    )
    feature_cols = [f"feat_{i}" for i in feature_ids if i not in skip_id]

    _new_df = min_max_normalise(df[feature_cols])
    _new_df["trace_id"] = df["trace_id"]
    _new_df["node_id"] = df["node_id"]
    _new_df["y"] = df["y"]
    return _new_df

In [66]:
# Use feat_0 as node type
# Replace the global node_id with the trace-local new_node_id, then min-max scale
# feat_1..feat_16; feat_0 is skipped because it is exported separately as the node type.
# NOTE(review): each split is scaled with its own min/max, so the same raw value can map
# to different scaled values in train, valid and test - confirm this is intended rather
# than reusing the train statistics.
train_node_feature_norm = normalise(
    train_node_feature.drop(["node_id"], axis=1).rename(
        columns={"new_node_id": "node_id"}
    ),
    skip_id=[0]
)
valid_node_feature_norm = normalise(
    valid_node_feature.drop(["node_id"], axis=1).rename(
        columns={"new_node_id": "node_id"}
    ),
    skip_id=[0]
)
test_node_feature_norm = normalise(
    test_node_feature.drop(["node_id"], axis=1).rename(
        columns={"new_node_id": "node_id"}
    ),
    skip_id=[0]
)

train_node_feature_norm

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,trace_id,node_id,y
0,1.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,0,2
1,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,1,2
2,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,2,2
3,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,3,2
4,0.666667,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,820,1807,2
1808,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,820,1808,2
1809,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,820,1809,0
1810,0.000000,0.015593,0.01528,0.976351,0.004964,0.017905,0.075349,0.017949,0.983806,0.0,0.523649,0.516194,0.510638,0.333333,0.0,0.0,820,1810,2


In [69]:
# Scaled validation node features; node_id here is the trace-local id.
valid_node_feature_norm

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,trace_id,node_id,y
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2
1,0.555556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
2,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,3
3,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,2
4,0.888889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,820,427,2
428,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,820,428,2
429,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,820,429,2
430,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,820,430,2


In [70]:
# Output Node Feature Norm
# Write the scaled node features of every split to its own CSV file.
for features_to_write, csv_path in (
    (train_node_feature_norm, '../dataset/train_node_feature_norm.csv'),
    (valid_node_feature_norm, '../dataset/valid_node_feature_norm.csv'),
    (test_node_feature_norm, '../dataset/test_node_feature_norm.csv'),
):
    features_to_write.to_csv(csv_path, index=False)

In [75]:
# Shape of the distinct edge_type values per split (train, valid, test).
tuple(
    split_edges.edge_type.unique().shape
    for split_edges in (new_train_edges, new_valid_edges, new_test_edges)
)

((11,), (11,), (11,))

In [82]:
# Node Type, as feat_0
# Raw feat_0 value -> integer node type code.
type_map = {0.0: 0, 1.0: 1, -1.0: 2}

# Add a node_type column to every split's node table; a feat_0 value missing from
# type_map raises KeyError.
for split_nodes in (train_node_feature, valid_node_feature, test_node_feature):
    split_nodes['node_type'] = split_nodes['feat_0'].apply(lambda raw: type_map[raw])


Unnamed: 0,new_node_id,node_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,...,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,y,trace_id,node_type
0,0,424,1.0,8.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0,1
1,1,1093,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0,2
2,2,1094,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0,2
3,3,1279,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0,2
4,4,3090,1.0,5.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,1807,3688221,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,820,2
1808,1808,3689491,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,2,820,2
1809,1809,3693274,-1.0,-1.0,-1.00,-1.000,-1.000000,-1.0,-1.000,-1.000,...,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,0,820,2
1810,1810,3695889,-1.0,-1.0,0.74,0.705,0.952703,0.3,0.247,0.005,...,0.0,0.047297,0.032389,0.021277,0.333333,-1.0,-1.0,2,820,2


In [91]:
def write_node_type(dataset_type, _node_feature, out_dir='../dataset', id_col='node_id'):
    """Write, for every trace, the node ids grouped by node type.

    The output file ``{out_dir}/{dataset_type}_node_types.txt`` holds one JSON
    line per ``trace_id`` (ascending). Each line is a list with one entry per
    node type ``0..max(node_type)``; an entry is the list of ids of that type in
    the trace, or ``[]`` when the trace has none.

    The file is opened once in write mode, so re-running the call replaces the
    file instead of appending a second copy of every line.

    Parameters
    ----------
    dataset_type : str
        Prefix of the output file name, e.g. ``'train'``.
    _node_feature : pandas.DataFrame
        Frame with ``trace_id``, ``node_type`` and ``id_col`` columns.
    out_dir : str
        Directory to write to; it must already exist.
    id_col : str
        Column whose values are written.
        NOTE(review): with the default ``'node_id'`` the frames passed by this
        notebook supply the global node id, while the exported edge and feature
        CSVs use the trace-local ``new_node_id`` - confirm which one the
        consumer of this file expects.
    """
    if _node_feature.empty:
        n_types = 0
    else:
        n_types = int(_node_feature['node_type'].max()) + 1

    # (trace_id, node_type) -> list of ids, built once instead of filtering the
    # grouped frame for every trace / type combination.
    ids_by_key = (
        _node_feature.groupby(['trace_id', 'node_type'])[id_col].apply(list).to_dict()
    )
    trace_ids = sorted(_node_feature['trace_id'].unique())

    with open(f'{out_dir}/{dataset_type}_node_types.txt', 'w') as fout:
        for gid in tqdm(trace_ids):
            type_list = [
                ids_by_key.get((gid, node_type), []) for node_type in range(n_types)
            ]
            fout.write(json.dumps(type_list))
            fout.write('\n')

# Write one node-type file per split.
write_node_type('train', train_node_feature)
write_node_type('valid', valid_node_feature)
write_node_type('test', test_node_feature)



100%|██████████| 821/821 [00:03<00:00, 213.71it/s]
100%|██████████| 821/821 [00:03<00:00, 243.17it/s]
100%|██████████| 821/821 [00:03<00:00, 249.92it/s]
