In [2]:
import pandas as pd
import plotly.express as px
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import json

In [3]:
graph_data_path_root = '../GraphData'
output_root_dir = '../HetGNN/ProcessedData_clean'

## Meta Files

In [20]:
id_service_df = pd.read_csv(f'{graph_data_path_root}/id_service.csv', index_col=0)
id_service_df

Unnamed: 0,Service
0,ts-order-service
1,ts-station-service
2,ts-travel2-service
3,ts-ticketinfo-service
4,ts-basic-service
5,ts-route-service
6,ts-train-service
7,ts-price-service
8,ts-order-other-service
9,ts-seat-service


In [21]:
id_url_temp_df = pd.read_csv(f'{graph_data_path_root}/id_url+temp.csv', index_col=0)
id_url_temp_df

Unnamed: 0,EventTemplate
0,"draw back payment, userId: <:UNIQUE_ID:>, mone..."
1,[Draw Back Money] Draw back money...
2,[Cancel Order Service] Delay Process，Wrong Can...
3,[Cancel Order] Order <:*:> <:*:>
4,[Get Order By Id] Order Id: <:*:>
...,...
1508,{POST}/api/v1/routeplanservice/routePlan/minSt...
1509,{GET}/api/v1/routeservice/routes/{startId}/{te...
1510,{POST}/api/v1/travelservice/trips/routes
1511,{POST}/api/v1/travel2service/trips/routes


In [22]:
id_url_type_df = pd.read_csv(f'{graph_data_path_root}/id_url+type.csv', index_col=0)
id_url_type_df

Unnamed: 0,Temp,Type,Content
0,0,4,"draw back payment, userId: <:UNIQUE_ID:>, mone..."
1,1,4,[Draw Back Money] Draw back money...
2,2,4,[Cancel Order Service] Delay Process，Wrong Can...
3,3,4,[Cancel Order] Order <:*:> <:*:>
4,4,4,[Get Order By Id] Order Id: <:*:>
...,...,...,...
1873,1511,3,Client Response {POST}/api/v1/travel2service/t...
1874,1512,0,Server Request /api/v1/consignpriceservice/con...
1875,1512,2,Server Response /api/v1/consignpriceservice/co...
1876,1512,1,Client Request /api/v1/consignpriceservice/con...


In [23]:
id_url_type_df.Type.unique()

array([4, 0, 2, 1, 3, 5, 6, 7])

In [24]:
id_url_temp_df.iloc[1512].values

array(['/api/v1/consignpriceservice/consignprice/0.0/false'], dtype=object)

In [25]:
id_url_type_df.iloc[1874].values, id_url_type_df.iloc[1875].values, id_url_type_df.iloc[1876].values, id_url_type_df.iloc[1877].values

(array([1512, 0,
        'Server Request /api/v1/consignpriceservice/consignprice/0.0/false'],
       dtype=object),
 array([1512, 2,
        'Server Response /api/v1/consignpriceservice/consignprice/0.0/false'],
       dtype=object),
 array([1512, 1,
        'Client Request /api/v1/consignpriceservice/consignprice/0.0/false'],
       dtype=object),
 array([1512, 3,
        'Client Response /api/v1/consignpriceservice/consignprice/0.0/false'],
       dtype=object))

## Process Files

In [26]:
import json
with open(f'{graph_data_path_root}/process0.jsons', 'r') as fin:
    line = fin.readline()
sample = json.loads(line)
sample.keys()

dict_keys(['edge_index', 'edge_attr', 'node_info', 'trace_id', 'trace_bool', 'error_trace_type'])

In [27]:
for k in sample.keys():
    if isinstance(sample[k], list):
        print(f'{k} size: {len(sample[k])}')
    else:
        print(f'{k}: {sample[k]}')

edge_index size: 962
edge_attr size: 962
node_info size: 841
trace_id: 007efb9578bc4f1ab3eab03cb5188af5.38.16292558869030095
trace_bool: True
error_trace_type: normal


In [28]:
import numpy as np
np.unique(np.array(sample['edge_index']).flatten()).max()

840

In [29]:
set(sample['edge_attr'])

{0, 1, 2}

In [30]:
sample['node_info'][:10]
# for i in range(np.array(sample['node_info']).shape[1]):
#     print(f"{i}: {np.unique(np.array(sample['node_info'])[:, i])}")


[[1460.0, 13, -1, 0, 0, 0, 1666],
 [1460.0, 13, -1, 922, 2, 0, 1667],
 [502.0, 13.0, -1, 2.0, 4, 0.0, 502],
 [1364.0, 13.0, 13, 3.0, 1, 1.0, 1375],
 [1364.0, 13.0, 13, 10.0, 1, 2.0, 1375],
 [1364.0, 13.0, 13, 10.0, 3, 1.0, 1376],
 [44.0, 13.0, -1, 10.0, 4, 0.0, 44],
 [1364.0, 13.0, 13, 17.0, 3, 2.0, 1376],
 [44.0, 13.0, -1, 17.0, 4, 0.0, 44],
 [1388.0, 13.0, 13, 18.0, 5, 3.0, 1448]]

### Trace Info Overview

In [31]:
from collections import defaultdict
from tqdm import tqdm

In [32]:
trace_info = defaultdict(list)

for idx in range(0, 9):
    with open(f'{graph_data_path_root}/process{idx}.jsons', 'r') as fin:
        for line in tqdm(fin.readlines()):
            trace = json.loads(line)
            trace_info['trace_id'].append(trace['trace_id'])
            trace_info['trace_bool'].append(trace['trace_bool'])
            trace_info['error_trace_type'].append(trace['error_trace_type'])
            trace_info['process_idx'].append(idx)

trace_info_df = pd.DataFrame(trace_info)
trace_info_df

100%|██████████| 14721/14721 [00:04<00:00, 3471.33it/s]
100%|██████████| 14721/14721 [00:04<00:00, 3459.60it/s]
100%|██████████| 14721/14721 [00:04<00:00, 3505.14it/s]
100%|██████████| 14721/14721 [00:04<00:00, 3538.75it/s]
100%|██████████| 14721/14721 [00:04<00:00, 3543.14it/s]
100%|██████████| 14721/14721 [00:04<00:00, 3525.39it/s]
100%|██████████| 14720/14720 [00:04<00:00, 3543.90it/s]
100%|██████████| 14720/14720 [00:04<00:00, 3630.16it/s]
100%|██████████| 14719/14719 [00:04<00:00, 3540.19it/s]


Unnamed: 0,trace_id,trace_bool,error_trace_type,process_idx
0,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...,True,normal,0
1,007efb9578bc4f1ab3eab03cb5188af5.38.1629255918...,True,normal,0
2,007efb9578bc4f1ab3eab03cb5188af5.38.1629255966...,True,normal,0
3,007efb9578bc4f1ab3eab03cb5188af5.38.1629255999...,True,normal,0
4,007efb9578bc4f1ab3eab03cb5188af5.38.1629256047...,True,normal,0
...,...,...,...,...
132480,ffc3e79734424b65942466e0d8d432e2.46.1628705867...,False,F04-02,8
132481,ffc3e79734424b65942466e0d8d432e2.47.1628705858...,False,F04-02,8
132482,ffc3e79734424b65942466e0d8d432e2.47.1628705861...,False,F04-02,8
132483,ffc3e79734424b65942466e0d8d432e2.47.1628705865...,False,F04-02,8


In [33]:
trace_info_df.error_trace_type.unique()

array(['normal', 'F02-06', 'F07-04', 'F07-05', 'F04-06', 'F06-05',
       'F01-03', 'F05-01', 'F12-01', 'F01-02', 'F05-03', 'F06-04',
       'F05-02', 'F14-03', 'F10-02', 'F03-07', 'F03-08', 'F13-03',
       'F07-02', 'F02-04', 'F11-02', 'F11-01', 'F04-01', 'F08-01',
       'F10-01', 'F10-03', 'F09-01', 'F09-03', 'F02-01', 'F02-02',
       'F02-03', 'F04-03', 'F13-05', 'F07-01', 'F08-04', 'F14-02',
       'F08-02', 'F12-03', 'F03-03', 'F04-07', 'F04-08', 'F06-01',
       'F06-02', 'F06-03', 'F13-04', 'F13-01', 'F01-01', 'F08-05',
       'F01-04', 'F01-05', 'F11-05', 'F11-04', 'F11-03', 'F08-03',
       'F03-01', 'F05-05', 'F07-03', 'F09-02', 'F03-04', 'F04-05',
       'F03-06', 'F03-05', 'F12-05', 'F05-04', 'F02-05', 'F03-02',
       'F14-01', 'F12-02', 'F13-02', 'F04-04', 'F04-02'], dtype=object)

In [63]:
# trace_info_df['trace_id'] = trace_info_df['trace_id'].apply(lambda x: trace_id_idx[x])
trace_info_df.to_csv(f'{output_root_dir}/trace_info.csv', index=False)
trace_idx_to_id = {}
trace_id_to_idx = {}
for d in trace_info_df.reset_index()[['index', 'trace_id']].apply(lambda x: {x[0]:x[1]}, axis=1).values:
    for k, v in d.items():
        trace_idx_to_id[k] = v
        trace_id_to_idx[v] = k

In [36]:
px.histogram(trace_info_df, x='trace_bool', color='process_idx')

In [37]:
px.histogram(trace_info_df, x=['error_trace_type'], color='process_idx')

### Process Edges

In [66]:
import json
edge_details = defaultdict(list)
for idx in range(0, 9):
    with open(f'{graph_data_path_root}/process{idx}.jsons', 'r') as fin:
        for line in tqdm(fin.readlines()):
            trace = json.loads(line)
            node_info = trace['node_info']
            for (src_id, dst_id), edge_type in zip(trace['edge_index'], trace['edge_attr']):
                edge_details['src_id'].append(src_id)
                edge_details['src_type'].append(node_info[src_id][4])
                edge_details['dst_id'].append(dst_id)
                edge_details['dst_type'].append(node_info[dst_id][4])
                edge_details['edge_type'].append(edge_type)
                edge_details['trace_id'].append(trace['trace_id'])
edge_details_df = pd.DataFrame(edge_details)
edge_details_df

100%|██████████| 14721/14721 [00:10<00:00, 1403.84it/s]
100%|██████████| 14721/14721 [00:11<00:00, 1288.26it/s]
100%|██████████| 14721/14721 [00:12<00:00, 1207.15it/s]
100%|██████████| 14721/14721 [00:14<00:00, 1021.69it/s]
100%|██████████| 14721/14721 [00:15<00:00, 947.57it/s] 
100%|██████████| 14721/14721 [00:17<00:00, 850.23it/s] 
100%|██████████| 14720/14720 [00:22<00:00, 661.94it/s] 
100%|██████████| 14720/14720 [00:27<00:00, 538.12it/s] 
100%|██████████| 14719/14719 [00:25<00:00, 579.53it/s] 


Unnamed: 0,src_id,src_type,dst_id,dst_type,edge_type,trace_id
0,0,0,2,4,0,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...
1,2,4,3,1,0,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...
2,3,1,4,1,0,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...
3,4,1,5,3,0,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...
4,5,3,6,4,0,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...
...,...,...,...,...,...,...
29677741,14,4,15,4,0,ffc3e79734424b65942466e0d8d432e2.47.1628705868...
29677742,15,4,16,3,0,ffc3e79734424b65942466e0d8d432e2.47.1628705868...
29677743,16,3,17,4,0,ffc3e79734424b65942466e0d8d432e2.47.1628705868...
29677744,17,4,18,4,0,ffc3e79734424b65942466e0d8d432e2.47.1628705868...


In [67]:
with open(f'{output_root_dir}/trace_id_to_idx.json', 'w') as fout:
    fout.write(json.dumps(trace_id_to_idx))
    fout.write('\n')

with open(f'{output_root_dir}/trace_idx_to_id.json', 'w') as fout:
    fout.write(json.dumps(trace_idx_to_id))
    fout.write('\n')

In [68]:
edge_details_df['trace_id'] = edge_details_df['trace_id'].apply(lambda x: trace_id_to_idx[x])
edge_details_df

Unnamed: 0,src_id,src_type,dst_id,dst_type,edge_type,trace_id
0,0,0,2,4,0,0
1,2,4,3,1,0,0
2,3,1,4,1,0,0
3,4,1,5,3,0,0
4,5,3,6,4,0,0
...,...,...,...,...,...,...
29677741,14,4,15,4,0,132484
29677742,15,4,16,3,0,132484
29677743,16,3,17,4,0,132484
29677744,17,4,18,4,0,132484


In [69]:
relation_neigh_df = edge_details_df.groupby(['trace_id', 'src_type', 'dst_type', 'src_id']).agg(
    {
        'dst_id': lambda x: x.tolist()
    }
).reset_index()

relation_neigh_df

Unnamed: 0,trace_id,src_type,dst_type,src_id,dst_id
0,0,0,1,74,[76]
1,0,0,1,79,[81]
2,0,0,1,89,[91]
3,0,0,1,94,[96]
4,0,0,1,112,[114]
...,...,...,...,...,...
29674617,132484,4,4,12,[13]
29674618,132484,4,4,13,[14]
29674619,132484,4,4,14,[15]
29674620,132484,4,4,17,[18]


In [46]:
relation_neigh_df['dst_id'].apply(lambda x: len(x)).max()

2

In [28]:
relation_neigh_df['dst_id_list'] = relation_neigh_df['dst_id'].apply(lambda x: ','.join([str(i) for i in x]))
relation_neigh_df

Unnamed: 0,trace_id,src_type,dst_type,src_id,dst_id,dst_id_list
0,0,0,1,74,[76],76
1,0,0,1,79,[81],81
2,0,0,1,89,[91],91
3,0,0,1,94,[96],96
4,0,0,1,112,[114],114
...,...,...,...,...,...,...
29674617,132484,4,4,12,[13],13
29674618,132484,4,4,13,[14],14
29674619,132484,4,4,14,[15],15
29674620,132484,4,4,17,[18],18


In [31]:
for src_type, dst_type in unique_type_relations[['src_type', 'dst_type']].values:
    print(f'Processing {src_type}_{dst_type}')
    cond = (relation_neigh_df['src_type'] == src_type) & (relation_neigh_df['dst_type'] == dst_type)
    relation_neigh_df[['trace_id', 'src_id', 'dst_id_list']][cond].to_csv(f'{output_root_dir}/{src_type}_{dst_type}_list.txt', sep=':', index=None, header=False)


Processing 0_1
Processing 0_2
Processing 0_4
Processing 0_5
Processing 1_0
Processing 1_1
Processing 1_3
Processing 1_4
Processing 1_5
Processing 1_6
Processing 2_3
Processing 3_1
Processing 3_2
Processing 3_3
Processing 3_4
Processing 3_5
Processing 3_6
Processing 4_1
Processing 4_2
Processing 4_3
Processing 4_4
Processing 4_5
Processing 4_6
Processing 5_1
Processing 5_2
Processing 5_3
Processing 5_4
Processing 5_5
Processing 5_6
Processing 6_2
Processing 6_3
Processing 6_4
Processing 6_5
Processing 6_6
Processing 6_7
Processing 7_4
Processing 7_5


### Get Incoming Edge Embedding

In [32]:
edge_details_df

Unnamed: 0,src_id,src_type,dst_id,dst_type,edge_type,trace_id
0,0,0,2,4,0,0
1,2,4,3,1,0,0
2,3,1,4,1,0,0
3,4,1,5,3,0,0
4,5,3,6,4,0,0
...,...,...,...,...,...,...
29677741,14,4,15,4,0,132484
29677742,15,4,16,3,0,132484
29677743,16,3,17,4,0,132484
29677744,17,4,18,4,0,132484


In [36]:
incoming_edge_embeddings = pd.get_dummies(
    edge_details_df.drop(['src_type', 'src_id'], axis=1),
    columns=['edge_type'],
    prefix='e'
)

incoming_edge_embeddings


Unnamed: 0,dst_id,dst_type,trace_id,e_0,e_1,e_2,e_3
0,2,4,0,1,0,0,0
1,3,1,0,1,0,0,0
2,4,1,0,1,0,0,0
3,5,3,0,1,0,0,0
4,6,4,0,1,0,0,0
...,...,...,...,...,...,...,...
29677741,15,4,132484,1,0,0,0
29677742,16,3,132484,1,0,0,0
29677743,17,4,132484,1,0,0,0
29677744,18,4,132484,1,0,0,0


In [38]:
incoming_edge_embeddings.drop('dst_type', axis=1).groupby(['trace_id', 'dst_id']).sum().reset_index() \
    .to_csv(f'{output_root_dir}/incoming_edge_embedding.csv', index=False)

In [45]:
import os 
sorted(os.listdir('../HetGNN/ProcessedData'))

['0_1_list.txt',
 '0_2_list.txt',
 '0_4_list.txt',
 '0_5_list.txt',
 '1_0_list.txt',
 '1_1_list.txt',
 '1_3_list.txt',
 '1_4_list.txt',
 '1_5_list.txt',
 '1_6_list.txt',
 '2_3_list.txt',
 '3_1_list.txt',
 '3_2_list.txt',
 '3_3_list.txt',
 '3_4_list.txt',
 '3_5_list.txt',
 '3_6_list.txt',
 '4_1_list.txt',
 '4_2_list.txt',
 '4_3_list.txt',
 '4_4_list.txt',
 '4_5_list.txt',
 '4_6_list.txt',
 '5_1_list.txt',
 '5_2_list.txt',
 '5_3_list.txt',
 '5_4_list.txt',
 '5_5_list.txt',
 '5_6_list.txt',
 '6_2_list.txt',
 '6_3_list.txt',
 '6_4_list.txt',
 '6_5_list.txt',
 '6_6_list.txt',
 '6_7_list.txt',
 '7_4_list.txt',
 '7_5_list.txt',
 'incoming_edge_embedding.csv',
 'trace_id_idx.json']

In [66]:
trace_info_df = pd.read_csv(f'../HetGNN/ProcessedData/trace_info.csv', index_col=None)
trace_info_df

Unnamed: 0,trace_id,trace_bool,error_trace_type,process_idx
0,0,True,normal,0
1,1,True,normal,0
2,2,True,normal,0
3,3,True,normal,0
4,4,True,normal,0
...,...,...,...,...
132480,132480,False,F04-02,8
132481,132481,False,F04-02,8
132482,132482,False,F04-02,8
132483,132483,False,F04-02,8


In [78]:
trace_info_df[trace_info_df['trace_bool']==True]

Unnamed: 0,trace_id,trace_bool,error_trace_type,process_idx
0,0,True,normal,0
1,1,True,normal,0
2,2,True,normal,0
3,3,True,normal,0
4,4,True,normal,0
...,...,...,...,...
132440,132440,True,normal,8
132441,132441,True,normal,8
132442,132442,True,normal,8
132443,132443,True,normal,8


In [87]:
trace_info_df[trace_info_df['trace_id'].isin([1,2])]['trace_bool'].apply(lambda x: 0 if x else 1)

1    0
2    0
Name: trace_bool, dtype: int64

## Using Node Attr embedding

In [5]:
import json
node_embedding = defaultdict(list)
for idx in range(0, 9):
    with open(f'{graph_data_path_root}/process{idx}.jsons', 'r') as fin:
        for line in tqdm(fin.readlines()):
            trace = json.loads(line)
            node_info = trace['node_info']
            for i, vector in enumerate(node_info):
                node_embedding['trace_id'].append(trace['trace_id'])
                node_embedding['node_id'].append(i)
                for j, attr in enumerate(vector):
                    node_embedding[f'node_attr_{j}'].append(attr)
node_embedding_df = pd.DataFrame(node_embedding)
node_embedding_df

100%|██████████| 14721/14721 [00:17<00:00, 839.87it/s] 
100%|██████████| 14721/14721 [00:25<00:00, 584.41it/s] 
100%|██████████| 14721/14721 [00:31<00:00, 471.34it/s] 
100%|██████████| 14721/14721 [00:38<00:00, 383.79it/s] 
100%|██████████| 14721/14721 [00:45<00:00, 323.04it/s] 
100%|██████████| 14721/14721 [00:51<00:00, 286.05it/s] 
100%|██████████| 14720/14720 [00:59<00:00, 249.45it/s] 
100%|██████████| 14720/14720 [01:12<00:00, 202.96it/s]
100%|██████████| 14719/14719 [01:23<00:00, 175.57it/s] 


Unnamed: 0,trace_id,node_id,node_attr_0,node_attr_1,node_attr_2,node_attr_3,node_attr_4,node_attr_5,node_attr_6
0,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...,0,1460.0,13.0,-1,0.0,0,0.0,1666
1,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...,1,1460.0,13.0,-1,922.0,2,0.0,1667
2,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...,2,502.0,13.0,-1,2.0,4,0.0,502
3,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...,3,1364.0,13.0,13,3.0,1,1.0,1375
4,007efb9578bc4f1ab3eab03cb5188af5.38.1629255886...,4,1364.0,13.0,13,10.0,1,2.0,1375
...,...,...,...,...,...,...,...,...,...
27110858,ffc3e79734424b65942466e0d8d432e2.47.1628705868...,14,973.0,30.0,-1,2.0,4,0.0,973
27110859,ffc3e79734424b65942466e0d8d432e2.47.1628705868...,15,958.0,30.0,-1,2.0,4,0.0,958
27110860,ffc3e79734424b65942466e0d8d432e2.47.1628705868...,16,1496.0,30.0,30,3.0,3,2.0,1813
27110861,ffc3e79734424b65942466e0d8d432e2.47.1628705868...,17,1004.0,30.0,-1,3.0,4,0.0,1004


In [9]:
with open(f'{output_root_dir}/trace_id_to_idx.json','r') as fin:
    trace_id_to_idx = json.load(fin)

In [10]:
node_embedding_df['trace_id'] = node_embedding_df['trace_id'].apply(lambda x: trace_id_to_idx[x])
node_embedding_df

Unnamed: 0,trace_id,node_id,node_attr_0,node_attr_1,node_attr_2,node_attr_3,node_attr_4,node_attr_5,node_attr_6
0,0,0,1460.0,13.0,-1,0.0,0,0.0,1666
1,0,1,1460.0,13.0,-1,922.0,2,0.0,1667
2,0,2,502.0,13.0,-1,2.0,4,0.0,502
3,0,3,1364.0,13.0,13,3.0,1,1.0,1375
4,0,4,1364.0,13.0,13,10.0,1,2.0,1375
...,...,...,...,...,...,...,...,...,...
27110858,132484,14,973.0,30.0,-1,2.0,4,0.0,973
27110859,132484,15,958.0,30.0,-1,2.0,4,0.0,958
27110860,132484,16,1496.0,30.0,30,3.0,3,2.0,1813
27110861,132484,17,1004.0,30.0,-1,3.0,4,0.0,1004


In [11]:
node_embedding_df[node_embedding_df['trace_id'] == 0]

Unnamed: 0,trace_id,node_id,node_attr_0,node_attr_1,node_attr_2,node_attr_3,node_attr_4,node_attr_5,node_attr_6
0,0,0,1460.0,13.0,-1,0.0,0,0.0,1666
1,0,1,1460.0,13.0,-1,922.0,2,0.0,1667
2,0,2,502.0,13.0,-1,2.0,4,0.0,502
3,0,3,1364.0,13.0,13,3.0,1,1.0,1375
4,0,4,1364.0,13.0,13,10.0,1,2.0,1375
...,...,...,...,...,...,...,...,...,...
836,0,836,1372.0,6.0,13,913.0,0,337.0,1399
837,0,837,1372.0,6.0,13,919.0,2,337.0,1400
838,0,838,24.0,6.0,13,914.0,4,337.0,24
839,0,839,1373.0,6.0,6,917.0,5,338.0,1403


In [13]:
from sklearn.preprocessing import normalize

X_norm = normalize(node_embedding_df[node_embedding_df.columns[2:]].values,  axis=0)
X_norm

array([[ 2.35524549e-04,  2.67729971e-04, -1.83839488e-05, ...,
         0.00000000e+00,  0.00000000e+00,  2.62618626e-04],
       [ 2.35524549e-04,  2.67729971e-04, -1.83839488e-05, ...,
         1.03129485e-04,  0.00000000e+00,  2.62776260e-04],
       [ 8.09817286e-05,  2.67729971e-04, -1.83839488e-05, ...,
         2.06258969e-04,  0.00000000e+00,  7.91323832e-05],
       ...,
       [ 2.41332004e-04,  6.17838395e-04,  5.51518463e-04, ...,
         1.54694227e-04,  2.93618799e-06,  2.85790858e-04],
       [ 1.61963457e-04,  6.17838395e-04, -1.83839488e-05, ...,
         2.06258969e-04,  0.00000000e+00,  1.58264766e-04],
       [ 1.61963457e-04,  6.17838395e-04, -1.83839488e-05, ...,
         2.06258969e-04,  0.00000000e+00,  1.58264766e-04]])

In [14]:
for i in range(7):
    node_embedding_df[f'node_attr_{i}'] = X_norm[:,i]
node_embedding_df

Unnamed: 0,trace_id,node_id,node_attr_0,node_attr_1,node_attr_2,node_attr_3,node_attr_4,node_attr_5,node_attr_6
0,0,0,0.000236,0.000268,-0.000018,0.000000e+00,0.000000,0.000000,0.000263
1,0,1,0.000236,0.000268,-0.000018,9.548828e-06,0.000103,0.000000,0.000263
2,0,2,0.000081,0.000268,-0.000018,2.071329e-08,0.000206,0.000000,0.000079
3,0,3,0.000220,0.000268,0.000239,3.106994e-08,0.000052,0.000001,0.000217
4,0,4,0.000220,0.000268,0.000239,1.035665e-07,0.000052,0.000003,0.000217
...,...,...,...,...,...,...,...,...,...
27110858,132484,14,0.000157,0.000618,-0.000018,2.071329e-08,0.000206,0.000000,0.000153
27110859,132484,15,0.000155,0.000618,-0.000018,2.071329e-08,0.000206,0.000000,0.000151
27110860,132484,16,0.000241,0.000618,0.000552,3.106994e-08,0.000155,0.000003,0.000286
27110861,132484,17,0.000162,0.000618,-0.000018,3.106994e-08,0.000206,0.000000,0.000158


In [15]:
node_embedding_df.to_csv(f'{output_root_dir}/node_feature_norm.csv', index=False)

### Resolve trace_id in trace info csv 

In [10]:
import pandas as pd
trace_info = pd.read_csv(f'{output_root_dir}/trace_info.csv')
trace_info

Unnamed: 0,trace_id,trace_bool,error_trace_type,process_idx
0,0,True,normal,0
1,1,True,normal,0
2,2,True,normal,0
3,3,True,normal,0
4,4,True,normal,0
...,...,...,...,...
132480,132480,False,F04-02,8
132481,132481,False,F04-02,8
132482,132482,False,F04-02,8
132483,132483,False,F04-02,8


In [6]:
import json
with open(f'{output_root_dir}/trace_id_to_idx.json', 'r') as fin:
    trace_id_to_idx = json.load(fin)
trace_id_to_idx

{'007efb9578bc4f1ab3eab03cb5188af5.38.16292558869030095': 0,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292559185490209': 1,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292559661930463': 2,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292559997970635': 3,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560476751017': 4,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560535851045': 5,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560561031055': 6,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560682031129': 7,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560768031179': 8,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561172141561': 9,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561219941605': 10,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561605952065': 11,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561764452363': 12,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561905452533': 13,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561982742623': 14,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292562048462765': 15,
 '007efb9578bc4f1a

In [8]:
trace_info['trace_id'] = trace_info['trace_id'].apply(lambda x: trace_id_to_idx[x])
trace_info

Unnamed: 0,trace_id,trace_bool,error_trace_type,process_idx
0,0,True,normal,0
1,1,True,normal,0
2,2,True,normal,0
3,3,True,normal,0
4,4,True,normal,0
...,...,...,...,...
132480,132480,False,F04-02,8
132481,132481,False,F04-02,8
132482,132482,False,F04-02,8
132483,132483,False,F04-02,8


In [9]:
trace_info.to_csv(f'{output_root_dir}/trace_info.csv', index=False)

### Het File Optimise

In [13]:
i = 0
het_file_path = f'{output_root_dir}/het_neigh_list/het_neigh_list_{i}.json'
graph_output_path = f'{output_root_dir}/graph_het_neigh_list'

with open(het_file_path, 'r') as fin:
    _het_neigh_list = json.load(fin)


    # del _het_neigh_list[k]

In [15]:
for gid, het_neigh in tqdm(_het_neigh_list.items()):
    f_output_path = f'{graph_output_path}/g{gid}.json'
    with open(f_output_path, 'w') as fout:
        json.dump(het_neigh, fout)

100%|██████████| 14721/14721 [10:45<00:00, 22.82it/s] 


In [5]:
len(_het_neigh_list.keys()), len(existing_keys)

(14721, 14721)

In [6]:
# import torch

# torch.save(_het_neigh_list, f'{output_root_dir}/het_neigh_list/het_neigh_list_{i}.pt')