In [1]:
import pandas as pd
import plotly.express as px
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import json

In [2]:
# Directory holding the per-process trace dumps (read below as `process{idx}.jsons`).
graph_data_path_root = '../GraphData'
# Directory that receives the `{relation}_list.txt` files appended by write_relation_list.
output_root_dir = '../HetGNN/ProcessedData_rw'

In [3]:
# Load the lookup table that maps a raw trace id string to its integer index.
with open('../HetGNN/ProcessedData/trace_id_idx.json') as fin:
    trace_id_idx = json.load(fin)
trace_id_idx

{'007efb9578bc4f1ab3eab03cb5188af5.38.16292558869030095': 0,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292559185490209': 1,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292559661930463': 2,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292559997970635': 3,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560476751017': 4,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560535851045': 5,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560561031055': 6,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560682031129': 7,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292560768031179': 8,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561172141561': 9,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561219941605': 10,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561605952065': 11,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561764452363': 12,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561905452533': 13,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292561982742623': 14,
 '007efb9578bc4f1ab3eab03cb5188af5.38.16292562048462765': 15,
 '007efb9578bc4f1a

In [4]:
# Column-oriented accumulator: one list per output column, one entry per trace.
trace_info = defaultdict(list)

for idx in range(0, 9):
    with open(f'{graph_data_path_root}/process{idx}.jsons', 'r') as fin:
        # Each line of the file is parsed as one JSON-encoded trace.
        for line in tqdm(fin.readlines()):
            trace = json.loads(line)
            # Store the index looked up in trace_id_idx instead of the raw trace id string.
            trace_info['trace_id'].append(trace_id_idx[trace['trace_id']])
            trace_info['trace_bool'].append(trace['trace_bool'])
            trace_info['error_trace_type'].append(trace['error_trace_type'])
            trace_info['process_idx'].append(idx)
    # NOTE(review): this unconditional break leaves the loop after process0, so
    # process1..process8 are never read — confirm this is intended and not a debugging leftover.
    break

# One row per trace that was read; the bare expression displays the frame in the notebook.
trace_info_df = pd.DataFrame(trace_info)
trace_info_df

100%|██████████| 14721/14721 [00:04<00:00, 3278.34it/s]


Unnamed: 0,trace_id,trace_bool,error_trace_type,process_idx
0,0,True,normal,0
1,1,True,normal,0
2,2,True,normal,0
3,3,True,normal,0
4,4,True,normal,0
...,...,...,...,...
14716,14716,False,F04-02,0
14717,14717,False,F04-02,0
14718,14718,False,F04-02,0
14719,14719,False,F04-02,0


In [5]:
import numpy as np
import random

In [6]:
# Dimensions used only by the commented-out dense `graph` array in the next cell;
# no other visible cell reads them. NOTE(review): confirm whether they are still needed.
n_nodes = 1000
n_neighbours = 20

In [7]:
# graph = np.zeros((n_nodes, n_neighbours)) -1
# graph

In [98]:
import json

# For every trace in every process file: build its adjacency structure, then
# append its random-walk neighbour lists to the per-relation output files.
# NOTE(review): write_relation_list_from_graph is defined in a later cell of this
# notebook, so on a fresh kernel that cell has to be executed before this one.
for idx in range(0, 9):
    with open(f'{graph_data_path_root}/process{idx}.jsons', 'r') as fin:
        for line in tqdm(fin.readlines()):
            
            # graph: source node id -> set of (destination node id, destination node type)
            graph = defaultdict(set)
            # node_type_map is read as a global by write_relation_list_from_graph,
            # so it must be rebuilt here for each trace before that call.
            node_type_map = {}
            trace = json.loads(line)
            node_info = trace['node_info']
            gid = trace_id_idx[trace['trace_id']]
            # edge_type is unpacked from edge_attr but not used in this loop.
            for (src_id, dst_id), edge_type in zip(trace['edge_index'], trace['edge_attr']):
                # Element 4 of a node_info entry is used as the node's type
                # (presumably a type label — confirm against the data schema).
                dst_type = node_info[dst_id][4]
                src_type = node_info[src_id][4]
                graph[src_id].add((dst_id, dst_type))
                node_type_map[dst_id] = dst_type
                node_type_map[src_id] = src_type
            
            write_relation_list_from_graph(gid, graph)

  0%|          | 20/14721 [00:07<1:35:43,  2.56it/s]


KeyboardInterrupt: 

In [97]:
def write_relation_list_from_graph(gid, graph, node_types=None):
    """
    Random-walk a graph and append its neighbour lists to the relation files.

    The walks produced by ``random_walk(graph)`` are grouped by relation name
    ``"{src_type}_{neigh_type}"`` and handed to ``write_relation_list``.

    Parameters
    ----------
    gid
        Identifier of the graph; forwarded unchanged to ``write_relation_list``.
    graph : mapping
        Source node id -> collection of ``(neighbour_id, neighbour_type)`` tuples.
    node_types : mapping, optional
        Node id -> node type for every node that can appear as a walk source or
        as a visited neighbour. When omitted, the notebook-level global
        ``node_type_map`` is used, which keeps existing two-argument calls working.

    Raises
    ------
    KeyError
        If a walked node has no entry in the node type mapping.
    """
    # Prefer the explicit mapping; otherwise fall back to the global the caller cell maintains.
    type_of = node_type_map if node_types is None else node_types

    # relation name -> source node -> list of neighbours reached from it
    relation_list = defaultdict(lambda: defaultdict(list))

    for src, neigh_list in random_walk(graph).items():
        src_type = type_of[src]
        for neigh in neigh_list:
            relation_list[f'{src_type}_{type_of[neigh]}'][src].append(neigh)

    write_relation_list(gid, relation_list)


def write_relation_list(gid, relation_list):
    """
    Append the neighbour lists of one graph to the per-relation text files.

    For each relation name in ``relation_list`` the file
    ``{output_root_dir}/{relation}_list.txt`` is opened in append mode and one
    line ``gid:src:n1,n2,...`` is written per source node of that relation.
    """
    for relation_name, per_source in relation_list.items():
        target_path = f'{output_root_dir}/{relation_name}_list.txt'
        with open(target_path, 'a') as fout:
            # Lines are generated lazily so they are written in source-node order as they are built.
            fout.writelines(
                f"{gid}:{src}:{','.join(map(str, neighbours))}\n"
                for src, neighbours in per_source.items()
            )


def graph_neighbour_type_distribution(graph, scale=100):
    """
    Compute the share of each neighbour type among all edges of a graph.

    Parameters
    ----------
    graph : mapping
        Source node id -> collection of ``(neighbour_id, neighbour_type)`` tuples.
    scale : int or float, default 100
        Value the shares are scaled to before truncation; the default of 100
        yields whole-number percentages.

    Returns
    -------
    collections.defaultdict
        Neighbour type -> ``int(scale * count / total_count)``. Because each
        share is truncated, the values may sum to less than ``scale``. Empty
        when the graph has no edges.
    """
    neighbour_type_count = defaultdict(int)
    for neighbours in graph.values():
        for _, dst_type in neighbours:
            neighbour_type_count[dst_type] += 1

    total = sum(neighbour_type_count.values())

    # With no edges there are no keys, so the division below is never reached with total == 0.
    for dst_type in neighbour_type_count:
        neighbour_type_count[dst_type] = int(scale * neighbour_type_count[dst_type] / total)

    return neighbour_type_count


def random_walk(graph, walk_size=100, walk_type='even'):
    """
    Collect the nodes visited by random walks started from every source node.

    For each key of ``graph`` a walk starts at that node and repeatedly hops to
    a uniformly chosen neighbour. The walk restarts from the source node when
    it reaches a node without outgoing edges, or when the chosen neighbour is
    the current node or already lies on the current path. Sampling for a
    source stops once ``walk_size`` hops have been accepted in total, counted
    across restarts.

    Parameters
    ----------
    graph : mapping
        Source node id -> collection of ``(neighbour_id, neighbour_type)`` tuples.
    walk_size : int, default 100
        Number of accepted hops to sample per source node.
    walk_type : str, default 'even'
        Accepted for backward compatibility; it currently has no effect.

    Returns
    -------
    dict
        Source node id -> sorted NumPy array of the unique node ids visited
        from it. Source nodes that have no neighbour other than themselves are
        omitted, because no hop from them can ever be accepted.
    """
    graph_walks = {}

    for src_node in graph.keys():
        # Skip sources with no neighbours, and sources whose only neighbour is
        # themselves: every first hop would be rejected as cyclic and the
        # sampling loop below would never terminate.
        if all(neigh == src_node for neigh, _ in graph[src_node]):
            continue

        current_node = src_node
        steps_taken = 0
        walks = []
        current_walk = []
        while steps_taken < walk_size:
            # .get avoids inserting keys when graph is a defaultdict.
            neighbours = graph.get(current_node)
            if not neighbours:
                # Dead end: bank the current path and restart from the source.
                walks.extend(current_walk)
                current_walk = []
                current_node = src_node
                continue

            neigh_node, _ = random.choice(list(neighbours))

            # Restart the walk when the hop would be cyclic.
            if neigh_node in current_walk or neigh_node == current_node:
                walks.extend(current_walk)
                current_walk = []
                current_node = src_node
                print('graph cycled, restart')
                continue

            current_walk.append(neigh_node)
            steps_taken += 1
            current_node = neigh_node

        walks.extend(current_walk)
        # Only the set of visited nodes is kept, not the order or multiplicity of visits.
        graph_walks[src_node] = np.unique(np.array(walks).flatten())

    return graph_walks
    
