In [1]:
import json
import networkx as nx
import pandas as pd
import glob
import os
from math import floor
from tqdm import tqdm

In [2]:
# All input and output paths are resolved relative to the current working directory.
cwd = os.getcwd()

# Trace types that each get their own train/test CSV pair.
traces = [
    "1000genome",
    "cycles",
    "soykb",
]


## SET GLOBAL PARAMETERS

In [3]:
# Fraction of each graph list that is written to the *_train.csv file;
# the remaining graphs are written to the matching *_test.csv file.
TRAIN_RATIO = 0.8

# Output folder for the generated CSV files. The trailing "/" is kept so that
# file names can be appended to this string directly.
SAVE_DIRECTORY = f'{os.path.join(cwd, "real_life_traces")}/'

## Extract the graphs from the workflow traces

In [5]:
def convert_to_graph(file_name):
    """Build a directed dependency graph from a workflow-trace JSON file.

    The file must contain a ``data['workflow']['jobs']`` list whose entries
    each have a ``name`` and a ``parents`` list (names of the jobs they depend
    on). Every distinct job name is mapped to exactly one integer node id,
    assigned in order of first appearance, whether the name is first seen as a
    job entry or as a parent reference. One ``parent -> job`` edge is added
    per listed parent.

    Args:
        file_name: Path of the JSON trace file to read.

    Returns:
        A ``networkx.DiGraph`` whose nodes are the integer ids described above.

    Raises:
        KeyError: If ``workflow``, ``jobs``, ``name`` or ``parents`` is missing.
        AssertionError: If the resulting graph is not a directed acyclic graph.
    """
    # Read in data
    with open(file_name) as json_file:
        data = json.load(json_file)

    # Job name -> integer node id. The next free id is always len(id_mapper),
    # so ids stay dense (0, 1, 2, ...) without a separate counter.
    id_mapper = {}
    g = nx.DiGraph()

    for node in data['workflow']['jobs']:
        # Get-or-create: a job that was already referenced as a parent of an
        # earlier entry must keep the id it received then. Assigning a fresh
        # id here would leave a phantom node behind and misplace its edges.
        node_name = id_mapper.setdefault(node['name'], len(id_mapper))

        # Add node explicitly so jobs without any edges are still present
        # (nx handles duplicates).
        g.add_node(node_name)

        for parent in node['parents']:
            # Same get-or-create rule for parents that have not been seen yet.
            parent_name = id_mapper.setdefault(parent, len(id_mapper))

            # make edges (nx handles duplicates and adds missing endpoints)
            g.add_edge(parent_name, node_name)

    # Sanity check: a workflow with cyclic dependencies is not usable downstream.
    assert nx.algorithms.is_directed_acyclic_graph(g), (
        f"{file_name} does not describe a directed acyclic graph"
    )
    return g


def save_graph_data(filename, graphs):
    """Write a collection of graphs to a single-column CSV file.

    Each graph is converted with ``nx.node_link_data`` and stored as one row
    in the ``graph_object`` column. An empty ``graphs`` input produces a file
    that contains only the header row.

    Args:
        filename: Destination path of the CSV file.
        graphs: Iterable of networkx graphs to serialize.

    Returns:
        None. The CSV file is written as a side effect.
    """
    # Collect all rows first and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and appending row by row copied the whole frame
    # on every iteration.
    records = [
        {"graph_object": nx.node_link_data(graph)}  # node-link dict of the graph
        for graph in graphs
    ]

    # Passing columns explicitly keeps the header even when there are no rows.
    input_df = pd.DataFrame(records, columns=["graph_object"])

    # NOTE(review): each cell is written as the string form of a Python dict,
    # which is not necessarily strict JSON — confirm how readers parse it.
    input_df.to_csv(filename, index=False)


In [5]:
# Build one train/test CSV pair per trace type listed in `traces`.

# Create the output folder up front so saving does not fail on a fresh checkout.
os.makedirs(SAVE_DIRECTORY, exist_ok=True)

for trace in tqdm(traces):
    workflow_folder = os.path.join(
        cwd, "pegasus-traces-master", trace, "chameleon-cloud"
    )

    # Get all the files. glob returns matches in arbitrary order, so sort
    # them to make the train/test split below reproducible across runs.
    files = sorted(glob.glob(workflow_folder + "/*.json"))

    # Get all the graphs
    graphs = [convert_to_graph(f) for f in files]

    # Keep only graphs with at most 500 nodes.
    graphs = [graph for graph in graphs if graph.number_of_nodes() <= 500]

    # Split the graphs: the first TRAIN_RATIO share goes to train, the rest to test.
    # NOTE(review): no shuffling is done, so the split follows sorted file
    # order — confirm this is acceptable for the downstream evaluation.
    split_point = floor(len(graphs) * TRAIN_RATIO)

    train_file_name = f'{SAVE_DIRECTORY}{trace}_train.csv'
    test_file_name = f'{SAVE_DIRECTORY}{trace}_test.csv'

    # save the graphs
    save_graph_data(train_file_name, graphs[:split_point])
    save_graph_data(test_file_name, graphs[split_point:])
    
    
    


100%|██████████| 3/3 [00:01<00:00,  2.07it/s]


In [6]:
# Trace types that are combined into the single "big_merged" train/test pair.
traces2 = [
    "1000genome",
    "epigenomics",
    "montage",
    "seismology",
    "cycles",
]

In [7]:
# Build one merged train/test CSV pair from every trace type in `traces2`.
workflow_folder = os.path.join(cwd, "pegasus-traces-master")

# One "chameleon-cloud" folder per trace type. Each folder is derived from its
# own entry in traces2: index 0 for 1000genome (previously index 4, i.e.
# cycles), and the cycles folder from the cycles entry (previously built on
# top of the seismology folder, which produced a path that does not exist).
genome_folder = os.path.join(workflow_folder, traces2[0], "chameleon-cloud")
epigenomics_folder = os.path.join(workflow_folder, traces2[1], "chameleon-cloud")
montage_folder = os.path.join(workflow_folder, traces2[2], "chameleon-cloud")
seismology_folder = os.path.join(workflow_folder, traces2[3], "chameleon-cloud")
cycles_folder = os.path.join(workflow_folder, traces2[4], "chameleon-cloud")

trace_folders = [
    genome_folder,
    epigenomics_folder,
    montage_folder,
    seismology_folder,
    cycles_folder,
]

# Get all the files. glob returns matches in arbitrary order, so each folder's
# matches are sorted to make the train/test split below reproducible.
files = []
for trace_folder in trace_folders:
    files.extend(sorted(glob.glob(trace_folder + "/*.json")))

# Get all the graphs
graphs = [convert_to_graph(f) for f in files]

# Keep only graphs with at most 500 nodes.
graphs = [graph for graph in graphs if graph.number_of_nodes() <= 500]

# Split the graphs: the first TRAIN_RATIO share goes to train, the rest to test.
# NOTE(review): files are concatenated folder by folder without shuffling, so
# the test part is drawn from the last folders in the list — confirm this is
# the intended split.
split_point = floor(len(graphs) * TRAIN_RATIO)

train_file_name = f'{SAVE_DIRECTORY}big_merged_train.csv'
test_file_name = f'{SAVE_DIRECTORY}big_merged_test.csv'

# Create the output folder up front so saving does not fail on a fresh checkout.
os.makedirs(SAVE_DIRECTORY, exist_ok=True)

# save the graphs
save_graph_data(train_file_name, graphs[:split_point])
save_graph_data(test_file_name, graphs[split_point:])
    