In [1]:
import os
import json
import pandas as pd
import re

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

class WorkflowSynthesizer:
    """
    Class for synthesizing a workflow from a dictionary of jobs. 
    The jobs and the workflow are represented as dataframes with time-series data.
    """

    def __init__(self):
        """
        Initialize a new instance of the WorkflowSynthesizer class.
        """
        self.workflow = pd.DataFrame()

    def synthetize(self, data_for_jobs):
        """
        Synthesize a workflow from the provided jobs.

        Args:
            data_for_jobs (dict): A dictionary where keys are job names 
            and values are DataFrames with columns 'timestamp', 'bytesRead', 
            and 'bytesWritten'.
        """
        min_timestamp = min(df['timestamp'].min() for df in data_for_jobs.values())
        max_timestamp = max(df['timestamp'].max() for df in data_for_jobs.values())
        
        # Create a DataFrame with a uniform timestamp range
        synthetic_timestamps = np.arange(min_timestamp, max_timestamp + 1, 5000)
        synthetic_df = pd.DataFrame({'timestamp': synthetic_timestamps})
        
        for job_name, job_df in data_for_jobs.items():
            # Merge with the synthetic DataFrame to get uniform timestamps
            merged_df = pd.merge(synthetic_df, job_df, on='timestamp', how='left').fillna(0)
            
            # Sum up the bytesRead and bytesWritten across all jobs
            if self.workflow.empty:
                self.workflow = merged_df
            else:
                self.workflow['bytesRead'] += merged_df['bytesRead']
                self.workflow['bytesWritten'] += merged_df['bytesWritten']

    def to_dict(self):
        """
        Convert the synthesized workflow to a dictionary format.

        Returns:
            dict: The synthesized workflow in dictionary format.
        """
        output = {}
        output['timestamp'] = self.workflow['timestamp'].tolist()
        output['bytesRead'] = self.workflow['bytesRead'].tolist()
        output['bytesWritten'] = self.workflow['bytesWritten'].tolist()
        return output
    
    def plot_workflow(self):
        """
        Plot the time series of bytesRead and bytesWritten for the synthesized workflow.
        """
        plt.figure(figsize=(14, 6))
        plt.plot(self.workflow['timestamp'], self.workflow['bytesRead'], label='Bytes Read')
        plt.plot(self.workflow['timestamp'], self.workflow['bytesWritten'], label='Bytes Written')
        plt.xlabel('Timestamp')
        plt.ylabel('Bytes')
        plt.title('Bytes Read and Written Over Time')
        plt.legend()
        plt.grid(True)
        plt.show()

# Example usage
data_for_jobs = {
    'job1': pd.DataFrame({'timestamp': [1687960785000, 1687960790000, 1687960795000], 'bytesRead': [10, 20, 30], 'bytesWritten': [5, 15, 25]}),
    'job2': pd.DataFrame({'timestamp': [1687961145000, 1687961150000, 1687961155000], 'bytesRead': [40, 50, 60], 'bytesWritten': [35, 45, 55]})
}

synthesizer = WorkflowSynthesizer()
synthesizer.synthetize(data_for_jobs)
print(synthesizer.workflow)
print(synthesizer.to_dict())
#synthesizer.plot_workflow()


        timestamp  bytesRead  bytesWritten
0   1687960785000       10.0           5.0
1   1687960790000       20.0          15.0
2   1687960795000       30.0          25.0
3   1687960800000        0.0           0.0
4   1687960805000        0.0           0.0
..            ...        ...           ...
70  1687961135000        0.0           0.0
71  1687961140000        0.0           0.0
72  1687961145000       40.0          35.0
73  1687961150000       50.0          45.0
74  1687961155000       60.0          55.0

[75 rows x 3 columns]
{'timestamp': [1687960785000, 1687960790000, 1687960795000, 1687960800000, 1687960805000, 1687960810000, 1687960815000, 1687960820000, 1687960825000, 1687960830000, 1687960835000, 1687960840000, 1687960845000, 1687960850000, 1687960855000, 1687960860000, 1687960865000, 1687960870000, 1687960875000, 1687960880000, 1687960885000, 1687960890000, 1687960895000, 1687960900000, 1687960905000, 1687960910000, 1687960915000, 1687960920000, 1687960925000, 16879609300

In [3]:
def camel_case_to_snake_case(name):
    """
    Convert a string from CamelCase to snake_case.
    
    Parameters:
        name (str): The string in CamelCase.
        
    Returns:
        str: The string in snake_case.
    """
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)).lower()

# Update the get_column_names function to include the renamed columns using the new prefix strategy
def get_column_names(json_file_name):
    """
    Given a JSON file name, this function returns a list of relevant column names.
    
    Parameters:
        json_file_name (str): The name of the JSON file.
        
    Returns:
        list: A list of strings representing column names.
    """
    # Extract the prefix from the file name and convert it to snake_case
    prefix = camel_case_to_snake_case(json_file_name.split('.')[0])
    
    column_name_mapping = {
        "accessPatternRead.json": ["timestamp", f"{prefix}_random", f"{prefix}_sequential", f"{prefix}_stride", f"{prefix}_unclassified"],
        "accessPatternWrite.json": ["timestamp", f"{prefix}_random", f"{prefix}_sequential", f"{prefix}_stride", f"{prefix}_unclassified"],
        "ioSizesRead.json": ["timestamp", f"{prefix}_0B_16B", f"{prefix}_16B_4KB", f"{prefix}_4KB_128KB", f"{prefix}_128KB_1MB", f"{prefix}_1MB_16MB", f"{prefix}_16MB_128MB", f"{prefix}_128MB_+"],
        "ioSizesWrite.json": ["timestamp", f"{prefix}_0B_16B", f"{prefix}_16B_4KB", f"{prefix}_4KB_128KB", f"{prefix}_128KB_1MB", f"{prefix}_1MB_16MB", f"{prefix}_16MB_128MB", f"{prefix}_128MB_+"],
        "operationsCount.json": ["timestamp", f"{prefix}_read", f"{prefix}_write"],
        "volume.json": ["timestamp", "bytesRead", "bytesWritten"]
    }
    
    return column_name_mapping.get(json_file_name, [])

# Test the above function
test_file_names = ["accessPatternRead.json", "ioSizesWrite.json", 
                   "operationsCount.json", "volume.json"]
for name in test_file_names:
    print(f"For {name}, the columns are: {get_column_names(name)}")


For accessPatternRead.json, the columns are: ['timestamp', 'access_pattern_read_random', 'access_pattern_read_sequential', 'access_pattern_read_stride', 'access_pattern_read_unclassified']
For ioSizesWrite.json, the columns are: ['timestamp', 'io_sizes_write_0B_16B', 'io_sizes_write_16B_4KB', 'io_sizes_write_4KB_128KB', 'io_sizes_write_128KB_1MB', 'io_sizes_write_1MB_16MB', 'io_sizes_write_16MB_128MB', 'io_sizes_write_128MB_+']
For operationsCount.json, the columns are: ['timestamp', 'operations_count_read', 'operations_count_write']
For volume.json, the columns are: ['timestamp', 'bytesRead', 'bytesWritten']


In [4]:
import os


def is_file_extension(filename, expected_extension):
    """
    Check if the file has the expected extension.
    
    Parameters:
        filename (str): The name of the file.
        expected_extension (str): The expected file extension (without the dot).
        
    Returns:
        bool: True if the file has the expected extension, False otherwise.
    """
    _, file_extension = os.path.splitext(filename)
    
    return file_extension == f".{expected_extension}"

# Tests
assert is_file_extension("example.json", "json") == True
assert is_file_extension("example.txt", "json") == False
assert is_file_extension("example.json", "txt") == False
assert is_file_extension("example.JSON", "json") == False  # Case-sensitive
assert is_file_extension("example", "json") == False       # No extension

print("All tests passed!")



All tests passed!


In [5]:
# Initialize an empty dictionary to hold the data for each job
data_for_jobs = {}

# Loop through all items in the current directory
for wf_folder in os.listdir():
    # Check if the item is a folder and if its name is numeric (i.e., a job number)
    print(f"----\nProcessing workflow : {wf_folder}...")
    if os.path.isdir(wf_folder):
        # Initialize a dictionary to hold the data for this specific job
        data_for_this_job = {}
        # Loop through all JSON files in this folder
        for job_folder in os.listdir(wf_folder):
            is_folder = os.path.isdir(os.path.join(os.getcwd(), wf_folder, job_folder))
            print(f"Found job_folder: {job_folder}, is_folder:{is_folder}")
            # Ensure that we are dealing with job folder
            if is_folder:
                print(f"job_folder: {job_folder}")
                # Initialize a DataFrame to hold the data for this specific job
                df_for_this_job = pd.DataFrame()
                print(f"    Browsing {job_folder}...")
                for json_file in os.listdir(os.path.join(wf_folder, job_folder)):
                    if is_file_extension(json_file, "json"):
                        print(f"      Processing {json_file}...")
                        # Construct the full path to the JSON file
                        json_file_path = os.path.join(os.getcwd(), wf_folder, job_folder, json_file)
                        
                        # Read the JSON file into a list of lists
                        with open(json_file_path, 'r') as f:
                            json_data = json.load(f)
                        
                        # Create a temporary DataFrame from the JSON data
                        df_temp = pd.DataFrame(json_data, columns=
                                            get_column_names(json_file))
                        
                        # Merge the temporary DataFrame into the DataFrame for this job, based on the 'timestamp' column
                        if df_for_this_job.empty:
                            df_for_this_job = df_temp
                        else:
                            df_for_this_job = pd.merge(df_for_this_job, 
                                                       df_temp, 
                                                       on='timestamp', 
                                                       how='outer')
                            
                    # Save the DataFrame for this job to a CSV file
                    csv_file_path = os.path.join(os.getcwd(), wf_folder, f"{job_folder}.csv")
                    
                    #df_for_this_job.to_csv(csv_file_path, index=False)
                    print(f"            Saving here... {job_folder}.csv")
                    
                    # Add this DataFrame to the dictionary
                    data_for_jobs[job_folder] = df_for_this_job

            else:
                print(f"Skipping {job_folder}...")
# synthesizer = WorkflowSynthesizer()
# synthesizer.synthetize(data_for_jobs)
# print(synthesizer.workflow)
# print(synthesizer.to_dict())
# synthesizer.plot_workflow()


def list_and_classify_directory_contents(directory_path):
    """
    List and classify the contents of a given directory into folders and files.
    
    Parameters:
        directory_path (str): The path to the directory to list.
        
    Returns:
        None: Prints the classification results.
    """
    for item in os.listdir(directory_path):
        item_path = os.path.join(directory_path, item)
        
        if os.path.isdir(item_path):
            print(f"{item} -> Folder")
        elif os.path.isfile(item_path):
            print(f"{item} -> File")
        else:
            print(f"{item} -> Unknown")

# Usage example
# Replace 'your_directory_path_here' with the path of the directory you want to list and classify.
# Uncomment the line below to run the function.
# list_and_classify_directory_contents('your_directory_path_here')




----
Processing workflow : ECMWF-649c3c40cc9340246f87cb58...
Found job_folder: 371912, is_folder:True
job_folder: 371912
    Browsing 371912...
      Processing ioSizesWrite.json...
            Saving here... 371912.csv
      Processing volume.json...
            Saving here... 371912.csv
      Processing ioSizesRead.json...
            Saving here... 371912.csv
      Processing operationsCount.json...
            Saving here... 371912.csv
      Processing accessPatternWrite.json...
            Saving here... 371912.csv
      Processing accessPatternRead.json...
            Saving here... 371912.csv
Found job_folder: 371913, is_folder:True
job_folder: 371913
    Browsing 371913...
      Processing ioSizesWrite.json...
            Saving here... 371913.csv
      Processing volume.json...
            Saving here... 371913.csv
      Processing ioSizesRead.json...
            Saving here... 371913.csv
      Processing operationsCount.json...
            Saving here... 371913.csv
      Proc

In [6]:
import os
import unittest
import requests
import urllib3
from app_decomposer.utils_demo import *

from loguru import logger
from unittest.mock import patch
from pprint import pprint
from os.path import dirname, abspath
import warnings
warnings.filterwarnings('ignore')
from performance_data.data_table import PhaseData, DataTable
from performance_data import DATASET_FILE
from app_decomposer import DEFAULT_CONFIGURATION, KIWI_CONFIG, CURRENT_DIR, API_DICT_TS, IOI_SAMPLING_PERIOD, DATASET_SOURCE
from app_decomposer.api_connector import request_delegator
from app_decomposer.config_parser import Configuration
from app_decomposer.api_connector import TimeSeries
from cluster_simulator.analytics import *
from loguru import logger
import simpy
from loguru import logger
import time
import numpy as np
import pandas as pd
from cluster_simulator.utils import convex_hull
from cluster_simulator.cluster import Cluster, Tier, EphemeralTier, bandwidth_share_model, compute_share_model, get_tier, convert_size
from cluster_simulator.phase import DelayPhase, ComputePhase, IOPhase
from cluster_simulator.application import Application
from cluster_simulator.analytics import display_run
from cluster_simulator.ephemeral_placement import ClusterBlackBox

In [7]:
'''
The function get_job_timeseries_from_file takes a workflow folder (wf_folder) and a job ID (job_id) as input parameters. It returns a dictionary containing various time series arrays for the specified job.

In this function, the job folder path is constructed using os.path.join and checked for existence. If the folder doesn't exist, a FileNotFoundError is raised. The function then initializes a dictionary (timeseries) to hold the time series data.

It iterates over all JSON files in the job folder, reading each file into a Pandas DataFrame and removing duplicate timestamps. It then populates the timeseries dictionary with the cleaned data.

To generate a CSV file with the specified columns, you'll need to first map the columns from the individual DataFrames to the target columns in the CSV file. Based on the dictionary structures you provided, here's how I interpret the mapping:

timestamp: Directly taken from all the entries (common).
bytesRead: From the volume entry.
bytesWritten: From the volume entry.
operationRead: From the operationsCount entry, specifically the operations_count_read field.
operationWrite: From the operationsCount entry, specifically the operations_count_write field.
accessRandRead: From the accessPatternRead entry, specifically the access_pattern_read_random field.
accessSeqRead: From the accessPatternRead entry, specifically the access_pattern_read_sequential field.
accessStrRead: From the accessPatternRead entry, specifically the access_pattern_read_stride field.
accessUnclRead: From the accessPatternRead entry, specifically the access_pattern_read_unclassified field.
accessRandWrite: From the accessPatternWrite entry, specifically the access_pattern_write_random field.
accessSeqWrite: From the accessPatternWrite entry, specifically the access_pattern_write_sequential field.
accessStrWrite: From the accessPatternWrite entry, specifically the access_pattern_write_stride field.
accessUnclWrite: From the accessPatternWrite entry, specifically the access_pattern_write_unclassified field.

'''
import os
import pandas as pd
import numpy as np
import random
import os
import pandas as pd
import numpy as np

def get_job_timeseries_from_json_file(wf_folder, job_id, skip_columns=[]):
    """
    Method to extract time series data for a specific job within a workflow folder.
    
    Parameters:
        wf_folder (str): The workflow folder containing job data.
        job_id (str): The specific job ID to extract time series data for.
        skip_columns (list): List of columns to be skipped.
    
    Returns:
        job_timeseries (dict): with 'volume', 'operationsCount' and 'accesspattern' as keys.
    """
    # Construct the path to the job folder
    job_folder_path = os.path.join(wf_folder, str(job_id))
    
    if not os.path.exists(job_folder_path):
        raise FileNotFoundError(f"Job folder for job_id {job_id} not found in {wf_folder}")

    # Initialize a dictionary to hold time series data
    job_timeseries = {}

    # Mapping of column names to final output names
    column_name_map = {
        "bytesRead": "bytesRead",
        "bytesWritten": "bytesWritten",
        "operations_count_read": "operationRead",
        "operations_count_write": "operationWrite",
        "access_pattern_read_random": "accessRandRead",
        "access_pattern_read_sequential": "accessSeqRead",
        "access_pattern_read_stride": "accessStrRead",
        "access_pattern_read_unclassified": "accessUnclRead",
        "access_pattern_write_random": "accessRandWrite",
        "access_pattern_write_sequential": "accessSeqWrite",
        "access_pattern_write_stride": "accessStrWrite",
        "access_pattern_write_unclassified": "accessUnclWrite",
    }

    # Mapping of JSON file names to output dictionary keys
    json_key_map = {
        "volume.json": "volume",
        "operationsCount.json": "operationsCount",
        "accessPatternRead.json": "accessPattern",
        "accessPatternWrite.json": "accessPattern",
        "ioSizesRead.json": "ioSizes",
        "ioSizesWrite.json": "ioSizes"
    }

    # Loop through all JSON files in this folder
    for json_file in os.listdir(job_folder_path):
        # Full path to the JSON file
        json_file_path = os.path.join(job_folder_path, json_file)
        
        # Read the JSON file into a DataFrame
        df = pd.read_json(json_file_path)
        
        # Get column names based on the json_file
        column_names = get_column_names(json_file)
        df.columns = column_names

        # Remove duplicate timestamps, if any
        df_clean = df.drop_duplicates(subset=['timestamp'])

        # Determine the dictionary key corresponding to this JSON file
        key = json_key_map.get(json_file)

        # Initialize a sub-dictionary for this type of time series
        if key not in job_timeseries:
            job_timeseries[key] = {}

        # Rename columns to their final output names
        for column in df_clean.columns:
            new_col_name = column_name_map.get(column, column)
            if new_col_name not in skip_columns:
                job_timeseries[key][new_col_name] = df_clean[column].to_numpy()

    return job_timeseries



# Example usage:
wf_folder = "/home_nfs/mimounis/iosea-wp3-recommandation-system/dataset_generation/dataset_deep/ECMWF-649c3c40cc9340246f87cb58"
job_id = "371902"
read_names = get_column_names("ioSizesRead.json")
read_names.remove("timestamp")
write_names = get_column_names("ioSizesWrite.json")
write_names.remove("timestamp")
skip_columns = read_names + write_names

job_timeseries = get_job_timeseries_from_json_file(wf_folder, job_id,
                                                   skip_columns=skip_columns)

for ts_key in list(job_timeseries.keys()):
    print(f"\n{ts_key} entry: ")
    print(job_timeseries[ts_key])



ioSizes entry: 
{'timestamp': array([1687960690000, 1687960695000, 1687960700000, 1687960705000,
       1687960710000, 1687960715000, 1687960720000, 1687960725000,
       1687960730000, 1687960735000, 1687960740000, 1687960745000,
       1687960750000, 1687960755000, 1687960760000, 1687960765000,
       1687960770000, 1687960775000])}

volume entry: 
{'timestamp': array([1687960690000, 1687960695000, 1687960700000, 1687960705000,
       1687960710000, 1687960715000, 1687960720000, 1687960725000,
       1687960730000, 1687960735000, 1687960740000, 1687960745000,
       1687960750000, 1687960755000, 1687960760000, 1687960765000,
       1687960770000, 1687960775000]), 'bytesRead': array([832678,      0,      0,      0,      0,  35936,    218,   3546,
          158,    158,    158,    158,    158,      0,    316,    158,
            0,    158]), 'bytesWritten': array([        0,         0,         0,         0,         0,    131072,
       872448408, 707461608, 791280104, 770243048, 75982

In [26]:
# Initialize an empty dictionary to hold the data for each job
data_for_jobs = {}

# Loop through all items in the current directory
for wf_folder in os.listdir():
    # Check if the item is a folder and if its name is numeric (i.e., a job number)
    print(f"----\nProcessing workflow : {wf_folder}...")
    if os.path.isdir(wf_folder):
        # Initialize a dictionary to hold the data for this specific job
        data_for_this_job = {}
        # Loop through all JSON files in this folder
        for job_folder in os.listdir(wf_folder):
            is_folder = os.path.isdir(os.path.join(os.getcwd(), wf_folder, job_folder))
            print(f"Found job_folder: {job_folder}, is_folder:{is_folder}")
            # Ensure that we are dealing with job folder
            if is_folder:
                print(f"job_folder: {job_folder}")
                # Initialize a DataFrame to hold the data for this specific job
                df_for_this_job = pd.DataFrame()
                print(f"    Browsing {job_folder}...")
                for json_file in os.listdir(os.path.join(wf_folder, job_folder)):
                    if is_file_extension(json_file, "json"):
                        print(f"      Processing {json_file}...")
                        # Construct the full path to the JSON file
                        json_file_path = os.path.join(os.getcwd(), wf_folder, job_folder, json_file)
                        
                        # Read the JSON file into a list of lists
                        with open(json_file_path, 'r') as f:
                            json_data = json.load(f)
                        
                        # Create a temporary DataFrame from the JSON data
                        df_temp = pd.DataFrame(json_data, columns=
                                            get_column_names(json_file))
                        
                        # Merge the temporary DataFrame into the DataFrame for this job, based on the 'timestamp' column
                        if df_for_this_job.empty:
                            df_for_this_job = df_temp
                        else:
                            df_for_this_job = pd.merge(df_for_this_job, 
                                                       df_temp, 
                                                       on='timestamp', 
                                                       how='outer')
                            
                    # Save the DataFrame for this job to a CSV file
                    csv_file_path = os.path.join(os.getcwd(), wf_folder, f"{job_folder}.csv")
                    
                    #df_for_this_job.to_csv(csv_file_path, index=False)
                    print(f"            Saving here... {job_folder}.csv")
                    
                    # Add this DataFrame to the dictionary
                    data_for_jobs[job_folder] = df_for_this_job
                    
                # Gather timeseries data for this job
                print(f"---shape of df : {df_for_this_job.shape[0]}")
                if df_for_this_job.shape[0] > 1:
                    representation, phase_features = decompose_ioi_job(wf_folder, job_folder)
                    print(representation)
                    print(phase_features)

            else:
                print(f"Skipping {job_folder}...")

----
Processing workflow : ECMWF-649c3c40cc9340246f87cb58...
Found job_folder: 371912, is_folder:True
job_folder: 371912
    Browsing 371912...
      Processing ioSizesWrite.json...
            Saving here... 371912.csv
      Processing volume.json...
            Saving here... 371912.csv
      Processing ioSizesRead.json...
            Saving here... 371912.csv
      Processing operationsCount.json...
            Saving here... 371912.csv
      Processing accessPatternWrite.json...
            Saving here... 371912.csv
      Processing accessPatternRead.json...
            Saving here... 371912.csv
---shape of df : 722
{'node_count': 1, 'events': [0, 1], 'read_volumes': [0, 3943956138], 'read_bw': [0, 5531495.284712482], 'write_volumes': [0, 66647259], 'write_bw': [0, 93474.41654978962], 'read_pattern': ['Uncl', 'Uncl'], 'write_pattern': ['Uncl', 'Seq'], 'read_operations': [0, 4813], 'write_operations': [0, 411405]}
[{'job_id': '371912', 'nodes': 1, 'read_volume': 0, 'write_volume': 0

In [20]:
# Example usage:
wf_folder = "/home_nfs/mimounis/iosea-wp3-recommandation-system/dataset_generation/dataset_deep/ECMWF-649c3c40cc9340246f87cb58"
job_id = "371902"
job_timeseries = get_job_timeseries_from_json_file(wf_folder, job_id)

#%%capture
logger.remove()
def decompose_ioi_job(wf_folder, job_id):
    with patch.object(ComplexDecomposer, 'get_job_timeseries') as mock_get_timeseries:
        with patch.object(Configuration, 'get_kc_token') as mock_get_kc_token:
            with patch.object(ComplexDecomposer, 'get_job_node_count') as mock_get_node_count:
                mock_get_timeseries.return_value = get_job_timeseries_from_json_file(wf_folder, job_id)
                mock_get_kc_token.return_value = 'token'
                mock_get_node_count.return_value = 1
                # init the job decomposer
                #cd = ComplexDecomposer(v0_threshold=0.02)
                cd = ComplexDecomposer(v0_threshold=0.01)
                representation = cd.get_job_representation(merge_clusters=True)
                phase_features = cd.get_phases_features(representation, job_id=job_id)
                return representation, phase_features
# Launch decomposition on the signal
representation, phase_features = decompose_ioi_job(wf_folder, job_id)
# Showing representation
compute, reads, read_bw, writes, write_bw = representation["events"], representation["read_volumes"], representation["read_bw"], representation["write_volumes"], representation["write_bw"]

print(phase_features)

[{'job_id': '371902', 'nodes': 1, 'read_volume': 868614, 'write_volume': 0, 'read_io_pattern': 'uncl', 'write_io_pattern': 'uncl', 'read_io_size': 39482.454545454544, 'write_io_size': 0, 'ioi_bw': 86861.4}, {'job_id': '371902', 'nodes': 1, 'read_volume': 0, 'write_volume': 9034495302, 'read_io_pattern': 'uncl', 'write_io_pattern': 'str', 'read_io_size': 0, 'write_io_size': 9181397.664634146, 'ioi_bw': 164263550.94545454}, {'job_id': '371902', 'nodes': 1, 'read_volume': 0, 'write_volume': 0, 'read_io_pattern': 'uncl', 'write_io_pattern': 'uncl', 'read_io_size': 0, 'write_io_size': 0, 'ioi_bw': 0.0}]


In [9]:
# Normalize signals to seconds and MB
timestamps = (cd.timestamps.flatten() - cd.timestamps.flatten()[0])/5
original_read =  cd.read_signal.flatten()/1e6
original_write = cd.write_signal.flatten()/1e6

read_bw_scaled = list(map(lambda x: x/1e6, read_bw))
write_bw_scaled = list(map(lambda x: x/1e6, write_bw))

env = simpy.Environment()
nvram_bandwidth = {'read':  {'seq': 780, 'rand': 760},
                    'write': {'seq': 515, 'rand': 505}}
ssd_bandwidth = {'read':  {'seq': 1, 'rand': 1},
                    'write': {'seq': 1, 'rand': 1}}

ssd_tier = Tier(env, 'SSD', max_bandwidth=ssd_bandwidth, capacity=200e9)
nvram_tier = Tier(env, 'NVRAM', max_bandwidth=nvram_bandwidth, capacity=80e9)
data = simpy.Store(env)
cluster = Cluster(env,  compute_nodes=1, cores_per_node=2,
                    tiers=[ssd_tier, nvram_tier])
app = Application(env, name=f"job#{job_id}",
                    compute=compute,
                    read=reads,
                    write=writes,
                    data=data,
                    read_bw=read_bw_scaled,
                    write_bw=write_bw_scaled)
env.process(app.run(cluster, placement=[0]*(10*len(compute))))
env.run()

output = get_execution_signal_3(data, nbr_points=len(timestamps))
sim_time = np.array(output[app.name]["time"])
sim_read_bw = np.array(output[app.name]["read_bw"])
sim_write_bw = np.array(output[app.name]["write_bw"])
print(len(sim_time))
print(len(timestamps))

[{'job_id': '371902', 'nodes': 1, 'read_volume': 868614, 'write_volume': 0, 'read_io_pattern': 'uncl', 'write_io_pattern': 'uncl', 'read_io_size': 39482.454545454544, 'write_io_size': 0, 'ioi_bw': 86861.4}, {'job_id': '371902', 'nodes': 1, 'read_volume': 0, 'write_volume': 9034495302, 'read_io_pattern': 'uncl', 'write_io_pattern': 'str', 'read_io_size': 0, 'write_io_size': 9181397.664634146, 'ioi_bw': 164263550.94545454}, {'job_id': '371902', 'nodes': 1, 'read_volume': 0, 'write_volume': 0, 'read_io_pattern': 'uncl', 'write_io_pattern': 'uncl', 'read_io_size': 0, 'write_io_size': 0, 'ioi_bw': 0.0}]


In [19]:
plot_detected_phases_compare(job_id, merge=True, show_phases=False, 
                             ts=(timestamps, original_write, sim_write_bw), 
                             width=800, height=500)

In [4]:
import json
import os
from collections import defaultdict

# Define the path to the dataset folder
dataset_folder = './dataset_generation/dataset_deep/'

# Initialize dictionary to hold job data
workflow_data = defaultdict(list)

# Step 1: Load Data
for root, dirs, files in os.walk(dataset_folder):
    for filename in files:
        if filename == 'volume.json':
            filepath = os.path.join(root, filename)
            print(filepath)
            with open(filepath, 'r') as f:
                volume_data = json.load(f)
            
            # Extract the job start and end timestamps
            job_id = root.split('/')[-2]  # Assuming job ID is the parent folder name
            start_time = volume_data[0][0]
            end_time = volume_data[-1][0]
            
            # Store data in the dictionary
            workflow_data[job_id] = {'start_time': start_time, 'end_time': end_time}

# Step 2: Sort Jobs by Timestamp
sorted_jobs = sorted(workflow_data.items(), key=lambda x: x[1]['start_time'])


In [8]:
import os
import json
from collections import defaultdict
import pandas as pd

# Function to check file extension
def is_file_extension(filename, extension):
    return filename.endswith(f'.{extension}')

# Initialize dictionary to hold job data
workflow_data = defaultdict(list)

# Default workflow folder
wf_folder = "/home_nfs/mimounis/iosea-wp3-recommandation-system/dataset_generation/dataset_deep/ECMWF-649c3c40cc9340246f87cb58"

# Loop through all job folders in the workflow folder
for job_folder in os.listdir(wf_folder):
    #print(f"----\nProcessing job : {job_folder}...")
    
    job_folder_path = os.path.join(wf_folder, job_folder)
    
    # Ensure that we are dealing with job folder
    if os.path.isdir(job_folder_path):
        json_file_path = os.path.join(job_folder_path, 'volume.json')
        
        # Check if volume.json exists in the job folder
        if os.path.exists(json_file_path):
            #print(f"      Processing volume.json for job {job_folder}...")
            
            # Read the JSON file into a list of lists
            with open(json_file_path, 'r') as f:
                volume_data = json.load(f)
            
            # Extract the job start and end timestamps
            start_time = volume_data[0][0]
            end_time = volume_data[-1][0]
            
            # Store data in the dictionary
            workflow_data[job_folder] = {'start_time': start_time, 'end_time': end_time}

# Calculate Delays and Check for Overlaps
previous_end_time = 0
for i, (job_id, job_data) in enumerate(sorted_jobs):
    delay = job_data['start_time'] - previous_end_time
    print(f"Delay between Job {sorted_jobs[i-1][0]} and Job {job_id}: {delay} ms")
    
    if i > 0 and job_data['start_time'] < sorted_jobs[i-1][1]['end_time']:
        print(f"Job {job_id} and Job {sorted_jobs[i-1][0]} have overlapping time spans.")
    
    previous_end_time = job_data['end_time']

    if delay < 5000:  # 5 seconds
        print(f"Guessing 'afterok' dependency between Job {sorted_jobs[i-1][0]} and Job {job_id}")

print(workflow_data)

defaultdict(<class 'list'>, {'371912': {'start_time': 1687960785000, 'end_time': 1687964390000}, '371913': {'start_time': 1687960790000, 'end_time': 1687964390000}, '371911': {'start_time': 1687960780000, 'end_time': 1687964390000}, '371906': {'start_time': 1687960695000, 'end_time': 1687960775000}, '371924': {'start_time': 1687964500000, 'end_time': 1687964500000}, '371910': {'start_time': 1687960775000, 'end_time': 1687964390000}, '371903': {'start_time': 1687960690000, 'end_time': 1687960775000}, '371902': {'start_time': 1687960690000, 'end_time': 1687960775000}, '371900': {'start_time': 1687960640000, 'end_time': 1687960640000}, '371908': {'start_time': 1687960765000, 'end_time': 1687964360000}, '371905': {'start_time': 1687960690000, 'end_time': 1687960775000}, '371907': {'start_time': 1687960755000, 'end_time': 1687964360000}, '371904': {'start_time': 1687960690000, 'end_time': 1687960775000}, '371909': {'start_time': 1687960770000, 'end_time': 1687964390000}})


In [18]:
import json

# Example sorted_jobs
sorted_jobs = sorted(
    {
        '371912': {'start_time': 1687960785000, 'end_time': 1687964390000},
        '371913': {'start_time': 1687960790000, 'end_time': 1687964390000},
        '371911': {'start_time': 1687960780000, 'end_time': 1687964390000},
        '371906': {'start_time': 1687960695000, 'end_time': 1687960775000},
        '371924': {'start_time': 1687964500000, 'end_time': 1687964500000},
        '371910': {'start_time': 1687960775000, 'end_time': 1687964390000},
        '371903': {'start_time': 1687960690000, 'end_time': 1687960775000},
        '371902': {'start_time': 1687960690000, 'end_time': 1687960775000},
        '371900': {'start_time': 1687960640000, 'end_time': 1687960640000},
        '371908': {'start_time': 1687960765000, 'end_time': 1687964360000},
        '371905': {'start_time': 1687960690000, 'end_time': 1687960775000},
        '371907': {'start_time': 1687960755000, 'end_time': 1687964360000},
        '371904': {'start_time': 1687960690000, 'end_time': 1687960775000},
        '371909': {'start_time': 1687960770000, 'end_time': 1687964390000}
    }.items(),
    key=lambda x: x[1]['start_time']
)

# Initialize dependency dictionaries
dependencies = {
    'sequential': [],
    'parallel': [],
    'after': [],
    'delay': []
}

# Threshold for 'after' dependency in milliseconds
threshold = 1000  # 1 second
# sorted_jobs = workflow_data
# Loop through sorted jobs
for i in range(len(sorted_jobs)):
    current_job_id, current_job_data = sorted_jobs[i]
    current_start, current_end = current_job_data['start_time'], current_job_data['end_time']
    
    for j in range(i+1, len(sorted_jobs)):
        next_job_id, next_job_data = sorted_jobs[j]
        next_start, next_end = next_job_data['start_time'], next_job_data['end_time']
        
        # Check for Sequential dependency
        if current_end <= next_start:
            dependencies['sequential'].append([current_job_id, next_job_id])
        
        # Check for Parallel dependency
        if current_start < next_end and current_end > next_start:
            dependencies['parallel'].append([current_job_id, next_job_id])
        
        # Check for 'After' dependency
        if next_start - current_end <= threshold:
            dependencies['after'].append([current_job_id, next_job_id])
        
        # Check for 'Delay' dependency (here, we specify an example delay of 5 seconds)
        if 5000 <= next_start - current_end <= 5000 + threshold:
            dependencies['delay'].append([current_job_id, next_job_id, 5000])

print(dependencies["delay"])

[['371903', '371911', 5000], ['371902', '371911', 5000], ['371905', '371911', 5000], ['371904', '371911', 5000], ['371906', '371911', 5000]]


In [21]:
print((sorted_jobs))

[('371900', {'start_time': 1687960640000, 'end_time': 1687960640000}), ('371903', {'start_time': 1687960690000, 'end_time': 1687960775000}), ('371902', {'start_time': 1687960690000, 'end_time': 1687960775000}), ('371905', {'start_time': 1687960690000, 'end_time': 1687960775000}), ('371904', {'start_time': 1687960690000, 'end_time': 1687960775000}), ('371906', {'start_time': 1687960695000, 'end_time': 1687960775000}), ('371907', {'start_time': 1687960755000, 'end_time': 1687964360000}), ('371908', {'start_time': 1687960765000, 'end_time': 1687964360000}), ('371909', {'start_time': 1687960770000, 'end_time': 1687964390000}), ('371910', {'start_time': 1687960775000, 'end_time': 1687964390000}), ('371911', {'start_time': 1687960780000, 'end_time': 1687964390000}), ('371912', {'start_time': 1687960785000, 'end_time': 1687964390000}), ('371913', {'start_time': 1687960790000, 'end_time': 1687964390000}), ('371924', {'start_time': 1687964500000, 'end_time': 1687964500000})]


Correlation

In [4]:
import pandas as pd
data = {    
    '371912': {'start_time': 1687960785000, 'end_time': 1687964390000},
    '371913': {'start_time': 1687960790000, 'end_time': 1687964390000},
    '371911': {'start_time': 1687960780000, 'end_time': 1687964390000},
    '371906': {'start_time': 1687960695000, 'end_time': 1687960775000},
    '371924': {'start_time': 1687964500000, 'end_time': 1687964500000},
    '371910': {'start_time': 1687960775000, 'end_time': 1687964390000},
    '371903': {'start_time': 1687960690000, 'end_time': 1687960775000},
    '371902': {'start_time': 1687960690000, 'end_time': 1687960775000},
    '371900': {'start_time': 1687960640000, 'end_time': 1687960640000},
    '371908': {'start_time': 1687960765000, 'end_time': 1687964360000},
    '371905': {'start_time': 1687960690000, 'end_time': 1687960775000},
    '371907': {'start_time': 1687960755000, 'end_time': 1687964360000},
    '371904': {'start_time': 1687960690000, 'end_time': 1687960775000},
    '371909': {'start_time': 1687960770000, 'end_time': 1687964390000}
}

# Convert to DataFrame
df = pd.DataFrame.from_dict(data, orient='index')

# Compute the correlation matrix
correlation_matrix = df.corr()

print(correlation_matrix)


            start_time  end_time
start_time    1.000000  0.299907
end_time      0.299907  1.000000
