### General Imports

In [1]:
import sys
sys.path.append("../")
import os
from app_decomposer.connector import Connector, APIConnector, ApiModel
from app_decomposer.config_parser import Configuration
import unittest
import numpy as np
import pandas as pd
from unittest.mock import MagicMock
from app_decomposer.central_job import WorkflowSynthesizer, CentralJob, WorkflowSearcher,display_features, display_timeseries

# The configuration is resolved relative to the parent of the current working
# directory (presumably the repository root when the notebook is launched from
# its own sub-folder -- TODO confirm).
_REPO_ROOT = os.path.dirname(os.getcwd())
CONFIG_FILE = os.path.join(
    _REPO_ROOT,
    "tests_integration",
    "test_data",
    "test_docker_config.yaml",
)

### IOI Connection

In [2]:
# Parse the docker test configuration and obtain a token via get_kc_token_docker()
config = Configuration(path=CONFIG_FILE)
keycloak_token = config.get_kc_token_docker()

# Build the API connector; the token is sent as a Bearer credential
connector = APIConnector(
    api_uri="http://localhost/pybackend/",
    api_token=f"Bearer {keycloak_token}",
)

# Echo the raw configuration file so the settings in use appear in the output
with open(CONFIG_FILE, 'r') as config_stream:
    print(config_stream.read())

# Last expression of the cell: the result of the check is shown as cell output
connector.check_connection()

api:
  uri: http://localhost
  port: 80

keycloak:
    uri: http://localhost
    port: 8080
    realm: bird
    client: io-instrumentation


False

### Search workflows by name

In [3]:
# Search the workflows registered under a given name.
# Other names listed for this query: CryoEM, Cryo_EM, LQCD_WFM_ABC,
# ecmwf-kronos, ecmwf-large-workflow
searcher = WorkflowSearcher(connector)
df = searcher.search_workflows("LQCD_WFM_ABC")

# Print the shape of the result, then preview its first rows as the cell output
print(df.shape)
df.head()

[32m2023-07-21 10:03:04.654[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending POST request to endpoint /ioi/workflows/[0m


(30, 10)


Unnamed: 0,id,instanceNumber,name,username,version,jobs,workflowStartTime,workflowEndTime,workflowDuration,accumulatedJobDuration
0,647f1a39cc9340246f200df6,wfm-bm-2023-06-06-133501-2023-06-06_13:35:16,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366130}, {'jobid': 366132}, {'jobid...",1686051385,1686053504,2119,1275
1,647f38abcc9340246f20ca5d,wfm-bm-2023-06-06-154357-2023-06-06_15:44:13,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366197}, {'jobid': 366238}, {'jobid...",1686059175,1686063537,0,1195
2,647f501dcc9340246f216efe,wfm-bm-2023-06-06-172429-2023-06-06_17:24:45,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366351}, {'jobid': 366352}, {'jobid...",1686065180,1686067460,2280,1750
3,648038bbcc9340246f23f837,wfm-bm-2023-06-07-095536-2023-06-07_09:55:52,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366391}, {'jobid': 366392}, {'jobid...",1686124730,1686128160,3430,1260
4,648047d1cc9340246f24a56e,wfm-bm-2023-06-07-110208-2023-06-07_11:02:23,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366802}, {'jobid': 366812}, {'jobid...",1686128590,1686136178,7588,1250


### Extract workflows timeseries 

In [4]:
# Fetch the data of every workflow returned by the search, one call per id
workflows = list(map(searcher.extract_workflow_data, df['id']))

# Convert each workflow's data into the format expected by CentralJob:
# a dict keyed by workflow id, each value holding the 'bytesRead' and
# 'bytesWritten' series as numpy arrays.
central_jobs = {}
for workflow in workflows:
    # Entries of `workflows` may be None; report them and move on.
    if workflow is None:
        print("Workflow data is None, skipping...")
        continue

    for workflow_id, workflow_data in workflow.items():
        try:
            central_jobs[workflow_id] = {
                'bytesRead': np.array(workflow_data['bytesRead']),
                'bytesWritten': np.array(workflow_data['bytesWritten']),
            }
        except Exception as err:
            # Best effort: one unusable workflow must not abort the whole cell.
            # Unlike a bare `except:`, this lets KeyboardInterrupt / SystemExit
            # propagate and reports why the entry was rejected.
            print(f"Cannot process {workflow_id} data: {err!r}")


[32m2023-07-21 10:03:04.960[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending GET request to endpoint /ioi/series/workflow/647f1a39cc9340246f200df6[0m
[32m2023-07-21 10:03:05.373[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending GET request to endpoint /ioi/series/workflow/647f38abcc9340246f20ca5d[0m
[32m2023-07-21 10:03:05.679[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending GET request to endpoint /ioi/series/workflow/647f501dcc9340246f216efe[0m
[32m2023-07-21 10:03:06.077[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending GET request to endpoint /ioi/series/workflow/648038bbcc9340246f23f837[0m
[32m2023-07-21 10:03:06.426[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[3

### Central Job Detection

In [5]:
# Feed all collected workflows to CentralJob and ask it for the central one
central_job = CentralJob(central_jobs)
central_job_id = central_job.find_central_job()
print('Central job ID:', central_job_id)

# Look up the data of the workflow that carries this id
central_workflow_data = central_jobs[central_job_id]
print('Central workflow ID:', central_job_id)



[32m2023-07-21 10:03:15.143[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36m__init__[0m:[36m149[0m - [1mInitializing CentralJob instance[0m
[32m2023-07-21 10:03:15.145[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mfind_central_job[0m:[36m228[0m - [1mFinding central job[0m
[32m2023-07-21 10:03:15.146[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mprocess[0m:[36m194[0m - [1mProcessing jobs data[0m
[32m2023-07-21 10:03:15.147[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mfft_features[0m:[36m169[0m - [1mCalculating FFT features[0m
[32m2023-07-21 10:03:15.147[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mfft_features[0m:[36m169[0m - [1mCalculating FFT features[0m
[32m2023-07-21 10:03:15.149[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mfft_features[0m:[36m169[0m - [1mCalculating FFT features[0m
[32m2023-07-21 10:03:15.150[0m | [1mINFO   

Central job ID: 6486ebc0cc9340246f3cc477
Central workflow ID: 6486ebc0cc9340246f3cc477


### A look at the features matrix

In [6]:
# NOTE(review): `_features` is read straight from the CentralJob instance; the
# leading underscore suggests it is private -- confirm there is no public accessor.
features = central_job._features

# Preview the first rows of the features matrix as the cell output
features.head()

Unnamed: 0_level_0,bytesRead_min,bytesRead_max,bytesRead_mean,bytesRead_dft_0,bytesRead_dft_1,bytesRead_dft_2,bytesRead_dft_3,bytesRead_dft_4,bytesRead_dft_5,bytesRead_dft_6,...,bytesWritten_dft_10,bytesWritten_dft_11,bytesWritten_dft_12,bytesWritten_dft_13,bytesWritten_dft_14,bytesWritten_dft_15,bytesWritten_dft_16,bytesWritten_dft_17,bytesWritten_dft_18,bytesWritten_dft_19
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
647f1a39cc9340246f200df6,0.0,0.77816,0.545499,0.057782,0.066183,0.102511,0.217935,0.318474,0.382053,0.156776,...,3.5e-05,2.5e-05,1.6e-05,1.8e-05,1.3e-05,2e-06,5e-06,1.3e-05,4.5e-05,1.7e-05
647f38abcc9340246f20ca5d,0.0,0.424231,1.0,1.0,1.0,1.0,0.964053,0.88272,1.0,0.599502,...,7.1e-05,9.5e-05,6.8e-05,0.000126,0.000139,9.7e-05,9.8e-05,0.000157,0.000451,0.000149
647f501dcc9340246f216efe,0.0,0.422646,0.582481,1e-06,1e-06,2e-06,4e-06,5e-06,6e-06,2e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
648038bbcc9340246f23f837,0.0,0.41848,0.466313,0.049407,0.055686,0.082871,0.166795,0.225912,0.242163,0.084626,...,0.97807,0.640047,0.318834,0.48227,1.0,0.567671,0.387993,0.395185,1.0,0.478847
648047d1cc9340246f24a56e,0.0,0.418978,0.461796,0.012513,0.014122,0.021093,0.042687,0.058305,0.063259,0.022432,...,0.084828,0.081266,0.076323,0.07419,0.079676,0.139196,0.143649,0.149033,0.041714,0.155221


### Display workflows in reduced space 2D/3D

In [7]:
# Plot the features matrix, requesting a PCA reduction to 3 components;
# the central job id is handed to the plotting helper as well.
fig = display_features(
    central_job._features,
    dim_reduction="PCA",
    central_job_id=central_job_id,
    n_components=3,
)
fig.show()


### Display workflows timeseries

In [8]:
# Build the time-series figure from the extracted workflows and render it
fig2 = display_timeseries(workflows)
fig2.show()