### General Imports

In [18]:
import sys
sys.path.append("../")
import os
import unittest
from shutil import rmtree
import urllib3
from loguru import logger
from pydantic import BaseModel
from typing import List
from app_decomposer.connector import Connector, APIConnector, ApiModel
from app_decomposer.config_parser import Configuration
import unittest
import numpy as np
import pandas as pd
from unittest.mock import MagicMock
from app_decomposer.central_job import WorkflowSynthesizer, CentralJob, WorkflowSearcher,display_features, display_timeseries

# Path components of the docker test configuration, relative to the parent
# of the current working directory.
_CONFIG_RELATIVE_PARTS = ("tests_integration", "test_data", "test_docker_config.yaml")

# Full path to the YAML configuration read by the cells below.
CONFIG_FILE = os.path.join(os.path.dirname(os.getcwd()), *_CONFIG_RELATIVE_PARTS)

### IOI Connection

In [19]:
# Load the test configuration and fetch the token used to authenticate API calls.
config = Configuration(path=CONFIG_FILE)
keycloak_token = config.get_kc_token_docker()

# Build the API connector; the token is passed as a Bearer credential.
bearer_credential = f"Bearer {keycloak_token}"
connector = APIConnector(api_uri="http://localhost/pybackend/",
                         api_token=bearer_credential)

# Echo the configuration file so the settings in use are visible in the output.
with open(CONFIG_FILE, 'r') as config_stream:
    config_text = config_stream.read()
print(config_text)

# Last expression of the cell: its return value is shown as the cell output.
connector.check_connection()

api:
  uri: http://localhost
  port: 80

keycloak:
    uri: http://localhost
    port: 8080
    realm: bird
    client: io-instrumentation


False

### Search workflows by name

In [20]:
# Workflow name to search for. Other names that were tried here:
# CryoEM, Cryo_EM, ecmwf-kronos, ecmwf-large-workflow
workflow_name = "LQCD_WFM_ABC"

searcher = WorkflowSearcher(connector)
df = searcher.search_workflows(workflow_name)

# Report the size of the result, then preview its first rows.
print(df.shape)
df.head()

[32m2023-07-19 12:21:27.517[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending POST request to endpoint /ioi/workflows/[0m


(30, 10)


Unnamed: 0,id,instanceNumber,name,username,version,jobs,workflowStartTime,workflowEndTime,workflowDuration,accumulatedJobDuration
0,647f1a39cc9340246f200df6,wfm-bm-2023-06-06-133501-2023-06-06_13:35:16,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366130}, {'jobid': 366132}, {'jobid...",1686051385,1686053504,2119,1275
1,647f38abcc9340246f20ca5d,wfm-bm-2023-06-06-154357-2023-06-06_15:44:13,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366197}, {'jobid': 366238}, {'jobid...",1686059175,1686063537,0,1195
2,647f501dcc9340246f216efe,wfm-bm-2023-06-06-172429-2023-06-06_17:24:45,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366351}, {'jobid': 366352}, {'jobid...",1686065180,1686067460,2280,1750
3,648038bbcc9340246f23f837,wfm-bm-2023-06-07-095536-2023-06-07_09:55:52,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366391}, {'jobid': 366392}, {'jobid...",1686124730,1686128160,3430,1260
4,648047d1cc9340246f24a56e,wfm-bm-2023-06-07-110208-2023-06-07_11:02:23,LQCD_WFM_ABC,gregory1,4.0.5-Bull.9,"[{'jobid': 366802}, {'jobid': 366812}, {'jobid...",1686128590,1686136178,7588,1250


### Extract workflows timeseries 

In [21]:
# Fetch the data of every workflow returned by the search (one call per id).
workflows = [searcher.extract_workflow_data(workflow_id) for workflow_id in df['id']]

# Convert each workflow's data into the format expected by CentralJob:
#   {workflow_id: {'bytesRead': ndarray, 'bytesWritten': ndarray}}
central_jobs = {}
for workflow in workflows:
    # Some extractions yield None; report them and move on.
    if workflow is None:
        print("Workflow data is None, skipping...")
        continue

    for workflow_id, workflow_data in workflow.items():
        try:
            central_jobs[workflow_id] = {
                'bytesRead': np.array(workflow_data['bytesRead']),
                'bytesWritten': np.array(workflow_data['bytesWritten']),
            }
        except Exception as err:
            # Stay best-effort (one bad workflow must not abort the loop), but
            # do not use a bare `except:`: that would also swallow
            # KeyboardInterrupt/SystemExit. Report the cause of the failure.
            print(f"Cannot process {workflow_id} data: {err!r}")


[32m2023-07-19 12:21:27.827[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending GET request to endpoint /ioi/series/workflow/647f1a39cc9340246f200df6[0m
[32m2023-07-19 12:21:28.233[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending GET request to endpoint /ioi/series/workflow/647f38abcc9340246f20ca5d[0m
[32m2023-07-19 12:21:28.663[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending GET request to endpoint /ioi/series/workflow/647f501dcc9340246f216efe[0m
[32m2023-07-19 12:21:29.045[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[36mrequest_delegator[0m:[36m167[0m - [34m[1mSending GET request to endpoint /ioi/series/workflow/648038bbcc9340246f23f837[0m
[32m2023-07-19 12:21:29.398[0m | [34m[1mDEBUG   [0m | [36mapp_decomposer.connector[0m:[3

### Central Job Detection

In [22]:
# Build a CentralJob over all collected workflows and ask it for the central one.
central_job = CentralJob(central_jobs)
central_job_id = central_job.find_central_job()

# Report the id that was selected.
print('Central job ID:', central_job_id)

# Look up the series of the workflow carrying that id (same id as above,
# since `central_jobs` is keyed by workflow id).
central_workflow_data = central_jobs[central_job_id]
print('Central workflow ID:', central_job_id)



[32m2023-07-19 12:21:38.486[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36m__init__[0m:[36m149[0m - [1mInitializing CentralJob instance[0m
[32m2023-07-19 12:21:38.487[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mfind_central_job[0m:[36m228[0m - [1mFinding central job[0m
[32m2023-07-19 12:21:38.488[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mprocess[0m:[36m194[0m - [1mProcessing jobs data[0m
[32m2023-07-19 12:21:38.489[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mfft_features[0m:[36m169[0m - [1mCalculating FFT features[0m
[32m2023-07-19 12:21:38.490[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mfft_features[0m:[36m169[0m - [1mCalculating FFT features[0m
[32m2023-07-19 12:21:38.491[0m | [1mINFO    [0m | [36mapp_decomposer.central_job[0m:[36mfft_features[0m:[36m169[0m - [1mCalculating FFT features[0m
[32m2023-07-19 12:21:38.492[0m | [1mINFO   

Central job ID: 64874581cc9340246f41d21b
Central workflow ID: 64874581cc9340246f41d21b


### A look at the features matrix

In [23]:
# NOTE(review): reads the private `_features` attribute of CentralJob directly.
features = central_job._features

# Preview the first rows; use `list(features.columns)` to list every column.
features.head(5)

Unnamed: 0_level_0,bytesRead_min,bytesRead_max,bytesRead_mean,bytesRead_dft_0,bytesRead_dft_1,bytesRead_dft_2,bytesRead_dft_3,bytesRead_dft_4,bytesRead_dft_5,bytesRead_dft_6,...,bytesWritten_dft_10,bytesWritten_dft_11,bytesWritten_dft_12,bytesWritten_dft_13,bytesWritten_dft_14,bytesWritten_dft_15,bytesWritten_dft_16,bytesWritten_dft_17,bytesWritten_dft_18,bytesWritten_dft_19
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
647f1a39cc9340246f200df6,0.0,0.462977,0.910377,0.910144,0.903566,0.882827,0.84611,0.793205,0.726588,0.650332,...,2.2e-05,1e-05,7e-06,1.3e-05,7e-06,2e-06,6e-06,1.1e-05,1e-05,8e-06
647f38abcc9340246f20ca5d,0.0,0.686183,0.627272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
647f501dcc9340246f216efe,0.0,0.487178,0.813649,1.3e-05,1.3e-05,1.3e-05,1.4e-05,1.4e-05,1.4e-05,1.4e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
648038bbcc9340246f23f837,0.0,0.686079,0.842795,0.842795,0.835659,0.812736,0.770878,0.708241,0.625913,0.526795,...,0.697547,1.0,0.794418,0.769837,0.823757,0.942224,0.630817,0.900335,0.254552,0.893483
648047d1cc9340246f24a56e,0.0,1.0,0.746317,0.190861,0.189343,0.184418,0.175299,0.161456,0.143017,0.120542,...,0.233519,0.124384,0.097139,0.242939,0.138318,0.016631,0.171099,0.366881,0.32552,0.270567


### Display workflows in a reduced 2D/3D space

In [24]:
# Options for the projection: reduce with t-SNE to two components and pass
# along the id of the central job.
projection_options = {
    "dim_reduction": "t-SNE",
    "central_job_id": central_job_id,
    "n_components": 2,
}

fig = display_features(central_job._features, **projection_options)
fig.show()


### Display workflows timeseries

In [25]:
# Plot the series of all extracted workflows.
# NOTE(review): `workflows` may still contain None entries (the conversion loop
# above skips them explicitly) — confirm display_timeseries tolerates them.
fig2 = display_timeseries(workflows)
# Render the returned figure in the notebook output.
fig2.show()