# Nextflow optimizer notebook

The current objective of this notebook is to:
1. Load the execution trace of a Nextflow workflow.
2. Extract the timing information of the different tasks executed.
3. (Optionally) Visualize the extracted information, similarly to what is done in Nextflow reports.
4. Generate a Nextflow config file that overrides the process time limit with the worst-case execution time observed.

In a future version, it might be useful to maintain a database of process runtimes to better understand how the runtime of a process evolves depending on its parameterization, or depending on the node used to run it.

## 1. Notebook parameters

In [None]:
from pathlib import Path

# Common prefix (directory + run name) shared by the Nextflow report and DAG files.
# Alternative values kept for reference:
#   Path("C:/Users/kdesnos/Desktop/Sandbox/pipelines/CELEBI_20250214","karol_210912_ult_2025-02-21_15_23_50")
#   karol_210912_ult_2025-02-14_12_57_21
nf_report_path = Path("C:/Users/kdesnos/Desktop/Sandbox/pipelines/") / "execution_2025-02-14_10-15-33"

# Destination of the generated Nextflow config file.
output_config_file = Path("C:/Users/kdesnos/Desktop/Sandbox/pipelines/CELEBI_20250214") / "karol_210912_ult.config"

# Paths of the HTML report and of the DAG file: same directory as the prefix,
# with "_report.html" / "_dag.html" appended to its last component.
html_report = nf_report_path.with_name(f"{nf_report_path.name}_report.html")
dag_report = nf_report_path.with_name(f"{nf_report_path.name}_dag.html")

## 2. Load data

### 2.1 Load data from HTML

In [None]:
import extract_trace_from_html as parser

# Extract the process execution traces from the HTML report.
trace_df = parser.extract_trace_data(html_report)

# Every following cell needs trace_df, so report a failed extraction here
# instead of letting it surface later as an unrelated-looking error.
if trace_df is None:
    print(f"No execution trace could be extracted from {html_report}.")
else:
    print(f"Extracted {trace_df.shape[0]} process execution traces.")

### 2.2 Process dataframe

In [None]:
# The 'process' column holds a ':'-separated qualified name. Its last
# component becomes 'process_name'; the components before it, re-joined with
# ':', become 'process_path' (the location of the process within the workflow).
process_parts = trace_df['process'].str.split(':')
trace_df['process_name'] = process_parts.str[-1]
trace_df['process_path'] = process_parts.str[:-1].str.join(':')

## 3. Display useful info

### 3.1 Process execution time box plot

In [None]:
import visualization as visualizer

# Optional filter: when a string is given, the viewer only displays the
# processes containing this string. Use None if no filter is wanted.
name_filter = None

# Box plot of the process execution times, restricted by name_filter.
visualizer.plot_realtime_boxplot(trace_df, name_filter)

### 3.2 Icicle chart of processes

In [None]:
import visualization as visualizer

# Icicle chart of the traced processes.
# NOTE(review): include_names=True presumably adds the process names to the
# chart in addition to their workflow path — confirm in visualization.plot_icicle_chart.
visualizer.plot_icicle_chart(trace_df, include_names=True)

### 3.3 Processing times

In [None]:
# Total of the 'realtime' column over all traced process executions.
# Deliberately not named `sum`: that would shadow the built-in sum() for
# every cell executed afterwards in the notebook.
total_realtime = trace_df['realtime'].sum()

# Same total, with each execution weighted by its 'cpus' value.
sum_cpu = (trace_df['realtime'] * trace_df['cpus']).sum()

print(f'Sum of all process execution time: {total_realtime}')
print(f'Sum of all (process exec time)*(nb cpu): {sum_cpu}')

### 3.4 Average wait time

In [None]:
import visualization as visualizer

# Plot the wait times of the traced processes.
# NOTE(review): how the wait time is derived from the trace columns is
# defined in visualization.plot_wait_times — confirm there.
visualizer.plot_wait_times(trace_df)

## 4. Export Config File

In [None]:
import config_file_generator as generator

# Generate the Nextflow config file at output_config_file from the trace data.
# Per the notebook objectives, it overrides the process time limit with the
# worst-case execution time observed.
generator.generate_nextflow_config(trace_df, output_config_file)

## 5. Load DAG

In [None]:
import re
import networkx as nx
from pathlib import Path

def extract_mermaid_graph(dag_report: Path) -> nx.DiGraph:
    """
    Extracts a Mermaid DAG from an HTML file and returns it as a NetworkX graph.

    Every declared node carries a ``name`` attribute holding its Mermaid label.
    Nodes declared inside a ``subgraph ... end`` section additionally carry a
    ``subgraph`` attribute holding the name of the innermost enclosing
    subgraph. Subgraphs whose title is blank are named ``unnamed_<n>``, and
    nested subgraphs are supported.

    Parameters:
    - dag_report (Path): The path to the HTML file containing the Mermaid DAG.

    Returns:
    - nx.DiGraph: The NetworkX directed graph representing the DAG.

    Raises:
    - ValueError: If no Mermaid graph definition is found in the HTML file.
    """
    # Read the HTML file
    html_content = Path(dag_report).read_text(encoding='utf-8')

    # Regular expression to extract the Mermaid graph definition. Any
    # <pre ... class="mermaid" ...> tag is accepted, whatever its other
    # attributes (e.g. the inline style) are.
    mermaid_pattern = re.compile(r'<pre\b[^>]*\bclass="mermaid"[^>]*>(.*?)</pre>', re.DOTALL)
    mermaid_match = mermaid_pattern.search(html_content)

    if not mermaid_match:
        raise ValueError("Mermaid graph definition not found in the HTML file.")

    mermaid_graph = mermaid_match.group(1).strip()

    # Regular expressions to extract nodes, edges, and subgraphs.
    # node_pattern: an identifier, a run of opening delimiters, the label,
    # then a run of closing delimiters ('|' is a literal inside the classes).
    node_pattern = re.compile(r'(\w+)[\[|\()|\"]+(.*?)[\"|\]|\)]+')
    edge_pattern = re.compile(r'(\w+) --> (\w+)')
    subgraph_pattern = re.compile(r'subgraph\s+(\w+|".*?")', re.DOTALL)

    # Create a directed graph
    G = nx.DiGraph()

    # Extract nodes and their names
    for node, name in node_pattern.findall(mermaid_graph):
        G.add_node(node, name=name)

    # Extract edges
    for source, target in edge_pattern.findall(mermaid_graph):
        G.add_edge(source, target)

    # Extract subgraph membership. A stack of the currently open subgraphs is
    # kept so that the 'end' of a nested subgraph gives control back to the
    # enclosing one instead of leaving subgraph mode altogether.
    open_subgraphs = []
    anonymous_subgraph_count = 0

    for raw_line in mermaid_graph.splitlines():
        line = raw_line.strip()
        if line.startswith('subgraph'):
            subgraph_name_match = subgraph_pattern.match(line)
            # An unparsable header yields None rather than silently reusing
            # the name of a previously seen subgraph.
            subgraph_name = None
            if subgraph_name_match:
                subgraph_name = subgraph_name_match.group(1).strip('"')
                if not subgraph_name.strip():
                    # Blank title: generate a unique placeholder name.
                    subgraph_name = f'unnamed_{anonymous_subgraph_count}'
                    anonymous_subgraph_count += 1
            open_subgraphs.append(subgraph_name)
        elif line == 'end':
            # Ignore a stray 'end' that does not close any open subgraph.
            if open_subgraphs:
                open_subgraphs.pop()
        elif open_subgraphs:
            node_match = node_pattern.match(line)
            if node_match:
                node, name = node_match.groups()
                # add_node on an existing node only updates its attributes.
                G.add_node(node, name=name, subgraph=open_subgraphs[-1])

    return G


# Build the graph from the DAG file of the run.
dag = extract_mermaid_graph(dag_report)

# Textual dump of the graph content.
print(f"{dag.number_of_nodes()} Nodes:", dag.nodes(data=True))
print(f"{dag.number_of_edges()} Edges:", dag.edges())

# Store on each node, under the "layer" key, the index of the topological
# generation it belongs to.
for depth, generation in enumerate(nx.topological_generations(dag)):
    for vertex in generation:
        dag.nodes[vertex]["layer"] = depth

# Lay the nodes out by "layer" and draw the labelled graph.
layered_positions = nx.multipartite_layout(dag, subset_key="layer")
nx.draw(dag, pos=layered_positions, with_labels=True)
