# convert plan to nextflow

In [24]:
from pathlib import Path
from metasmith.solver import WorkflowSolver
from metasmith.models.libraries import DataInstanceLibrary, DataTypeLibrary, TransformInstanceLibrary

from local.constants import WORKSPACE_ROOT

In [25]:
lib = DataTypeLibrary.Load(WORKSPACE_ROOT/"main/local_mock/prototypes/metagenomics.yml")
trlib = TransformInstanceLibrary.Load([
    Path("./transforms/simple_1"),
    # Path("./transforms/dupe_test"),
])
ilib_path = Path("./cache/test.yml")
ilib = DataInstanceLibrary.Load(ilib_path)

In [26]:
solver = WorkflowSolver(trlib)
plan = solver.Solve(
    [
        ilib["contigs"],
        ilib["diamond_reference.uniprot_sprot"],
    ],
    [
        lib.types["orf_annotations"],
    ]
)

In [44]:
WS = Path("./cache/ws1/nextflow")
WS.mkdir(exist_ok=True)
TAB = " "*4

wf_path = WS/"workflow.nf"
process_definitions = []
workflow_definition = []
target_endpoints = {e for x, e in plan.targets}
for step in plan.steps:
    name = f"{step.transform.source.stem}__{step.key}"
    src = [f"process {name}"+" {"]

    to_pubish = [x for x, e in step.produces if e in target_endpoints]
    for x in to_pubish:
        src.append(TAB+f'publishDir "$params.output", mode: "copy", pattern: "{x.source}"')
    if len(to_pubish)>0:
        src.append("") # newline
    
    src += [
        TAB+"input:",
    ] + [
        TAB+TAB+f'path _{e.key}' for x, e in step.uses
    ] + [
        "",
        TAB+"output:",
    ] + [
        TAB+TAB+f'path "{x.source}"' for x, e in step.produces
    ] + [
        "",
        TAB+'script:',
        TAB+'"""',
    ] + [
        TAB+'echo "${_'+e.key+'},'+f'{x.type.name}" >>inputs.txt' for x, e in step.uses
    ] + [
        TAB+f'echo "{x.source},{x.type.name}" >>outputs.txt' for x, e in step.produces
    ] + [
        TAB+f'echo "{step.transform.source}" >>transform.txt',
        TAB+f'bash {Path("./entry.sh").resolve()}',
        TAB+'"""',
        "}"
    ]

    output_vars = [f"_{e.key}" for x, e in step.produces]
    output_vars = ', '.join(output_vars)
    if len(step.produces) > 1:
        output_vars = f"({output_vars})"
    input_vars = [f"_{e.key}" for x, e in step.uses]
    input_vars = ', '.join(input_vars)
    process_definitions.append("\n".join(src))
    workflow_definition.append(TAB+f'{output_vars} = {name}({input_vars})')

workflow_definition = [
    "workflow {"
    ] + [
        TAB+f'_{e.key}'+' = Channel.fromPath(params.given_'+f'{e.key}'+')' for x, e in plan.given
    ] + [
        "",
    ] + workflow_definition + [
        "}",
    ]

wf_contents = [
    "\n\n".join(process_definitions),
    "\n\n",
    "\n".join(workflow_definition),
    "\n",
]
wf_contents = ''.join(wf_contents)
with open(wf_path, "w") as f:
    f.write(wf_contents)

print(wf_contents)

process pprodigal__VMBv {
    container "docker://quay.io/hallamlab/external_pprodigal:1.0.1"

    input:
        path _AlhL

    output:
        path "orfs.faa"
        path "orfs.gbk"

    script:
    """
    echo "${_AlhL},contigs" >>inputs.txt
    echo "orfs.faa,orfs_faa" >>outputs.txt
    echo "orfs.gbk,orfs_gbk" >>outputs.txt
    echo "/home/tony/workspace/tools/Metasmith/main/local_mock/transforms/simple_1/pprodigal.py" >>transform.txt
    bash /home/tony/workspace/tools/Metasmith/main/local_mock/entry.sh
    """
}

process diamond__5leR {
    container "docker://bschiffthaler/diamond:2.0.14"
    publishDir "$params.output", mode: "copy", pattern: "annotations.csv"

    input:
        path _4yRt
        path _KEYf

    output:
        path "annotations.csv"

    script:
    """
    echo "${_4yRt},diamond_protein_reference" >>inputs.txt
    echo "${_KEYf},orfs_faa" >>inputs.txt
    echo "annotations.csv,orf_annotations" >>outputs.txt
    echo "/home/tony/workspace/tools/Metasmith

In [46]:
import os

# nextflow -C ../config.nf -log ./.nextflow_logs/log \
#     run ../test.2.nf \
#     --given_OVtA given/2beaver_fosmid_seqs.fna given/contigs.fna \
#     --given_bAYL given/swissprot_fastal_ref \
#     --account asdf

param_given = [f"--given_{e.key} {x.source}" for x, e in plan.given]
param_given = ' '.join(param_given)
os.system(f"""\
cd ./cache/ws1/nextflow
nextflow -C ../../../config/nxf_local.nf \
    -log {(WS/"logs").resolve()}/log \
    run {wf_path.resolve()} \
    -work-dir {(WS/"work").resolve()} \
    {param_given}
""")

[33mNextflow 24.10.4 is available - Please consider updating your version to it[m



 N E X T F L O W   ~  version 24.10.2

Launching `/home/tony/workspace/tools/Metasmith/main/local_mock/cache/ws1/nextflow/workflow.nf` [sick_booth] DSL2 - revision: 7bd0186e73

Plus 2 more processes waiting for tasks…

executor >  local (1)
[42/66c413] pprod | 0 of 1
Plus 1 more processes waiting for tasks…

executor >  local (1)
[42/66c413] pprod | 0 of 1
Plus 1 more processes waiting for tasks…

executor >  local (1)
[42/66c413] pprod | 0 of 1
Plus 1 more processes waiting for tasks…
ERROR ~ Error executing process > 'pprodigal__VMBv (1)'

Caused by:
  Missing output file(s) `orfs.faa` expected by process `pprodigal__VMBv (1)`


Command executed:

  echo "example.fna,contigs" >>inputs.txt
  echo "orfs.faa,orfs_faa" >>outputs.txt
  echo "orfs.gbk,orfs_gbk" >>outputs.txt
  echo "/home/tony/workspace/tools/Metasmith/main/local_mock/transforms/simple_1/pprodigal.py" >>transform.txt
  bash /home/tony/workspace/tools/Metasmith/main/local_mock/entry.sh

Command exit status:
  0

Command ou

256

In [32]:
step.produces

[(DataInstance(source=PosixPath('annotations.csv'), type=DataType(name='orf_annotations', properties={'format': 'CSV', 'data': 'Protein features'}, library=DataTypeLibrary(path=PosixPath('/home/tony/workspace/tools/Metasmith/main/local_mock/prototypes/metagenomics.yml'), schema=0.3, ontology={'doi': 'https://doi.org/10.1093/bioinformatics/btt113', 'name': 'EDAM', 'version': 1.25, 'strict': False}, types={'contigs': DataType(name='contigs', properties={'format': 'FASTA', 'data': 'DNA sequence'}, library=...), 'orfs_gbk': DataType(name='orfs_gbk', properties={'format': 'GenBank', 'data': 'Protein features'}, library=...), 'orfs_faa': DataType(name='orfs_faa', properties={'format': 'FASTA', 'data': 'Amino acid sequence'}, library=...), 'diamond_protein_reference': DataType(name='diamond_protein_reference', properties={'format': 'Binary format', 'data': 'database reference', 'reference_type': 'diamond db'}, library=...), 'fastal_protein_reference': DataType(name='fastal_protein_reference',

In [33]:
plan.produces

AttributeError: 'WorkflowPlan' object has no attribute 'produces'