Skip to content

Commit

Permalink
[LAB-470] 442 tool author attribution to iojson (#599)
Browse files (browse the repository at this point in the history)
Co-authored-by: Aakaash Meduri <aakaash.meduri@gmail.com>
  • Loading branch information
hevans66 and acashmoney committed Aug 18, 2023
1 parent 4604cce commit a6fc994
Show file tree
Hide file tree
Showing 20 changed files with 180 additions and 154 deletions.
76 changes: 40 additions & 36 deletions internal/ipwl/testdata/example_tool.json
Original file line number Diff line number Diff line change
@@ -1,39 +1,43 @@
{
"class": "CommandLineTool",
"name": "equibind",
"description": "Docking of small molecules to a protein",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"python main.py --protein $(inputs.protein.filepath) --small_molecule_library $(inputs.small_molecule.filepath);",
"mv /outputs/ligands_predicted.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
"cp $(inputs.protein.filepath) /outputs/;",
"rmdir /outputs/dummy;"
],
"dockerPull": "ghcr.io/labdao/equibind@sha256:ae2cec63b3924774727ed1c6c8af95cf4aaea2d3f0c5acbec56478505ccb2b07",
"gpuBool": false,
"networkBool": false,
"inputs": {
"protein": {
"type": "File",
"item": "",
"glob": ["*.pdb"]
},
"small_molecule": {
"type": "File",
"item": "",
"glob": ["*.sdf", "*.mol2"]
}
"class": "CommandLineTool",
"name": "equibind",
"description": "Docking of small molecules to a protein",
"author": "@misc{stärk2022equibind,\n title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n year={2022},\n eprint={2202.05146},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"mkdir -p /tmp-inputs/tmp;",
"mkdir -p /tmp-outputs/tmp;",
"cp /inputs/* /tmp-inputs/tmp/;",
"ls /tmp-inputs/tmp;",
"cd /src && python /src/inference.py --config=/src/configs_clean/bacalhau.yml;",
"mv /tmp-outputs/tmp/* /outputs/;",
"mv /outputs/lig_equibind_corrected.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
"mv /tmp-inputs/tmp/*.pdb /outputs/;"],
"dockerPull": "ghcr.io/labdao/equibind:main@sha256:21a381d9ab1ff047565685044569c8536a55e489c9531326498b28d6b3cc244f",
"gpuBool": false,
"networkBool": false,
"inputs": {
"protein": {
"type": "File",
"item": "",
"glob": ["*.pdb"]
},
"outputs": {
"best_docked_small_molecule": {
"type": "File",
"item": "",
"glob": ["*_docked.sdf"]
},
"protein": {
"type": "File",
"item": "",
"glob": ["*.pdb"]
}
"small_molecule": {
"type": "File",
"item": "",
"glob": ["*.sdf", "*.mol2"]
}
}
},
"outputs": {
"best_docked_small_molecule": {
"type": "File",
"item": "",
"glob": ["*_docked.sdf", "*_docked.mol2"]
},
"protein": {
"type": "File",
"item": "",
"glob": ["*.pdb"]
}
}
}
1 change: 1 addition & 0 deletions internal/ipwl/tool.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type ToolOutput struct {
type Tool struct {
Name string `json:"name"`
Description string `json:"description"`
Author string `json:"author"`
BaseCommand []string `json:"baseCommand"`
Arguments []string `json:"arguments"`
DockerPull string `json:"dockerPull"`
Expand Down
17 changes: 11 additions & 6 deletions internal/ipwl/tool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,20 @@ func TestReadToolConfig(t *testing.T) {
filePath := "testdata/example_tool.json"
expected := Tool{
Name: "equibind",
Author: "@misc{stärk2022equibind,\n title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n year={2022},\n eprint={2202.05146},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",
Description: "Docking of small molecules to a protein",
BaseCommand: []string{"/bin/bash", "-c"},
Arguments: []string{
"python main.py --protein $(inputs.protein.filepath) --small_molecule_library $(inputs.small_molecule.filepath);",
"mv /outputs/ligands_predicted.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
"cp $(inputs.protein.filepath) /outputs/;",
"rmdir /outputs/dummy;",
"mkdir -p /tmp-inputs/tmp;",
"mkdir -p /tmp-outputs/tmp;",
"cp /inputs/* /tmp-inputs/tmp/;",
"ls /tmp-inputs/tmp;",
"cd /src && python /src/inference.py --config=/src/configs_clean/bacalhau.yml;",
"mv /tmp-outputs/tmp/* /outputs/;",
"mv /outputs/lig_equibind_corrected.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
"mv /tmp-inputs/tmp/*.pdb /outputs/;",
},
DockerPull: "ghcr.io/labdao/equibind@sha256:ae2cec63b3924774727ed1c6c8af95cf4aaea2d3f0c5acbec56478505ccb2b07",
DockerPull: "ghcr.io/labdao/equibind:main@sha256:21a381d9ab1ff047565685044569c8536a55e489c9531326498b28d6b3cc244f",
GpuBool: false,
Inputs: map[string]ToolInput{
"protein": {
Expand All @@ -32,7 +37,7 @@ func TestReadToolConfig(t *testing.T) {
Outputs: map[string]ToolOutput{
"best_docked_small_molecule": {
Type: "File",
Glob: []string{"*_docked.sdf"},
Glob: []string{"*_docked.sdf", "*_docked.mol2"},
},
"protein": {
Type: "File",
Expand Down
6 changes: 3 additions & 3 deletions python/src/plex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ class ScatteringMethod(Enum):


class CoreTools(Enum):
EQUIBIND = "QmZ2HarAgwZGjc3LBx9mWNwAQkPWiHMignqKup1ckp8NhB"
DIFFDOCK = "QmSzetFkveiQYZ5FgpZdHHfsjMWYz5YzwMAvqUgUFhFPMM"
EQUIBIND = "QmZWYpZXsrbtzvBCHngh4YEgME5djnV5EedyTpc8DrK7k2"
DIFFDOCK = "QmfKhJh48aDHgckzwGEASNmZd1SYstQiR5qLqqYmLQFzq9"
COLABFOLD_MINI = "QmcRH74qfqDBJFku3mEDGxkAf6CSpaHTpdbe1pMkHnbcZD"
COLABFOLD_STANDARD = "QmXnM1VpdGgX5huyU3zTjJovsu42KPfWhjxhZGkyvy9PVk"
COLABFOLD_LARGE = "QmPYqMy19VFFuYztL6b5ruo4Kw4JWT583emStGrSYTH5Yi"
BAM2FASTQ = "QmbPUirWiWCv9sgdHLekf5AnoCdw4QPU2SyfGGKs9JRRbq"
ODDT = "QmUx7NdxkXXZvbK1JXZVUYUBqsevWkbVxgTzpWJ4Xp4inf"
RFDIFFUSION = "QmXnCBCtoYuPyGsEJVpjn5regHfFSYa8kx44e22XxDX2t2"
RFDIFFUSION = "QmTyFGjt2oqTLGQRE5u8mtfiQNft5nzMsieYdvwnpfk3HJ"
REPEATMODELER = "QmZdXxnUt1sFFR39CfkEUgiioUBf6qP5CUs8TCb7Wqn4MC"
GNINA = "QmZiQWEXj3aMRnJLoU39HHcknMDfKQD2txpfk6ubJAdDRx"
BATCH_DLKCAT = "QmQTjvP2utNb1JTtUHeQ8mQPvNkCTg5VRc4LVdptWkUcJ7"
Expand Down
1 change: 1 addition & 0 deletions tools/bam2fastq.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "bam2fastq",
"description": "Sort BAM by qname and Extract Fasta reads R1 R2 with RG using samtools",
"author": "",
"inputs": {
"genome": {
"type": "File",
Expand Down
1 change: 1 addition & 0 deletions tools/blender/blender.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "blender",
"description": "let's create some fancy protein graphics",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"blender --background --python app.py -- $(inputs.protein.filepath) /outputs/protein.png"
Expand Down
1 change: 1 addition & 0 deletions tools/colabfold-large.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "colabfold-large",
"description": "Protein folding prediction using Colabfold (large settings)",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"colabfold_batch --templates --num-recycle $(inputs.recycle.default) --use-gpu-relax --amber /inputs /outputs;"
Expand Down
1 change: 1 addition & 0 deletions tools/colabfold-mini.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "colabfold-mini",
"description": "Protein folding prediction using Colabfold (mini settings)",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"colabfold_batch --templates --max-msa 32:64 --num-recycle $(inputs.recycle.default) /inputs /outputs;"
Expand Down
1 change: 1 addition & 0 deletions tools/colabfold-standard.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "colabfold-standard",
"description": "Protein folding prediction using Colabfold (standard settings)",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"colabfold_batch --templates --num-recycle $(inputs.recycle.default) /inputs /outputs;"
Expand Down
121 changes: 61 additions & 60 deletions tools/diffdock.json
Original file line number Diff line number Diff line change
@@ -1,63 +1,64 @@
{
"class": "CommandLineTool",
"name": "diffdock",
"description": "Docking of small molecules to a protein",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"python datasets/esm_embedding_preparation.py --protein_path $(inputs.protein.filepath) --out_file /outputs/prepared_for_esm.fasta;",
"HOME=esm/model_weights python esm/scripts/extract.py esm2_t33_650M_UR50D /outputs/prepared_for_esm.fasta /outputs/esm2_output --repr_layers $(inputs.repr_layers.default) --include per_tok && cp -r /outputs/esm2_output data/esm2_output;",
"python -m inference --protein_path $(inputs.protein.filepath) --ligand $(inputs.small_molecule.filepath) --out_dir /outputs --inference_steps $(inputs.inference_steps.default) --samples_per_complex $(inputs.samples_per_complex.default) --batch_size $(inputs.batch_size.default) --actual_steps $(inputs.actual_steps.default) --no_final_step_noise;",
"cp $(inputs.protein.filepath) /outputs"
],
"dockerPull": "ghcr.io/labdao/diffdock:main@sha256:b00432de73478d3da578e4a16ee669178828109f3c7bf9c58d44bb7514f68629",
"gpuBool": true,
"networkBool": true,
"memoryGB": 12,
"inputs": {
"protein": {
"type": "File",
"glob": ["*.pdb"]
},
"small_molecule": {
"type": "File",
"glob": ["*.sdf", "*.mol2"]
},
"repr_layers": {
"type": "int",
"default": "33"
},
"inference_steps": {
"type": "int",
"default": "20"
},
"samples_per_complex": {
"type": "int",
"default": "40"
},
"batch_size": {
"type": "int",
"default": "10"
},
"actual_steps": {
"type": "int",
"default": "18"
}
},
"outputs": {
"best_docked_small_molecule": {
"type": "File",
"item": "",
"glob": ["index*/rank1.sdf"]
},
"all_docked_small_molecules": {
"type": "Array",
"item": "File",
"glob": ["index*/rank*.sdf"]
},
"protein": {
"type": "File",
"item": "",
"glob": ["*.pdb"]
}
"class": "CommandLineTool",
"name": "diffdock",
"description": "Docking of small molecules to a protein",
"author": "@misc{corso2023diffdock,\n title={DiffDock: Diffusion Steps, Twists, and Turns for Molecular Docking},\n author={Gabriele Corso and Hannes Stärk and Bowen Jing and Regina Barzilay and Tommi Jaakkola},\n year={2023},\n eprint={2210.01776},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"python datasets/esm_embedding_preparation.py --protein_path $(inputs.protein.filepath) --out_file /outputs/prepared_for_esm.fasta;",
"HOME=esm/model_weights python esm/scripts/extract.py esm2_t33_650M_UR50D /outputs/prepared_for_esm.fasta /outputs/esm2_output --repr_layers $(inputs.repr_layers.default) --include per_tok && cp -r /outputs/esm2_output data/esm2_output;",
"python -m inference --protein_path $(inputs.protein.filepath) --ligand $(inputs.small_molecule.filepath) --out_dir /outputs --inference_steps $(inputs.inference_steps.default) --samples_per_complex $(inputs.samples_per_complex.default) --batch_size $(inputs.batch_size.default) --actual_steps $(inputs.actual_steps.default) --no_final_step_noise;",
"cp $(inputs.protein.filepath) /outputs"
],
"dockerPull": "ghcr.io/labdao/diffdock:main@sha256:b00432de73478d3da578e4a16ee669178828109f3c7bf9c58d44bb7514f68629",
"gpuBool": true,
"networkBool": true,
"memoryGB": 12,
"inputs": {
"protein": {
"type": "File",
"glob": ["*.pdb"]
},
"small_molecule": {
"type": "File",
"glob": ["*.sdf", "*.mol2"]
},
"repr_layers": {
"type": "int",
"default": "33"
},
"inference_steps": {
"type": "int",
"default": "20"
},
"samples_per_complex": {
"type": "int",
"default": "40"
},
"batch_size": {
"type": "int",
"default": "10"
},
"actual_steps": {
"type": "int",
"default": "18"
}
},
"outputs": {
"best_docked_small_molecule": {
"type": "File",
"item": "",
"glob": ["index*/rank1.sdf"]
},
"all_docked_small_molecules": {
"type": "Array",
"item": "File",
"glob": ["index*/rank*.sdf"]
},
"protein": {
"type": "File",
"item": "",
"glob": ["*.pdb"]
}
}
}
1 change: 1 addition & 0 deletions tools/dlkcat/batch_dlkcat.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "dlkcat",
"description": "batch predict enzyme catalytic activity from a protein sequence and molecule smile",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"conda run -n env python prediction_for_input.py $(inputs.input_tsv.filepath) && mv output.tsv /outputs/"
Expand Down
3 changes: 2 additions & 1 deletion tools/equibind.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "equibind",
"description": "Docking of small molecules to a protein",
"author": "@misc{stärk2022equibind,\n title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n year={2022},\n eprint={2202.05146},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"mkdir -p /tmp-inputs/tmp;",
Expand Down Expand Up @@ -39,4 +40,4 @@
"glob": ["*.pdb"]
}
}
}
}
1 change: 1 addition & 0 deletions tools/fastqc/fastqc.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "Tool",
"name": "fastqc",
"description": "Comprehensive quality control tool for high-throughput sequence data",
"author": "",
"doi": "https://doi.org/10.48550/arXiv.2202.05146",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
Expand Down
1 change: 1 addition & 0 deletions tools/gnina/gnina.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "gnina",
"description": "Protein-ligand docking using Gnina",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"gnina -r $(inputs.protein.filepath) -l $(inputs.small_molecule.filepath) --exhaustiveness $(inputs.exhaustiveness.default) --autobox_ligand $(inputs.protein.filepath) --cnn_scoring $(inputs.cnn_scoring.default) -o /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked_scored.sdf"
Expand Down
1 change: 1 addition & 0 deletions tools/oddt.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "oddt",
"description": "Scoring of protein-ligand complexes using ODDT",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"mkdir -p /tmp-out && oddt_cli $(inputs.small_molecule.filepath) --receptor $(inputs.protein.filepath) --score rfscore_v1 --score rfscore_v2 --score rfscore_v3 --score nnscore -O /tmp-out/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_scored.$(inputs.small_molecule.ext) && cd /tmp-out && /app/aggregate_score.sh && cp /tmp-out/* /outputs"
Expand Down
1 change: 1 addition & 0 deletions tools/openbabel/pdb-to-sdf-openbabel.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "pdb to sdf",
"description": "Convert pdb to sdf using openbabel.",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"obabel $(inputs.pdb_file.filepath) -O /outputs/$(inputs.pdb_file.basename).sdf;"
Expand Down
1 change: 1 addition & 0 deletions tools/openbabel/rmsd-openbabel.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "rmsd",
"description": "calculate the RMSD of a reference small molecule and a docked small molecule using openbabel.",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"echo 'reference,comparison,RMSD' > /outputs/rmsd.csv && echo -n '$(inputs.reference_structure.basename),$(inputs.comparison_structure.basename),' > /outputs/temp.csv && obrms -firstonly $(inputs.reference_structure.filepath) $(inputs.comparison_structure.filepath) | awk '{print $2}' | tr -d '\\n' >> /outputs/temp.csv && cat /outputs/temp.csv >> /outputs/rmsd.csv && rm /outputs/temp.csv;"
Expand Down
1 change: 1 addition & 0 deletions tools/protbert/protbert.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"class": "CommandLineTool",
"name": "protbert",
"description": "Predicting unknown residues with protein language models",
"author": "",
"baseCommand": ["/bin/bash", "-c"],
"arguments": [
"python3 app.py $(inputs.protein_sequence.filepath) /outputs --mode fill-mask;",
Expand Down

0 comments on commit a6fc994

Please sign in to comment.