Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@ FastFold provides a **high-performance implementation of Evoformer** with the fo
3. Ease of use
* Huge performance gains with a few lines changes
* You don't need to care about how the parallel part is implemented
4. Faster data processing, about 3x faster than the original way

## Installation

To install and use FastFold, you will need:
+ Python 3.8 or later
+ Python 3.8 or 3.9.
+ [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 11.1 or above
+ PyTorch 1.10 or above


For now, you can install FastFold:
### Using Conda (Recommended)

Expand Down Expand Up @@ -116,6 +118,32 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--hhsearch_binary_path `which hhsearch` \
--kalign_binary_path `which kalign`
```
Alternatively, run the script `./inference.sh`; you can change the parameters in the script, especially the data paths.
```shell
./inference.sh
```

#### Inference with data workflow
AlphaFold's data pre-processing takes a lot of time, so we speed it up with a [ray](https://docs.ray.io/en/latest/workflows/concepts.html) workflow, which achieves about a 3x speedup. To run the inference with the ray workflow, you should install the packages below and add the parameter `--enable_workflow` to the command line or to the shell script `./inference.sh`
```shell
pip install ray==1.13.0 pyarrow
```
```shell
python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--output_dir ./ \
--gpus 2 \
--uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
--pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \
--hhsearch_binary_path `which hhsearch` \
--kalign_binary_path `which kalign` \
--enable_workflow
```


## Performance Benchmark

Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ RUN conda install pytorch==1.10.0 torchvision torchaudio cudatoolkit=11.3 -c pyt
&& conda install hmmer==3.3.2 hhsuite=3.3.0 kalign2=2.04 -c bioconda

RUN pip install biopython==1.79 dm-tree==0.1.6 ml-collections==0.1.0 numpy==1.21.2 \
PyYAML==5.4.1 requests==2.26.0 scipy==1.7.1 tqdm==4.62.2 typing-extensions==3.10.0.2 einops
PyYAML==5.4.1 requests==2.26.0 scipy==1.7.1 tqdm==4.62.2 typing-extensions==3.10.0.2 einops ray==1.13.0 pyarrow

RUN pip install colossalai==0.1.8+torch1.10cu11.3 -f https://release.colossalai.org

Expand Down
1 change: 1 addition & 0 deletions fastfold/workflow/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .workflow_run import batch_run
5 changes: 5 additions & 0 deletions fastfold/workflow/factory/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .task_factory import TaskFactory
from .hhblits import HHBlitsFactory
from .hhsearch import HHSearchFactory
from .jackhmmer import JackHmmerFactory
from .hhfilter import HHfilterFactory
29 changes: 29 additions & 0 deletions fastfold/workflow/factory/hhblits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from ray import workflow
from typing import List
from fastfold.workflow.factory import TaskFactory
from ray.workflow.common import Workflow
import fastfold.data.tools.hhblits as ffHHBlits

class HHBlitsFactory(TaskFactory):
    """Factory producing Ray workflow steps that run an HHblits MSA search."""

    # config entries that must be present before a task can be generated
    keywords = ['binary_path', 'databases', 'n_cpu']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Build a workflow step that queries HHblits on ``fasta_path`` and
        writes the resulting a3m alignment to ``output_path``.

        ``after`` lists upstream workflow steps this step must wait for.
        """
        self.isReady()

        # The runner is created once here and captured by the step closure.
        hhblits_runner = ffHHBlits.HHBlits(
            binary_path=self.config['binary_path'],
            databases=self.config['databases'],
            n_cpu=self.config['n_cpu'],
        )

        @workflow.step
        def hhblits_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            query_result = hhblits_runner.query(fasta_path)
            with open(output_path, "w") as out_file:
                out_file.write(query_result["a3m"])

        return hhblits_step.step(fasta_path, output_path, after)
33 changes: 33 additions & 0 deletions fastfold/workflow/factory/hhfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import subprocess
import logging
from ray import workflow
from typing import List
from fastfold.workflow.factory import TaskFactory
from ray.workflow.common import Workflow

class HHfilterFactory(TaskFactory):
    """Factory producing Ray workflow steps that filter an MSA with hhfilter."""

    # only the binary path is mandatory; 'id' and 'cov' thresholds are optional
    keywords = ['binary_path']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Build a workflow step that runs hhfilter on ``fasta_path`` and
        writes the filtered alignment to ``output_path``.

        ``after`` lists upstream workflow steps this step must wait for.
        """
        self.isReady()

        @workflow.step
        def hhfilter_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            # Assemble the hhfilter command line from the factory config.
            command = [self.config.get('binary_path')]
            # Optional sequence-identity / coverage thresholds.
            if 'id' in self.config:
                command += ['-id', str(self.config.get('id'))]
            if 'cov' in self.config:
                command += ['-cov', str(self.config.get('cov'))]
            command += ['-i', fasta_path, '-o', output_path]

            logging.info(f"HHfilter start: {' '.join(command)}")
            # NOTE(review): the return code is not checked; a failed hhfilter
            # run is currently best-effort — confirm this is intentional.
            subprocess.run(command)

        return hhfilter_step.step(fasta_path, output_path, after)
38 changes: 38 additions & 0 deletions fastfold/workflow/factory/hhsearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from fastfold.workflow.factory import TaskFactory
from ray import workflow
from ray.workflow.common import Workflow
import fastfold.data.tools.hhsearch as ffHHSearch
from typing import List

class HHSearchFactory(TaskFactory):
    """Factory producing Ray workflow steps that run an HHsearch template search."""

    # config entries that must be present before a task can be generated
    keywords = ['binary_path', 'databases', 'n_cpu']

    def gen_task(self, a3m_path: str, output_path: str, after: List[Workflow] = None, atab_path: str = None) -> Workflow:
        """Build a workflow step that runs HHsearch on the a3m at ``a3m_path``.

        The hhr search result is written to ``output_path``. If ``atab_path``
        is given, the atab output is also generated and written there.
        ``after`` lists upstream workflow steps this step must wait for.

        Fix: the inner step already accepted ``atab_path`` and had a branch
        writing the atab file, but ``gen_task`` never forwarded it, making
        that branch unreachable. It is now exposed as an optional parameter;
        the default ``None`` preserves the previous behavior.
        """
        self.isReady()

        # setup runner
        runner = ffHHSearch.HHSearch(
            binary_path=self.config['binary_path'],
            databases=self.config['databases'],
            n_cpu=self.config['n_cpu']
        )

        # generate step function
        @workflow.step
        def hhsearch_step(a3m_path: str, output_path: str, after: List[Workflow], atab_path: str = None) -> None:

            with open(a3m_path, "r") as f:
                a3m = f.read()
            if atab_path:
                # gen_atab=True makes the runner return (hhr, atab) as a pair
                hhsearch_result, atab = runner.query(a3m, gen_atab=True)
            else:
                hhsearch_result = runner.query(a3m)
            with open(output_path, "w") as f:
                f.write(hhsearch_result)
            if atab_path:
                with open(atab_path, "w") as f:
                    f.write(atab)

        return hhsearch_step.step(a3m_path, output_path, after, atab_path)
34 changes: 34 additions & 0 deletions fastfold/workflow/factory/jackhmmer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from fastfold.workflow.factory import TaskFactory
from ray import workflow
from ray.workflow.common import Workflow
import fastfold.data.tools.jackhmmer as ffJackHmmer
from fastfold.data import parsers
from typing import List

class JackHmmerFactory(TaskFactory):
    """Factory producing Ray workflow steps that run a Jackhmmer search."""

    # config entries that must be present before a task can be generated
    keywords = ['binary_path', 'database_path', 'n_cpu', 'uniref_max_hits']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Build a workflow step that queries Jackhmmer on ``fasta_path`` and
        writes the hits, converted to a3m format, to ``output_path``.

        ``after`` lists upstream workflow steps this step must wait for.
        """
        self.isReady()

        # The runner is created once here and captured by the step closure.
        jackhmmer_runner = ffJackHmmer.Jackhmmer(
            binary_path=self.config['binary_path'],
            database_path=self.config['database_path'],
            n_cpu=self.config['n_cpu'],
        )

        @workflow.step
        def jackhmmer_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            # query() returns one result per database; only one is configured
            search_result = jackhmmer_runner.query(fasta_path)[0]
            # convert the Stockholm-format hits to a3m, capped at uniref_max_hits
            msa_a3m = parsers.convert_stockholm_to_a3m(
                search_result['sto'],
                max_sequences=self.config['uniref_max_hits'],
            )
            with open(output_path, "w") as out_file:
                out_file.write(msa_a3m)

        return jackhmmer_step.step(fasta_path, output_path, after)
50 changes: 50 additions & 0 deletions fastfold/workflow/factory/task_factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from ast import keyword
import json
from ray.workflow.common import Workflow
from os import path
from typing import List

class TaskFactory:
    """Base class for workflow task factories.

    A concrete factory declares the config keys it needs in ``keywords`` and
    implements ``gen_task`` to produce a Ray workflow step. Configuration can
    come from an explicit dict, an explicit JSON file path, or the default
    ``./config.json`` (looked up under ``tools/<FactoryName-without-Factory>``).
    """

    # config keys a concrete factory requires; validated by isReady()
    keywords = []

    def __init__(self, config: dict = None, config_path: str = None) -> None:
        """Initialize the factory config.

        Precedence: explicit ``config`` dict, then ``config_path``, then the
        default config file. Factories with no required keywords skip
        configuration entirely (and never set ``self.config``).
        """
        # skip if no keyword required from config file
        if not self.__class__.keywords:
            return

        # setting config for factory
        if config is not None:
            self.config = config
        elif config_path is not None:
            self.loadConfig(config_path)
        else:
            self.loadConfig()

    def configure(self, keyword, value=None, purge=False) -> None:
        """Update the factory config.

        Fix: the original class defined ``configure`` twice, the second
        definition silently shadowing the dict/purge variant. Both call
        styles are now supported by one method:

        * ``configure(config_dict)`` — merge ``config_dict`` into the config
          (or replace it entirely with ``purge=True``).
        * ``configure(keyword, value)`` — set a single config entry.
        """
        if isinstance(keyword, dict):
            if purge:
                self.config = keyword
            else:
                self.config.update(keyword)
        else:
            self.config[keyword] = value

    def gen_task(self, after: List[Workflow] = None, *args, **kwargs) -> Workflow:
        """Produce a Ray workflow step; must be overridden by subclasses."""
        raise NotImplementedError

    def isReady(self):
        """Raise KeyError if any required keyword is missing from the config."""
        for key in self.__class__.keywords:
            if key not in self.config:
                raise KeyError(f"{self.__class__.__name__} not ready: \"{key}\" not specified")

    def loadConfig(self, config_path='./config.json'):
        """Load this factory's section from a global JSON config file.

        The file must contain a ``tools`` mapping keyed by the factory class
        name with its ``Factory`` suffix stripped (e.g. ``HHBlits`` for
        ``HHBlitsFactory``).
        """
        with open(config_path) as configFile:
            globalConfig = json.load(configFile)
            if 'tools' not in globalConfig:
                raise KeyError("\"tools\" not found in global config file")
            # strip the trailing "Factory" to get the tool's config key
            factoryName = self.__class__.__name__[:-7]
            if factoryName not in globalConfig['tools']:
                raise KeyError(f"\"{factoryName}\" not found in the \"tools\" section in config")
            self.config = globalConfig['tools'][factoryName]
1 change: 1 addition & 0 deletions fastfold/workflow/template/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .fastfold_data_workflow import FastFoldDataWorkFlow
Loading