# AlphaFold on Vertex AI

In [14]:
import os
import pprint
import time

from google.cloud.aiplatform.gapic import \
    JobServiceClient

In [15]:
# Project, region, and Cloud Storage buckets used throughout the notebook.
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'
ALPHAFOLD_DATASETS_BUCKET_NAME = 'jk-alphafold-datasets'
STAGING_BUCKET_NAME = 'jk-alphafold-staging'

# Region-specific API endpoint and the parent resource path for job requests.
API_ENDPOINT = f'{REGION}-aiplatform.googleapis.com'
PARENT = f'projects/{PROJECT}/locations/{REGION}'

# Container image that the custom job runs.
ALPHA_FOLD_IMAGE_NAME = f'gcr.io/{PROJECT}/alphafold'

In [16]:
# List the top-level contents of the AlphaFold datasets bucket.
!gsutil ls gs://{ALPHAFOLD_DATASETS_BUCKET_NAME}

gs://jk-alphafold-datasets/bfd/
gs://jk-alphafold-datasets/mgnify/
gs://jk-alphafold-datasets/params/
gs://jk-alphafold-datasets/pdb70/
gs://jk-alphafold-datasets/pdb_mmcif/
gs://jk-alphafold-datasets/uniclust30/
gs://jk-alphafold-datasets/uniref90/


## 

## Create a job

In [17]:
# Root of the genetic databases: the datasets bucket addressed under /gcs.
DOWNLOAD_DIR = os.path.join('/gcs', ALPHAFOLD_DATASETS_BUCKET_NAME)

data_dir = DOWNLOAD_DIR

# JackHMMER: Uniref90 database.
uniref90_database_path = os.path.join(DOWNLOAD_DIR, 'uniref90', 'uniref90.fasta')

# JackHMMER: MGnify database.
mgnify_database_path = os.path.join(DOWNLOAD_DIR, 'mgnify', 'mgy_clusters_2018_08.fa')

# HHblits: BFD database.
bfd_database_path = os.path.join(
    DOWNLOAD_DIR,
    'bfd',
    'bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt',
)

# JackHMMER: Small BFD database. Defined here but not listed in
# database_paths below.
small_bfd_database_path = os.path.join(
    DOWNLOAD_DIR,
    'small_bfd',
    'bfd-first_non_consensus_sequences.fasta',
)

# HHblits: Uniclust30 database.
uniclust30_database_path = os.path.join(
    DOWNLOAD_DIR,
    'uniclust30',
    'uniclust30_2018_08',
    'uniclust30_2018_08',
)

# HHsearch: PDB70 database.
pdb70_database_path = os.path.join(DOWNLOAD_DIR, 'pdb70', 'pdb70')

# Directory of template mmCIF structures, each named <pdb_id>.cif.
template_mmcif_dir = os.path.join(DOWNLOAD_DIR, 'pdb_mmcif', 'mmcif_files')

# File that maps obsolete PDB IDs to their replacements.
obsolete_pdbs_path = os.path.join(DOWNLOAD_DIR, 'pdb_mmcif', 'obsolete.dat')


# (flag name, path) pairs, in the order they are later turned into
# --<flag>=<path> container arguments.
database_paths = list({
    'uniref90_database_path': uniref90_database_path,
    'mgnify_database_path': mgnify_database_path,
    'pdb70_database_path': pdb70_database_path,
    'data_dir': data_dir,
    'template_mmcif_dir': template_mmcif_dir,
    'obsolete_pdbs_path': obsolete_pdbs_path,
    'uniclust30_database_path': uniclust30_database_path,
    'bfd_database_path': bfd_database_path,
}.items())

# Directory in the staging bucket (addressed under /gcs) that is passed to
# the job as --output_dir.
output_target_path = os.path.join('/gcs', STAGING_BUCKET_NAME, 'output')


# Model names passed to the job as a comma-separated --model_names flag.
model_names = [
    'model_1',
    'model_2',
    'model_3',
    'model_4',
    'model_5',
]

# Input FASTA file(s) for the job. The path is derived from
# STAGING_BUCKET_NAME, like output_target_path above, so changing the
# staging bucket in the configuration cell moves inputs and outputs together
# instead of leaving the inputs pointing at a hard-coded bucket.
target_fasta_paths = [
    os.path.join('/gcs', STAGING_BUCKET_NAME, 'fasta', 'T1050.fasta'),
]

In [18]:
# Echo every resolved path, one per line, as a sanity check.
print(
    uniref90_database_path,
    mgnify_database_path,
    bfd_database_path,
    small_bfd_database_path,
    uniclust30_database_path,
    pdb70_database_path,
    template_mmcif_dir,
    obsolete_pdbs_path,
    output_target_path,
    target_fasta_paths,
    sep='\n',
)


/gcs/jk-alphafold-datasets/uniref90/uniref90.fasta
/gcs/jk-alphafold-datasets/mgnify/mgy_clusters_2018_08.fa
/gcs/jk-alphafold-datasets/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
/gcs/jk-alphafold-datasets/small_bfd/bfd-first_non_consensus_sequences.fasta
/gcs/jk-alphafold-datasets/uniclust30/uniclust30_2018_08/uniclust30_2018_08
/gcs/jk-alphafold-datasets/pdb70/pdb70
/gcs/jk-alphafold-datasets/pdb_mmcif/mmcif_files
/gcs/jk-alphafold-datasets/pdb_mmcif/obsolete.dat
/gcs/jk-alphafold-staging/output
['/gcs/jk-alphafold-staging/fasta/T1050.fasta']


In [19]:
# Run settings forwarded to the container as command-line flags.
max_template_date = '2020-05-14'
preset = 'full_dbs'
benchmark = False

# Start with the comma-separated list of input FASTA files.
command_args = ['--fasta_paths=' + ','.join(target_fasta_paths)]

# Add one --<flag>=<path> argument per database entry; entries whose path is
# empty are skipped.
for name, path in database_paths:
    if not path:
        continue
    command_args += [f'--{name}={path}']

# Finish with the output location and the run settings.
command_args += [
    f'--output_dir={output_target_path}',
    f'--model_names={",".join(model_names)}',
    f'--max_template_date={max_template_date}',
    f'--preset={preset}',
    f'--benchmark={benchmark}',
    '--logtostderr',
]

In [20]:
# Display the assembled container argument list.
command_args

['--fasta_paths=/gcs/jk-alphafold-staging/fasta/T1050.fasta',
 '--uniref90_database_path=/gcs/jk-alphafold-datasets/uniref90/uniref90.fasta',
 '--mgnify_database_path=/gcs/jk-alphafold-datasets/mgnify/mgy_clusters_2018_08.fa',
 '--pdb70_database_path=/gcs/jk-alphafold-datasets/pdb70/pdb70',
 '--data_dir=/gcs/jk-alphafold-datasets',
 '--template_mmcif_dir=/gcs/jk-alphafold-datasets/pdb_mmcif/mmcif_files',
 '--obsolete_pdbs_path=/gcs/jk-alphafold-datasets/pdb_mmcif/obsolete.dat',
 '--uniclust30_database_path=/gcs/jk-alphafold-datasets/uniclust30/uniclust30_2018_08/uniclust30_2018_08',
 '--bfd_database_path=/gcs/jk-alphafold-datasets/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt',
 '--output_dir=/gcs/jk-alphafold-staging/output',
 '--model_names=model_1,model_2,model_3,model_4,model_5',
 '--max_template_date=2020-05-14',
 '--preset=full_dbs',
 '--benchmark=False',
 '--logtostderr']

In [27]:
# Environment variables for the container, as a list of name/value entries.
env_vars = [
    {'name': variable, 'value': setting}
    for variable, setting in {
        'NVIDIA_VISIBLE_DEVICES': 'all',
        # The two settings below allow predictions on proteins that would
        # typically be too long to fit into GPU memory.
        'TF_FORCE_UNIFIED_MEMORY': '1',
        'XLA_PYTHON_CLIENT_MEM_FRACTION': '4.0',
    }.items()
]

In [33]:
# One worker pool with a single replica that runs the AlphaFold container
# with the arguments and environment assembled in the earlier cells.
worker_pool_specs = [
    dict(
        # Alternative previously tried here: machine_type "a2-highgpu-1g"
        # with accelerator_type "NVIDIA_TESLA_A100".
        machine_spec=dict(
            machine_type="n1-standard-16",
            accelerator_type="NVIDIA_TESLA_T4",
            accelerator_count=2,
        ),
        replica_count=1,
        container_spec=dict(
            image_uri=ALPHA_FOLD_IMAGE_NAME,
            args=command_args,
            env=env_vars,
        ),
    ),
]


# Timestamped display name, so each run of this cell gets a distinct label.
job_name = f'ALPHAFOLD_{time.strftime("%Y%m%d_%H%M%S")}'

custom_job_spec = dict(
    display_name=job_name,
    job_spec=dict(worker_pool_specs=worker_pool_specs),
)

# Pretty-print the full job specification for review.
pp = pprint.PrettyPrinter()
print(pp.pformat(custom_job_spec))

{'display_name': 'ALPHAFOLD_20210726_223716',
 'job_spec': {'worker_pool_specs': [{'container_spec': {'args': ['--fasta_paths=/gcs/jk-alphafold-staging/fasta/T1050.fasta',
                                                                 '--uniref90_database_path=/gcs/jk-alphafold-datasets/uniref90/uniref90.fasta',
                                                                 '--mgnify_database_path=/gcs/jk-alphafold-datasets/mgnify/mgy_clusters_2018_08.fa',
                                                                 '--pdb70_database_path=/gcs/jk-alphafold-datasets/pdb70/pdb70',
                                                                 '--data_dir=/gcs/jk-alphafold-datasets',
                                                                 '--template_mmcif_dir=/gcs/jk-alphafold-datasets/pdb_mmcif/mmcif_files',
                                                                 '--obsolete_pdbs_path=/gcs/jk-alphafold-datasets/pdb_mmcif/obsolete.dat',
                       

In [35]:
# Point the job service client at the region-specific API endpoint.
options = {'api_endpoint': API_ENDPOINT}
client = JobServiceClient(client_options=options)

# Submit the custom job under the project/location parent resource.
response = client.create_custom_job(parent=PARENT, custom_job=custom_job_spec)

# Show the job resource returned by the service.
response

name: "projects/895222332033/locations/us-central1/customJobs/4883322565092704256"
display_name: "ALPHAFOLD_20210726_223716"
job_spec {
  worker_pool_specs {
    machine_spec {
      machine_type: "n1-standard-16"
      accelerator_type: NVIDIA_TESLA_T4
      accelerator_count: 2
    }
    replica_count: 1
    disk_spec {
      boot_disk_type: "pd-ssd"
      boot_disk_size_gb: 100
    }
    container_spec {
      image_uri: "gcr.io/jk-mlops-dev/alphafold"
      args: "--fasta_paths=/gcs/jk-alphafold-staging/fasta/T1050.fasta"
      args: "--uniref90_database_path=/gcs/jk-alphafold-datasets/uniref90/uniref90.fasta"
      args: "--mgnify_database_path=/gcs/jk-alphafold-datasets/mgnify/mgy_clusters_2018_08.fa"
      args: "--pdb70_database_path=/gcs/jk-alphafold-datasets/pdb70/pdb70"
      args: "--data_dir=/gcs/jk-alphafold-datasets"
      args: "--template_mmcif_dir=/gcs/jk-alphafold-datasets/pdb_mmcif/mmcif_files"
      args: "--obsolete_pdbs_path=/gcs/jk-alphafold-datasets/pdb_mmcif/o