In [1]:
import boto3
import sagemaker
import boto3
import sagemaker
from time import gmtime, strftime, sleep
from sagemaker.deserializers import CSVDeserializer
from sagemaker.serializers import CSVSerializer
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import (
    ProcessingInput, 
    ProcessingOutput, 
    ScriptProcessor,
    FrameworkProcessor
)
from sagemaker.pytorch.processing import PyTorchProcessor

from sagemaker.inputs import TrainingInput

from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import (
    ProcessingStep, 
    TuningStep,
    TrainingStep, 
    CreateModelStep
)
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.parameters import (
    ParameterInteger, 
    ParameterFloat, 
    ParameterString, 
    ParameterBoolean
)
from sagemaker.workflow.clarify_check_step import (
    ModelBiasCheckConfig, 
    ClarifyCheckStep, 
    ModelExplainabilityCheckConfig
)
from sagemaker import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.conditions import (
    ConditionGreaterThan,
    ConditionLessThan,
    ConditionGreaterThanOrEqualTo
)
from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import (
    Join,
    JsonGet
)

from sagemaker.lambda_helper import Lambda

from sagemaker.model_metrics import (
    MetricsSource, 
    ModelMetrics, 
    FileSource
)
from sagemaker.drift_check_baselines import DriftCheckBaselines

from sagemaker.image_uris import retrieve
iam = boto3.client('iam')
from sagemaker.pytorch import PyTorch

sagemaker.__version__

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/ldodda/Library/Application Support/sagemaker/config.yaml


'2.223.0'

In [2]:
sm_role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20211206T145568')['Role']['Arn']

INFO:botocore.tokens:Loading cached SSO token for discovery_account


In [3]:
#!aws s3 cp ./datasets s3://nimbustx-sagemaker/denovo_design/s4dd/ --recursive

In [4]:
%%writefile scripts/pretraining.py
from s4dd import S4forDenovoDesign
from argparse import ArgumentParser
import os
if __name__ == "__main__":
    parser = ArgumentParser('(Multitask) Regression')
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    #parser.add_argument("--full-data", type=str, default=os.environ["SM_CHANNEL_DATA_FULL"])
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test", type=str, default=os.environ["SM_CHANNEL_TEST"])
    args = parser.parse_args().__dict__

    # Create an S4 model
    s4 = S4forDenovoDesign(
        n_max_epochs=400,  # This is for only demonstration purposes. Set this to a (much) higher value for actual training. Default: 400
        batch_size=2048,  # This is for only demonstration purposes. The value in the paper is 2048.
        device="cuda",  # replace this with "cpu" if you don't have a CUDA-enabled GPU
    )
    # Pretrain the model on a small subset of ChEMBL
    s4.train(
        training_molecules_path=f"{args['train']}/train.zip",
        val_molecules_path=f"{args['test']}/valid.zip",
    )
    # Save the model
    s4.save(args['model_dir'])

Overwriting scripts/pretraining.py


In [5]:
%%writefile scripts/all_together.py
from s4dd import S4forDenovoDesign
from argparse import ArgumentParser
import os
if __name__ == "__main__":
    parser = ArgumentParser('(Multitask) Regression')
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    #parser.add_argument("--full-data", type=str, default=os.environ["SM_CHANNEL_DATA_FULL"])
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test", type=str, default=os.environ["SM_CHANNEL_TEST"])
    parser.add_argument("--output",type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    args = parser.parse_args().__dict__

    # Create an S4 model with (almost) the same parameters as in the paper.
    s4 = S4forDenovoDesign(
        n_max_epochs=400,  # This is for only demonstration purposes. Set this to a (much) higher value for actual training. Default: 400.
        batch_size=2048,  # This is for only demonstration purposes. The value in the paper is 2048.
        device="cuda",  # replace this with "cpu" if you don't have a CUDA-enabled GPU.
    )
    # Pretrain the model on a small subset of ChEMBL
    s4.train(
        training_molecules_path=f"{args['train']}/chemblv31/train.zip",
        val_molecules_path=f"{args['test']}/chemblv31/valid.zip",
    )

    # save the pretrained model
    s4.save(f"{args['model_dir']}")

    # Fine-tune the model on a small subset of bioactive molecules
    s4.train(
        training_molecules_path=f"{args['train']}/pkm2/train.zip",
        val_molecules_path=f"{args['train']}/pkm2/valid.zip",
    )

    # save the fine-tuned model
    s4.save(f"{args['model_dir']}")


    # Design new molecules
    designs, lls = s4.design_molecules(n_designs=128, batch_size=64, temperature=1)

    # Save the designs
    with open(f"{args.output}/designs.smiles", "w") as f:
        f.write("\n".join(designs))

    # Save the log-likelihoods of the designs
    with open(f"{args.output}/lls.txt", "w") as f:
        f.write("\n".join([str(ll) for ll in lls]))


Overwriting scripts/all_together.py


In [6]:
train_estimator = PyTorch(
        entry_point='pretraining.py',
        source_dir="scripts",
        role=sm_role,
        framework_version='1.13.1',
        instance_count=1,
        instance_type='ml.g4dn.2xlarge',
        py_version='py39',
        max_run=432000,
        wait=False
    )

In [7]:
all_estimator = PyTorch(
        entry_point='all_together.py',
        source_dir="scripts",
        role=sm_role,
        framework_version='1.13.1',
        instance_count=1,
        instance_type='ml.g4dn.2xlarge',
        py_version='py39',
        max_run=432000,
        wait=False
    )

In [8]:
train_estimator.fit({'train': 's3://nimbustx-sagemaker/denovo_design/s4dd/chemblv31/', 
                     'test': 's3://nimbustx-sagemaker/denovo_design/s4dd/chemblv31/'})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-08-27-20-45-15-013


2024-08-27 20:45:16 Starting - Starting the training job...
2024-08-27 20:45:32 Starting - Preparing the instances for training...
2024-08-27 20:45:59 Downloading - Downloading input data...
2024-08-27 20:46:20 Downloading - Downloading the training image..........

In [None]:
all_estimator.fit({'train': 's3://nimbustx-sagemaker/denovo_design/s4dd/',
                     'test': 's3://nimbustx-sagemaker/denovo_design/s4dd/'})