In [43]:
import boto3
import sagemaker
import sagemaker.session
from sagemaker import get_execution_role
from sagemaker.tuner import IntegerParameter, HyperparameterTuner
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.estimator import Estimator
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.inputs import TrainingInput



In [4]:
region = boto3.session.Session().region_name
role = get_execution_role()
sagemaker_session = sagemaker.session.Session()

### Pipeline Parameters

In [19]:
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1
)

processing_instance_type = ParameterString(
    name = "processingInstanceType",
    default_value='m1.m5.xlarge'
)

training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge"
)

congress = '110'

### Define Processing Step

In [11]:
processing_repository_uri = "064258348567.dkr.ecr.us-east-1.amazonaws.com/sagemaker-processing-container:latest"

In [12]:
script_processor = ScriptProcessor(command=['python3'],
                image_uri=processing_repository_uri,
                role=role,
                instance_count=processing_instance_count,
                instance_type=processing_instance_type)

In [31]:
step_process = ProcessingStep(
    name = "congressProcess",
    processor=script_processor,
    inputs=[ProcessingInput(
            source='s3://ascsagemaker/JMP_congressional_nmf/raw_data',
            destination='/opt/ml/processing/input')],
    outputs=[ProcessingOutput(
            name='train',
            source='/opt/ml/processing/output',
            destination='s3://ascsagemaker/JMP_congressional_nmf/dtms')],
    code = 'containers/processing-container/scripts/congress_pre_process.py',
    job_arguments=[congress]
)

### Define Training Step

In [15]:
training_repository_uri = "064258348567.dkr.ecr.us-east-1.amazonaws.com/sagemaker-nmf-container:latest"

In [20]:
hyperparameter_ranges = {"k":IntegerParameter(30,34)}
objective_metric_name = 'coherence'
metric_definitions = [{'Name': 'coherence',
                       'Regex': 'coherence: ([0-9])'}]



In [25]:
nmf = Estimator(training_repository_uri,
                              role,1, training_instance_type,
                              output_path = 's3://ascsagemaker/JMP_congressional_nmf/models',
                              sagemaker_session=sagemaker_session)

tuning_nmf = HyperparameterTuner(nmf,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions=metric_definitions,
                                max_jobs=3,
                                max_parallel_jobs=3,
                                base_tuning_job_name=f'congress-{congress}')

In [44]:
step_train = TrainingStep(
    name = 'congressTrain',
    estimator=tuning_nmf,
    inputs = TrainingInput(step_process.properties.ProcessingOutputConfig.Outputs['train'].S3Output.S3Uri)
)

### Define Pipeline

In [45]:
pipeline_name = "congressPipeline"
pipeline = Pipeline(
    name = pipeline_name,
    parameters=[
        processing_instance_type,
        processing_instance_count,
        training_instance_type
    ],
    steps=[step_process,step_train]
)

# BIG ISSUE:
Currently AWS does not support the `HyperparameterTuning` method in workflow pipelines... See this git issue https://github.com/aws/sagemaker-python-sdk/issues/2060

In [46]:
import json

In [47]:
json.loads(pipeline.definition())

AttributeError: 'HyperparameterTuner' object has no attribute '_prepare_for_training'