### Importing Libraries and Configurations

In [1]:
import os
import glob
import json
import boto3
import sagemaker


# Taking pipeline building configurations from config.json.
# These are only for building and will not be available at 
# the runtime of the pipeline.
with open("config.json") as file:
    build_parameters = json.load(file)

### Setting Default Bucket for the Pipeline and getting region and role

In [2]:
# Setting default bucket
sagemaker_session = sagemaker.session.Session(default_bucket = build_parameters["output_bucket"])

# Getting region and role
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

print(role)
print(sagemaker_session)
print(sagemaker_session.default_bucket)

arn:aws:iam::852619674999:role/service-role/AmazonSageMaker-ExecutionRole-20220427T124311
<sagemaker.session.Session object at 0x7f5f180d8240>
<bound method Session.default_bucket of <sagemaker.session.Session object at 0x7f5f180d8240>>


### Input Parameters

#### Input Data location

In [3]:

# Default location for the datasets
train_data_uri = build_parameters["train_data"]
test_data_uri = build_parameters["test_data"]
evaluation_data_uri = build_parameters["evaluation_data"]
feature_selection_file_uri = build_parameters["feature_selection"]


# Parametrizing Data paths
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
train_data = ParameterString(name="TrainData", default_value = train_data_uri)
test_data = ParameterString(name="TestData", default_value = test_data_uri)
evaluation_data = ParameterString(name="EvaluationData", default_value = evaluation_data_uri)
feature_selection_file = ParameterString(name = "FeatureSelectionFile", default_value = feature_selection_file_uri)


#### Input Model Locations (If Any)

In [4]:
model_given = ParameterString(name="ModelGiven", default_value = "No")
model_s3_path = ParameterString(name="ModelPath", default_value = "-")

### Machine types

In [None]:
# Preprocessing machine type
local_preprocessing_path = os.path.join("Pipeline_Component_Codes","Training","1_Preprocessing")
with open(os.path.join(local_preprocessing_path, "config.json")) as file:
    processing_build_parameters = json.load(file)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value=processing_build_parameters["machine_type"]
)


# Modeling machine types
local_models_path = os.path.join("Pipeline_Component_Codes", "Training", "2_Models_HPTune")
models = [dir if '.' not in dir for dir in os.listdir(local_models_path)]
model_details = []
for i in range(len(models)):
    name = models[i]
    with open(os.path.join(local_models_path, name, "config.json")) as file:
        build_parameters = json.load(file)
    model_details.append({"name":name, "build_parameters":build_parameters})

for i in range(len(model_details)):
    training_instance_type = ParameterString(
        name=f"{model_details[i]['name']}InstanceType",
        default_value="ml.m5.xlarge"
    )

### Handling output

In [8]:
## Handling the output location
# Default output location
# pipeline_s3_output_bucket = f"{usecase}-output-bucket-{region}" 
# pipeline_s3_output_bucket = build_parameters["output_bucket"]
pipeline_output_bucket = build_parameters["output_bucket"] 

# Making the output location runtime parameter
# pipeline_output_bucket = ParameterString(name = "PipelineOutputBucket", default_value = pipeline_s3_output_bucket) 
sagemaker_session.default_bucket = pipeline_output_bucket

# Creating the output bucket if it is not already present
s3 = boto3.client('s3')
buckets = [dictionary["Name"] for dictionary in s3.list_buckets()['Buckets']]
if pipeline_output_bucket not in buckets:
    location = {'LocationConstraint': region}
    response = s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration = location)


from time import gmtime, strftime
pipeline_start_time = strftime("%Y%m%d-%H-%M-%S", gmtime())

from sagemaker.workflow import functions

# These variables were written thinking that output path can be taken as parameter, yes it can be done,
# but not all the pipeline steps accepts pipeline parameter as input, so we had to pick the output path from config
# file instead of as parameter
# processing_output_path = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output", pipeline_start_time, "ProcessingOutput"])
# evaluation_processing_output_path = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output", pipeline_start_time, "EvaluationProcessingOutput"])
# # hptune_training_output_path = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output", pipeline_start_time, "HPTuneTrainingOutput"])
# hptune_training_output_path = f"s3://{pipeline_s3_output_bucket}/Training_Pipeline_Output/{pipeline_start_time}/HPTuneTrainingOutput"
# evaluation_output_path = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output", pipeline_start_time, "EvaluationOutput"])


processing_output_path = f"s3://{pipeline_output_bucket}/Training_Pipeline_Output/{pipeline_start_time}/ProcessingOutput"
evaluation_processing_output_path = f"s3://{pipeline_output_bucket}/Training_Pipeline_Output/{pipeline_start_time}/EvaluationProcessingOutput"
# hptune_training_output_path = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output", pipeline_start_time, "HPTuneTrainingOutput"])
hptune_training_output_path = f"s3://{pipeline_output_bucket}/Training_Pipeline_Output/{pipeline_start_time}/HPTuneTrainingOutput"
evaluation_output_path = f"s3://{pipeline_output_bucket}/Training_Pipeline_Output/{pipeline_start_time}/EvaluationOutput"


### Preprocessing Training Data

In [5]:

# 2.1 Loading preprocessing config.json file.
local_preprocessing_path = os.path.join("Pipeline_Component_Codes","Training","1_Preprocessing")
with open(os.path.join(local_preprocessing_path, "config.json")) as file:
    processing_build_parameters = json.load(file)

    
# 2.2 Making parameter for processing machine type
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value=processing_build_parameters["machine_type"]
)


In [19]:
processing_build_parameters

{'processing_type': 'sklearn_processing',
 'framework_version': '0.23-1',
 'entry_point': 'Training_Preprocessing.py',
 'dependencies': 'requirements.txt',
 'machine_type': 'ml.m4.xlarge',
 'machine_count': 1}

#### 2.3 Building the processor

In [7]:

if processing_build_parameters["processing_type"] == "sklearn":
    from sagemaker.sklearn.processing import SKLearnProcessor
    processor = SKLearnProcessor(
        framework_version = processing_build_parameters["framework_version"],
        instance_type = processing_build_parameters["machine_type"],
        instance_count = processing_build_parameters["machine_count"],
        base_job_name = f"{build_parameters['usecase']}-preprocessing",
        role=role
    )
elif processing_build_parameters["processing_type"] == "custom":
    from sagemaker.processor import Processor
    processor = Processor(
        image_uri = processing_build_parameters["image_uri"],
        instance_type = processing_build_parameters["machine_type"],
        instance_count = processing_build_parameters["machine_count"],
        base_job_name = f"{build_parameters['usecase']}-preprocessing",
        role=role
    )

#### 2.4 Building preprocessing step

In [10]:

from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
    

step_process = ProcessingStep(
    name = "preprocessing_full_data",
    description = "Data preprocessing and splitting into train and test set",
    processor=processor,
    inputs=[
        ProcessingInput(source = train_data, destination="/opt/ml/processing/input/data"),  
        ProcessingInput(source=feature_selection_file, destination="/opt/ml/processing/input/feature_selection")
    ],
    outputs=[
        # Train
        ProcessingOutput(output_name = "train", source="/opt/ml/processing/train", 
#                          destination = sagemaker.workflow.functions.Join(on='/', values = [processing_output_path, "data"])
                        ),
        # Test
        ProcessingOutput(output_name = "test", source="/opt/ml/processing/test", 
#                          destination = sagemaker.workflow.functions.Join(on='/', values = [processing_output_path, "data"])
                        ),
        # Logs
        ProcessingOutput(output_name = "logs", source="/opt/ml/processing/logss", 
#                          destination = sagemaker.workflow.functions.Join(on='/', values = [processing_output_path, "logs"])
                        ),
    ],
#     code="SageMaker_Pipeline_Component_Codes/Training/Training_Preprocessing.py",
    code=os.path.join(local_preprocessing_path, processing_build_parameters["entry_point"]),
    job_arguments = ["--train_data_location", "/opt/ml/processing/input/data", 
                     "--feature_selection_file_location", "/opt/ml/processing/input/feature_selection", 
                     "--target_column", "Churn",
                     "--preprocessed_train_data_location", "/opt/ml/processing/train", 
                     "--preprocessed_test_data_location", "/opt/ml/processing/test", 
                     "--log_location", "/opt/ml/processing/logss"
                    ]
)


### Preprocessing Evaluation Data

In [11]:
sklearn_processor_evaluation = SKLearnProcessor(
    framework_version=framework_version,
    instance_type=build_parameters["processing_instance_type"],
    instance_count=build_parameters["processing_instance_count"],
    base_job_name=f"{usecase}-preprocessing-validation",
    role=role
)

step_process_evaluation = ProcessingStep(
    name="preprocessing_validation_data",
    # processor=sklearn_processor_evaluation,
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(source=evaluation_data, destination="/opt/ml/processing/input/data"), 
        ProcessingInput(source=feature_selection_file, destination="/opt/ml/processing/input/feature_selection")
    ],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train", destination = sagemaker.workflow.functions.Join(on='/', values = [evaluation_processing_output_path, "data"])),
        ProcessingOutput(output_name="logs", source="/opt/ml/processing/logss", destination = sagemaker.workflow.functions.Join(on='/', values = [evaluation_processing_output_path, "logs"]))
    ],
    # code="SageMaker_Pipeline_Component_Codes/Training/Training_Preprocessing.py",
    code = f"s3://{pipeline_input_bucket}/codes/{build_parameters['processing_code_file_name']}",
    depends_on = [step_process],
    job_arguments = ["--train_data_location", "/opt/ml/processing/input/data", "--feature_selection_file_location", 
                     "/opt/ml/processing/input/feature_selection", "--target_column", "Churn", "--stop_split", "Y"]
)


### Hyperparameter Tuning

In [12]:
default_objective_metric_name = build_parameters["objective_metric"]
objective_metric_name = ParameterString(name = "ObjectiveMetric", default_value = default_objective_metric_name)
metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy:([0-9\\.]+)"}]

In [13]:
from sagemaker import image_uris
sklearn_image_uri = image_uris.retrieve(framework='sklearn', region=region, version='0.23-1')

In [14]:
from sagemaker.sklearn import SKLearn
from sagemaker.tuner import ContinuousParameter, IntegerParameter, CategoricalParameter, HyperparameterTuner, WarmStartConfig, WarmStartTypes
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TuningStep

duplicate_objective_metric_name = ParameterString(name = "ObjectiveMetric", default_value = default_objective_metric_name)

n_models = build_parameters["number_of_models"]
tuning_steps = []
for i in range(n_models):
    model_details = build_parameters["model_specifications"][f"model{i}"]
    if model_details["model_type"] == 'sklearn_model':
        estimator = SKLearn(source_dir = f"s3://{pipeline_input_bucket}/codes/{model_details['model_name']}.tar.gz", 
                            entry_point = model_details["entry_point"], 
                            dependencies = model_details["dependencies"], 
                            instance_type = model_details["instance_type"], 
                            framework_version = '0.20.0', 
                            output_path = f"{hptune_training_output_path}/{model_details['model_name']}",
                            image_uri = sklearn_image_uri, role = role
                            )
        
        hyperparameters = model_details["hyperparameters"].keys()
        hyperparameter_ranges = {}
        for hyperparameter in hyperparameters:
            if model_details["hyperparameters"][hyperparameter]["type"] == "categorical":
                hyperparameter_ranges[hyperparameter] = CategoricalParameter(model_details["hyperparameters"][hyperparameter]["values"])
            elif model_details["hyperparameters"][hyperparameter]["type"] == "integer":
                hyperparameter_ranges[hyperparameter] = IntegerParameter(min_value = model_details["hyperparameters"][hyperparameter]["min_value"],
                                                                         max_value = model_details["hyperparameters"][hyperparameter]["max_value"])
        
        hyperparameter_ranges["objective_metric"] = CategoricalParameter([objective_metric_name, "anything"])
        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=1,
            max_parallel_jobs=1,
            strategy = model_details["tuning_strategy"],
            base_tuning_job_name = model_details["model_name"],
            # base_tuning_job_name=f"Decision_Tree_{strftime('%Y%m%d-%H-%M-%S', gmtime())}"
            )
        # print(f"HPTuning-{model_details['model_name']}")
        step_tuning = TuningStep(
            name = f"hptuning-{model_details['model_name']}",
            tuner = tuner,
            inputs={
                "train": TrainingInput(
                    s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
                    content_type="text/csv",
                ),
                "test": TrainingInput(
                    s3_data=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
                    content_type="text/csv",
                ),
            }
        )
        tuning_steps.append(step_tuning)
        

### Getting the best model from each hyperparameter tuning job

In [15]:
from sagemaker.model import Model
from sagemaker.inputs import CreateModelInput
inputs = CreateModelInput(
    instance_type=build_parameters["evaluation_instance_type"],
    # accelerator_type="ml.eia1.medium",
)
from sagemaker.workflow.steps import CreateModelStep

create_best_model_steps = []
entry_point='SageMaker_Pipeline_Component_Codes/Training/Evaluation.py',
for i in range(n_models):
    tuning_step_best_model = Model(image_uri = tuning_steps[i].tuner.estimator.image_uri, 
                                   source_dir = f"s3://{pipeline_input_bucket}/codes/evaluation.tar.gz",
                                   # source_dir = build_parameters["single_model_evluation_source_dir"],
                                   entry_point = build_parameters["single_model_evluation_entry_point"],
                                   model_data = sagemaker.workflow.functions.Join(on='/', values=[hptune_training_output_path, tuning_steps[i].name[9:], tuning_steps[i].properties.BestTrainingJob.TrainingJobName, "output/model.tar.gz"]), 
                                   role = role,
                                   sagemaker_session = sagemaker_session
                                  )
    
    step_create_best_model = CreateModelStep(
        name = f"Getting-Best-{tuning_steps[i].name[9:]}-Model",
        model = tuning_step_best_model,
        inputs = inputs
    )
    
    create_best_model_steps.append(step_create_best_model)

### Evaluating the best models from each hyperparameter tuning job

In [16]:
from sagemaker.transformer import Transformer
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep
from sagemaker.workflow.properties import PropertyFile

evaluation_steps = []

for i in range(n_models):
    transformer_dt = Transformer(
        model_name = create_best_model_steps[i].properties.ModelName,
        instance_type = build_parameters["evaluation_instance_type"],
        instance_count=1,
        output_path=f"{hptune_training_output_path}/{tuning_steps[i].name[9:]}/BestModel",
        base_transform_job_name = f"{usecase}-evaluation-{tuning_steps[i].name[9:]}",
        env = {"MODELS3LOCATION":create_best_model_steps[i].properties.PrimaryContainer.ModelDataUrl, 
               "MODELNAME":build_parameters["model_specifications"][f"model{i}"]["model_name"]}
    )
    evaluation_step = TransformStep(
        name=f"Evaluating-Best-{tuning_steps[i].name[9:]}-Model",
        transformer=transformer_dt,
        inputs=TransformInput(data=step_process_evaluation.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri, 
                              # data_type = "text/csv"
                             )
    )
    evaluation_steps.append(evaluation_step)

### Getting the best model based on model performance metric on evaluation data

In [17]:
inputs = []
for i in range(n_models):
    inputs.append(ProcessingInput(sagemaker.workflow.functions.Join(on='/', values=[evaluation_steps[i].properties.TransformOutput.S3OutputPath, "evaluation.csv.out"]), destination=f"/opt/ml/processing/input/model{i}"))
# inputs = inputs + [ProcessingInput(sagemaker.workflow.functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output", "Model_Performance_Metrics.csv"]), destination=f"/opt/ml/processing/metrics")]
inputs = inputs + [ProcessingInput(source = sagemaker.workflow.functions.Join(on='/', values=["s3:/", pipeline_input_bucket, "codes", "preprocessing_requirements.txt"]), destination = "/opt/ml/processing/input/requirements")]
# inputs = inputs + [ProcessingInput(source = sagemaker.workflow.functions.Join(on='/', values=["s3:/", pipeline_input_bucket, "codes", "preprocessing_requirements.txt"]), destination = "/opt/ml/processing/input/")]

In [18]:
from sagemaker.workflow.properties import PropertyFile

property_file = PropertyFile(
    name="property_file",
    output_name="property_file",
    path="property_file.json"
)

step_get_best_model = ProcessingStep(
    name = "Getting-Best-Model",
    description = "Picking the best model based on the metric value calculated using evaluation data",
    processor = sklearn_processor,
    inputs=inputs,
    outputs=[
        ProcessingOutput(output_name="final_model", source = "/opt/ml/processing/final_model", destination = evaluation_output_path),
        ProcessingOutput(output_name="logs", source = "/opt/ml/processing/logs", destination = evaluation_output_path),
        ProcessingOutput(output_name="Metrics", source = "/opt/ml/processing/metrics_folder", 
                         destination = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output"])
                        ),
                         # f"s3://{pipeline_output_bucket}/Training_Pipeline_Output/")
        ProcessingOutput(output_name="Feature_Importance", source = "/opt/ml/processing/feature_importance", 
                         destination = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output"])
                        ),
        ProcessingOutput(output_name="Confusion_Matrix", source = "/opt/ml/processing/confusion_matrix", 
                         destination = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output"])
                        ),
        ProcessingOutput(output_name="Combined_Dashboard_Data", source = "/opt/ml/processing/Combined", 
                         destination = functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output"])
                        ),
        ProcessingOutput(output_name="property_file", source = "/opt/ml/processing/evaluation", destination = evaluation_output_path)
    ],
    # code="SageMaker_Pipeline_Component_Codes/Training/Final_Model_Selection.py",
    code = f"s3://{pipeline_input_bucket}/codes/{build_parameters['get_best_model_code_file_name']}",
    # depends_on = [step_dt_evaluation, step_lr_evaluation],
    depends_on = evaluation_steps,
    job_arguments = ["--input_folder", "/opt/ml/processing/input", "--final_model_location", "/opt/ml/processing/final_model", 
                     "--logs_location", "/opt/ml/processing/logs", 
                     # "--model_metric_input_location", "/opt/ml/processing/metrics", 
                     "--model_metric_input_location", sagemaker.workflow.functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output", "Model_Performance_Metrics.csv"]),
                     "--model_metric_output_location", "/opt/ml/processing/metrics_folder", "--objective_metric", objective_metric_name, 
                     "--property_file_location", "/opt/ml/processing/evaluation", 
                     "--feature_importance_input_file_location", sagemaker.workflow.functions.Join(on='/', values=["s3:/", pipeline_output_bucket, "Training_Pipeline_Output", "Feature_Importance.csv"]),
                     "--feature_importance_output_file_location", "/opt/ml/processing/feature_importance"
                    ],
    property_files=[property_file]
)

### Register best model in SageMaker model registry

In [19]:
from sagemaker.workflow.step_collections import RegisterModel

register_best_model_steps = []

for i in range(n_models):
    model_details = build_parameters["model_specifications"][f"model{i}"]
    if model_details["model_type"] == 'sklearn_model':
        estimator = SKLearn(entry_point = "", 
                            
                            instance_type = model_details["instance_type"],
                            framework_version = '0.20.0', 
                            image_uri = sklearn_image_uri,
                            
                            role = role
                            )
        register_best_model_step = RegisterModel(name=f"RegisterBest{model_details['model_name']}Model", 
                              estimator = estimator, 
                              # model_data=step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_path),
                              model_data=sagemaker.workflow.functions.Join(on='/', values=[step_get_best_model.properties.ProcessingOutputConfig.Outputs["final_model"].S3Output.S3Uri, "model.tar.gz"]),
                              content_types=["text/csv"],
                              response_types=["text/csv"],
                              inference_instances=[model_details["instance_type"]],
                              transform_instances=[model_details["instance_type"]],
                              model_package_group_name = build_parameters["model_package_group_name"],
                              image_uri = sklearn_image_uri,
                              # approval_status="Approved",
                              role=role,
                              depends_on = []
                             )
        register_best_model_steps.append(register_best_model_step)

In [None]:
from sagemaker.workflow.conditions import ConditionEquals
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet


condition_steps = []
for i in range(n_models):
    model_details = build_parameters["model_specifications"][f"model{i}"]
    condition_equal = ConditionEquals(left = JsonGet(step_name=step_get_best_model.name, 
                                                   property_file=property_file, 
                                                   json_path="best_model_name"),
                                      right = model_details["model_name"]
                                     )
    step_cond = ConditionStep(
        name=f"Is-{model_details['model_name']}-Best-Model",
        conditions=[condition_equal],
        if_steps = [register_best_model_steps[i]],
        )
    condition_steps.append(step_cond)

### Model Given

In [None]:
# if build_parameters["given_model_type"] == "sklearn":
#     estimator = SKLearn(entry_point = "", 
                        
#                         instance_type = build_parameters["scoring_instance_type"],
#                         framework_version = '0.20.0', 
#                         image_uri = sklearn_image_uri,
                        
#                         role = role
#                         )
    
#     register_given_model_step = RegisterModel(name=f"RegisterGivenModel", 
#                                              estimator = estimator, 
#                                              # model_data=step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_path),
#                                              model_data=build_parameters["given_model_path"],
#                                              content_types=["text/csv"],
#                                              response_types=["text/csv"],
#                                              inference_instances=[build_parameters["scoring_instance_type"]],
#                                              transform_instances=[build_parameters["scoring_instance_type"]],
#                                              model_package_group_name = build_parameters["model_package_group_name"],
#                                              image_uri = sklearn_image_uri,
#                                              approval_status="Approved",
#                                              role=role,
#                                              depends_on = []
#                                             )

In [None]:
# model_given_condition = ConditionEquals(left = build_parameters["model_given"],
#                                         right = "No"
#                                        )
# step_model_given_cond = ConditionStep(
#     name=f"Is-Model-Given",
#     conditions=[model_given_condition],
#     # if_steps = [register_given_model_step],
#     if_steps = [step_process] + tuning_steps + [step_process_evaluation] + create_best_model_steps + evaluation_steps + [step_get_best_model] + condition_steps,
#     else_steps = [register_given_model_step]
# )

### Arranging the steps inside pipeline

In [25]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = f"{usecase}-training"
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        train_data,
        test_data,
        evaluation_data,
        feature_selection_file,
        pipeline_output_bucket,
        #model_given,
        #model_path,
#         pipeline_output_path,
        # processing_instance_count,
        objective_metric_name,
        # processing_code_location,
        #training_instance_type,
        #evaluation_instance_type,
        processing_instance_type,
        training_instance_type
    ],
#     steps=[step_process, step_process_evaluation, step_tuning_dt, step_tuning_lr, step_create_best_dt_model, step_cond],
#     steps=[step_process_evaluation, step_process, step_tuning_dt, step_tuning_lr, step_create_best_dt_model, step_create_best_lr_model, step_dt_evaluation, step_lr_evaluation, step_cond]
#     steps=[step_process, step_tuning_dt, step_process_evaluation, step_tuning_lr, step_create_best_dt_model, step_create_best_lr_model, step_dt_evaluation, step_lr_evaluation, step_get_best_model, step_register_best_model]
#     steps = [step_cond]
    steps = [step_process] + tuning_steps + [step_process_evaluation] + create_best_model_steps + evaluation_steps + [step_get_best_model] + condition_steps
#     steps = [step_model_given_cond]
)

In [16]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = f"{build_parameters['usecase']}-training"
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        train_data,
        test_data,
        evaluation_data,
        feature_selection_file,
        #model_given,
        #model_path,
#         pipeline_output_path,
        # processing_instance_count,
#         objective_metric_name,
        # processing_code_location,
        #training_instance_type,
        #evaluation_instance_type,
        processing_instance_type,
#         training_instance_type
    ],
#     steps=[step_process, step_process_evaluation, step_tuning_dt, step_tuning_lr, step_create_best_dt_model, step_cond],
#     steps=[step_process_evaluation, step_process, step_tuning_dt, step_tuning_lr, step_create_best_dt_model, step_create_best_lr_model, step_dt_evaluation, step_lr_evaluation, step_cond]
#     steps=[step_process, step_tuning_dt, step_process_evaluation, step_tuning_lr, step_create_best_dt_model, step_create_best_lr_model, step_dt_evaluation, step_lr_evaluation, step_get_best_model, step_register_best_model]
#     steps = [step_cond]
    steps = [step_process] 
)

### Uploading the pipeline

In [17]:
pipeline.upsert(role_arn=role)
# execution = pipeline.start()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:852619674999:pipeline/churn-training',
 'ResponseMetadata': {'RequestId': '1ff119ca-bd8d-4bf1-8932-48d81c242ffa',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1ff119ca-bd8d-4bf1-8932-48d81c242ffa',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '82',
   'date': 'Tue, 02 May 2023 17:20:39 GMT'},
  'RetryAttempts': 0}}

In [19]:
# execution = pipeline.start()