In [1]:
import boto3
import sagemaker

# Region of the ambient boto3 session.
region = boto3.Session().region_name

# Bucket name given to the SageMaker session as its default bucket; also used
# by later cells to build S3 URIs.
default_bucket = "demo-bucket-test-mlop"
sagemaker_session = sagemaker.session.Session(default_bucket=default_bucket)

# Execution role of the current environment; later cells pass it as `role`.
role = sagemaker.get_execution_role()

# Echo the role and the session object, one per line.
print(role, sagemaker_session, sep="\n")

arn:aws:iam::852619674999:role/service-role/AmazonSageMaker-ExecutionRole-20220427T124311
<sagemaker.session.Session object at 0x7f9ae00f0a20>


In [2]:
from time import gmtime, strftime

# Input CSVs for the churn demo. The URIs are derived from `default_bucket`
# (defined in the first cell) so the bucket name lives in exactly one place,
# the same way `output_path` below is built.
input_train_data_uri = f"s3://{default_bucket}/Churn_Demo/churn-bigml-80.csv"
input_test_data_uri = f"s3://{default_bucket}/Churn_Demo/churn-bigml-20.csv"

# Output prefix stamped with the current UTC time (gmtime), so each evaluation
# of this cell yields a new prefix.
output_path = f"s3://{default_bucket}/ChurnTrain/" + strftime("%Y%m%d-%H-%M-%S", gmtime())
print(output_path)


from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)


# Pipeline parameters. Each one is registered with the Pipeline object in a
# later cell; the defaults come from the values computed above.

# Data locations.
train_data = ParameterString(
    name="TrainData",
    default_value=input_train_data_uri,
)
test_data = ParameterString(
    name="TestData",
    default_value=input_test_data_uri,
)
pipeline_output_path = ParameterString(
    name="OutputPath",
    default_value=output_path,
)

# Compute sizing for the processing job.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.4xlarge",
)

# Compute sizing for training jobs.
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.4xlarge",
)

s3://demo-bucket-test-mlop/ChurnTrain/20220522-05-23-25


In [3]:
from sagemaker.sklearn.processing import SKLearnProcessor

# scikit-learn container version used for the preprocessing job.
framework_version = "0.23-1"

# Processor that runs the preprocessing script. Instance type and count come
# from pipeline parameters. The session created in the first cell is passed
# explicitly; without it the processor builds its own default session and the
# custom `default_bucket` configured there has no effect on this job.
sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name="Churn-Preprocessing",
    role=role,
    sagemaker_session=sagemaker_session,
)

In [4]:

from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TuningStep
    

# Preprocessing step: runs Preprocessing.py on `sklearn_processor`, taking the
# TrainData parameter as its only input and exposing three named outputs
# ("train", "validation", "test") for later steps.
# NOTE(review): the TestData parameter is not wired in as an input here -
# confirm that the script is meant to derive all three splits from TrainData.
step_process = ProcessingStep(
    name="Preprocessing-Step",
    processor=sklearn_processor,
    code="SageMaker_Pipeline_Component_Codes/Preprocessing.py",
    inputs=[
        ProcessingInput(
            source=train_data,
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        ProcessingOutput(output_name=split, source=f"/opt/ml/processing/{split}")
        for split in ("train", "validation", "test")
    ],
)



In [5]:
from sagemaker.sklearn import SKLearn
# Estimator for the logistic-regression training script.
# - instance_type is taken from the TrainingInstanceType pipeline parameter
#   instead of a hard-coded literal, so overriding that parameter actually
#   changes the training hardware. Note the parameter's declared default is
#   "ml.m5.4xlarge", larger than the previously hard-coded "ml.m4.xlarge".
# - The session created in the first cell is passed explicitly so the
#   estimator uses the configured default bucket rather than a fresh session.
# - No static hyperparameters are set here; they are supplied by the tuner.
sklearn_estimator_lr = SKLearn(
    entry_point='SageMaker_Pipeline_Component_Codes/Logistic_Regression_Training.py',
    dependencies=["Requirements/training_requirements.txt"],
    instance_type=training_instance_type,
    framework_version='0.20.0',
    role=role,
    output_path=pipeline_output_path,
    sagemaker_session=sagemaker_session,
)

In [6]:
from sagemaker.tuner import (
    ContinuousParameter,
    IntegerParameter,
    CategoricalParameter,
    HyperparameterTuner,
    WarmStartConfig,
    WarmStartTypes
)

    
# Tuning setup for the logistic-regression estimator.
# The objective metric is "accuracy", extracted with the regex below.
objective_metric_name_lr = "accuracy"
metric_definitions_lr = [{"Name": "accuracy", "Regex": "accuracy:([0-9\\.]+)"}]

# Search space. Only "liblinear" and "saga" are offered as solvers because
# scikit-learn's LogisticRegression rejects penalty="l1" for "newton-cg",
# "lbfgs" and "sag"; with random sampling over both knobs those pairings
# would end up as failed training jobs.
# NOTE(review): assumes the training script forwards these values unchanged
# to LogisticRegression - confirm against Logistic_Regression_Training.py.
hyperparameter_ranges_lr = {
    "penalty": CategoricalParameter(['l1', 'l2']),
    "C": ContinuousParameter(min_value=0.001, max_value=1000.0, scaling_type="Logarithmic"),
    "solver": CategoricalParameter(["liblinear", "saga"]),
}

# Random search: 4 jobs in total, all allowed to run in parallel.
tuner_log_lr = HyperparameterTuner(
    estimator=sklearn_estimator_lr,
    objective_metric_name=objective_metric_name_lr,
    hyperparameter_ranges=hyperparameter_ranges_lr,
    metric_definitions=metric_definitions_lr,
    strategy="Random",
    max_jobs=4,
    max_parallel_jobs=4,
)



from sagemaker.inputs import TrainingInput


# Tuning step for logistic regression. Both channels are CSV and point at the
# S3 URIs of the same-named outputs of the preprocessing step.
# NOTE(review): the "validation" output of the preprocessing step is not
# consumed here; the "test" output is used as the second channel - confirm
# this is intended.
step_tuning_lr = TuningStep(
    name="HPTuning-Logistic-Regression",
    tuner=tuner_log_lr,
    inputs={
        channel: TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[channel].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel in ("train", "test")
    },
)

In [7]:
# Estimator for the decision-tree training script.
# - instance_type is taken from the TrainingInstanceType pipeline parameter
#   instead of a hard-coded literal, so overriding that parameter actually
#   changes the training hardware. Note the parameter's declared default is
#   "ml.m5.4xlarge", larger than the previously hard-coded "ml.m4.xlarge".
# - The session created in the first cell is passed explicitly so the
#   estimator uses the configured default bucket rather than a fresh session.
# - Static hyperparameters are kept as they were.
sklearn_estimator_dt = SKLearn(
    entry_point='SageMaker_Pipeline_Component_Codes/DecisionTree_Training.py',
    dependencies=["Requirements/training_requirements.txt"],
    instance_type=training_instance_type,
    framework_version='0.20.0',
    role=role,
    output_path=pipeline_output_path,
    hyperparameters={'max_depth': 3, 'random_state': 1},
    sagemaker_session=sagemaker_session,
)

In [8]:

# Tuning setup for the decision-tree estimator.
# The objective metric is "accuracy", extracted with the regex below.
objective_metric_name_dt = "accuracy"
metric_definitions_dt = [
    {"Name": "accuracy", "Regex": "accuracy:([0-9\\.]+)"},
]

# Search space over split criterion, tree depth and minimum leaf size.
# NOTE(review): max_depth also has a fixed value (3) in the estimator's
# static hyperparameters - confirm the tuned range is the one meant to apply.
hyperparameter_ranges_dt = dict(
    criterion=CategoricalParameter(['gini', 'entropy']),
    max_depth=IntegerParameter(min_value=2, max_value=12),
    min_samples_leaf=IntegerParameter(min_value=1, max_value=5),
)

# Random search: 5 jobs in total, all allowed to run in parallel.
tuner_log_dt = HyperparameterTuner(
    estimator=sklearn_estimator_dt,
    objective_metric_name=objective_metric_name_dt,
    hyperparameter_ranges=hyperparameter_ranges_dt,
    metric_definitions=metric_definitions_dt,
    strategy="Random",
    max_jobs=5,
    max_parallel_jobs=5,
)



from sagemaker.inputs import TrainingInput


# Tuning step for the decision tree. The step name follows the same
# "HPTuning-<model>" convention as the logistic-regression step; the previous
# "Iris-..." name was a leftover from another project and mislabelled this
# step inside the churn pipeline.
# Both channels are CSV and point at the S3 URIs of the same-named outputs of
# the preprocessing step.
step_tuning_dt = TuningStep(
    name="HPTuning-Decision-Tree",
    tuner=tuner_log_dt,
    inputs={
        channel: TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[channel].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel in ("train", "test")
    },
)


In [9]:
from sagemaker.workflow.pipeline import Pipeline


# Assemble the pipeline from the parameters and steps defined in the cells
# above, then register it under the execution role.
pipeline_name = "Churn-Demo"

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        # data locations
        train_data,
        test_data,
        pipeline_output_path,
        # compute sizing
        processing_instance_count,
        processing_instance_type,
        training_instance_type,
    ],
    steps=[
        step_process,
        step_tuning_lr,
        step_tuning_dt,
    ],
)

# Last expression of the cell, so the service response is shown as its output.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:852619674999:pipeline/churn-demo',
 'ResponseMetadata': {'RequestId': 'bd5a036d-085a-457f-898d-c12fdfa69cdb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bd5a036d-085a-457f-898d-c12fdfa69cdb',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '78',
   'date': 'Sun, 22 May 2022 05:23:45 GMT'},
  'RetryAttempts': 0}}

In [10]:
# Start an execution of the pipeline registered by the upsert call in the
# previous cell. Parameters are not overridden, and the returned execution
# object is not stored - it is only shown as the cell output.
pipeline.start()

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:852619674999:pipeline/churn-demo/execution/fk3cfumvpy25', sagemaker_session=<sagemaker.session.Session object at 0x7f9ab33399b0>)

In [2]:
import pandas

In [17]:
# Toy frame for the experiments below: two integer columns (A, C) and two
# string columns (B, D), where the last value of D is missing.
a = pandas.DataFrame(
    {
        'A': [1, 2, 3, 4],
        'B': ['A', 'B', 'C', 'D'],
        'C': [9, 10, 11, 12],
        'D': ['E', 'F', 'G', None],
    }
)

In [9]:
a

Unnamed: 0,A,B,C
0,1,A,9
1,2,B,10
2,3,C,11
3,4,D,12


In [10]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every numeric column of `a`, writing the scaled values back
# into the same frame (so `a` itself is modified by this cell).
numeric_feats = a.select_dtypes('number').columns.tolist()

scaler = MinMaxScaler()
# fit() hands back the fitted scaler, which is kept under the name `model`.
model = scaler.fit(a[numeric_feats])
a[numeric_feats] = model.transform(a[numeric_feats])

In [11]:
a

Unnamed: 0,A,B,C
0,0.0,A,0.0
1,0.333333,B,0.333333
2,0.666667,C,0.666667
3,1.0,D,1.0


In [19]:
a

Unnamed: 0,A,B,C,D
0,1,A,9,E
1,2,B,10,F
2,3,C,11,G
3,4,D,12,


In [20]:
# One-hot encode the string columns B and D, dropping the first level of each;
# the result is only displayed, `a` is left as it is.
pandas.get_dummies(
    data=a,
    columns=['B', 'D'],
    drop_first=True,
)

Unnamed: 0,A,C,B_B,B_C,B_D,D_F,D_G
0,1,9,0,0,0,0,0
1,2,10,1,0,0,1,0
2,3,11,0,1,0,0,1
3,4,12,0,0,1,0,0
