# SM09: Train Pre-built Model

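Note: The target column must be the first column of the training and validation CSV files (the built-in XGBoost algorithm expects the label first and no header row).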

In [18]:
import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.functions import Join
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.estimator import Estimator
from sagemaker.workflow.pipeline import Pipeline

# --- Static configuration ---------------------------------------------------
prefix = '1_ins_dataset'  # key prefix inside the default bucket
pipeline_name = "InsExample"  # SageMaker Pipeline name
# NOTE(review): this name contains spaces — confirm it is accepted as a
# model package group name before it is used with the model registry.
model_package_group_name = "Insurance Co Example"  # Model name in model registry
framework_version = "0.23-1"

# --- Values derived from the active SageMaker session -------------------------
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()

# Default S3 locations of the training and validation feature files.
train_uri = f's3://{bucket}/{prefix}/final/train/train_feats.csv'
validate_uri = f's3://{bucket}/{prefix}/final/validate/validate_feats.csv'



# Resource tags attached to the pipeline when it is created or updated.
# This list must be defined: the pipeline upsert call further down passes
# `tags=tags`, so leaving it commented out raises NameError on a clean run.
tags = [
    {"Key": "DATASET", "Value": "InsCOIL"},
    {"Key": "SOURCE", "Value": "UCI"},
]

# Pipeline parameters and their default values.

# Processing resources.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.t3.medium",
)

# Training resources.
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)

# Input data locations; defaults point at the feature files under the
# default bucket.
train_data = ParameterString(name="TrainData", default_value=train_uri)
validate_data = ParameterString(name="ValidateData", default_value=validate_uri)

# Approval status parameter, defaulting to manual approval.
model_approval_status = ParameterString(
    name='ModelApprovalStatus',
    default_value='PendingManualApproval',
)

# Look up the XGBoost 1.2-2 container image for the current region.
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.2-2',
    py_version='py3',
    instance_type='ml.m5.xlarge',
)

# Hyperparameters for a binary classifier (logistic objective).
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'num_round': 25,
}

# Single-instance estimator; the instance type is a pipeline parameter so it
# can differ from the default without editing this code.
xgb_estimator = Estimator(
    image_uri=image_uri,
    instance_type=training_instance_type,
    instance_count=1,
    role=role,
    disable_profiler=True,
)
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# Input channels for the training job: both are CSV files whose S3 locations
# come from the TrainData / ValidateData pipeline parameters.
training_channels = {
    'train': TrainingInput(s3_data=train_data, content_type='text/csv'),
    'validation': TrainingInput(s3_data=validate_data, content_type='text/csv'),
}

# Pipeline step that trains the XGBoost estimator on those channels.
step_train = TrainingStep(
    name='train_model',
    estimator=xgb_estimator,
    inputs=training_channels,
)

# Parameters exposed on the pipeline definition.
pipeline_parameters = [
    processing_instance_type,
    processing_instance_count,
    training_instance_type,
    train_data,
    validate_data,
]

# The pipeline consists of the single training step.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=[step_train],
)

# Register the pipeline definition under the execution role, with tags.
pipeline.upsert(role_arn=role, tags=tags)

# Start an execution; kept as a bare expression so a notebook displays the
# returned execution object.
pipeline.start(execution_display_name="InsPrebuiltModel3")

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:707031497630:pipeline/insexample/execution/as9z4dij3nl0', sagemaker_session=<sagemaker.session.Session object at 0x7effda53fd60>)