# Sklearn Model Training 

In [1]:
import boto3
import numpy as np
import pandas as pd
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.workflow.pipeline_context import PipelineSession
import time

# AWS session shared by the rest of the notebook. It uses the default credential
# chain; to run under a named profile instead, swap in:
#   boto3_session = boto3.Session(profile_name="default")
boto3_session = boto3.Session()

# Session object handed to the estimator, the model and the pipeline below.
sagemaker_session = PipelineSession(boto_session=boto3_session)

# IAM role used for training, model registration and the pipeline itself.
role = get_execution_role()
region = sagemaker_session.boto_region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [2]:
# Names of the SageMaker resources created by this notebook.
# `model_name` is the prefix of the pipeline step names ("<model_name>_train", ...).
model_name = "dedup"
pipeline_name = "DedupTrainingPipeline"
model_package_group_name = "dedup-model-package-group-gradient"

# S3 location of the project data; the training CSV lives under
# s3://<bucket>/<prefix>/sagemaker/input_data/.
bucket = "mobi.source.data.batch.dev"
prefix = "dedup-findmatches"
train_input = f"s3://{bucket}/{prefix}/sagemaker/input_data/train.csv"

# Scikit-learn container version and the training entry-point script.
FRAMEWORK_VERSION = "1.2-1"
script_path = "train.py"

## Create Model

Use the SKLearn estimator from the SageMaker SDK to run the training script.

In [3]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import TrainingStep
from sagemaker.workflow.steps import CacheConfig

# Step caching: results are reused for up to 24 hours (ISO-8601 duration "PT24H").
cache_config = CacheConfig(enable_caching=True, expire_after="PT24H")

# Scikit-learn estimator that runs `script_path` in the SageMaker sklearn container.
estimator = SKLearn(
    entry_point=script_path,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.c4.xlarge",
    role=role,
    sagemaker_session=sagemaker_session,
    # hyperparameters={"max_leaf_nodes": 30},
)

# Training step of the pipeline. Because the estimator is bound to a
# PipelineSession, `fit()` does not launch a job here; it returns the step
# arguments. Passing them through `step_args` replaces the deprecated
# `estimator=` / `inputs=` form and matches how the register step is built
# from `model.register(...)` further down.
step_train = TrainingStep(
    name=f"{model_name}_train",
    step_args=estimator.fit(inputs={"train": train_input}),
    cache_config=cache_config,
)



In [4]:
# Rebuild the inference code archive: delete any previous tarball, then archive
# everything matched by `*` inside inference_code/ into inference.tar.gz.
!rm -rf inference_code/inference.tar.gz
!(cd inference_code/ && tar -czf inference.tar.gz *)

In [6]:
# Name of the inference script (presumably packaged inside inference.tar.gz).
entry_point_dedup_model = "inference.py"

# Upload the packaged inference code to S3; the returned value is the S3 URI of
# the uploaded archive.
inference_tar = sagemaker.s3.S3Uploader.upload(
    sagemaker_session=sagemaker_session,
    desired_s3_uri=f"s3://{bucket}/{prefix}/sagemaker/input_data",
    local_path="inference_code/inference.tar.gz",
)

In [7]:
# Confirm where the inference archive ended up in S3.
print(f"tar uploaded to {inference_tar}")

tar uploaded to s3://mobi.source.data.batch.dev/dedup-findmatches/sagemaker/input_data/inference.tar.gz


In [8]:
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.workflow.model_step import ModelStep

# NOTE(review): these three values appear to point at the artifact of one earlier
# training job and are not used by the model definition below, which takes its
# model data from the training step. Kept in case later cells still reference them.
sagemaker_bucket = "sagemaker-us-east-1-769026081114"
model_bucket = "sagemaker-scikit-learn-2024-01-03-20-35-22-490"
model_s3_path = f"s3://{sagemaker_bucket}/{model_bucket}/output/model.tar.gz"

# Model built from the artifacts of the training step, served with the inference
# code uploaded earlier. The entry point comes from `entry_point_dedup_model`
# so the script name is defined in one place only.
model = SKLearnModel(
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    entry_point=entry_point_dedup_model,
    source_dir=inference_tar,
    framework_version=FRAMEWORK_VERSION,
    py_version="py3",
    sagemaker_session=sagemaker_session,
)

In [9]:
# One-time setup, kept commented out for reference: explicitly create the model
# package group. Uncomment and run it if the group has to be created by hand.

# client = boto3.client("sagemaker")

# model_package_group_input_dict = {
#     "ModelPackageGroupName": model_package_group_name,
#     "ModelPackageGroupDescription": "Dedup sklearn model package group",
# }

# create_model_package_group_response = client.create_model_package_group(
#     **model_package_group_input_dict
# )
# model_package_group_arn = create_model_package_group_response["ModelPackageGroupArn"]
# print(f"ModelPackageGroup Arn : {model_package_group_arn}")

In [10]:
# Instance types offered for both real-time inference and batch transform.
supported_instance_types = ["ml.m5.xlarge", "ml.m5.2xlarge", "ml.m5.4xlarge"]

# Arguments for registering the model in the package group. CSV in, CSV out;
# every registered version is marked "Approved" straight away.
register_model_step_args = model.register(
    model_package_group_name=model_package_group_name,
    approval_status="Approved",
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=list(supported_instance_types),
    transform_instances=list(supported_instance_types),
)



In [11]:
# Pipeline step that performs the registration, named "<model_name>_register".
register_model_step = ModelStep(
    step_args=register_model_step_args,
    name=f"{model_name}_register",
)

In [12]:
from sagemaker.workflow.pipeline import Pipeline

# The pipeline has two steps: training, then registration of the model built
# from the training step's artifacts.
pipeline_steps = [step_train, register_model_step]
pipeline = Pipeline(
    name=pipeline_name,
    steps=pipeline_steps,
    sagemaker_session=sagemaker_session,
)

In [13]:
# Create or update (upsert) the pipeline definition under `role`; the response,
# including the pipeline ARN, is shown as the cell output.
pipeline.upsert(role_arn=role)



Using provided s3_resource




Using provided s3_resource


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:769026081114:pipeline/DedupTrainingPipeline',
 'ResponseMetadata': {'RequestId': '616340a8-79f0-4540-b92c-6c3c73c9b1c4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '616340a8-79f0-4540-b92c-6c3c73c9b1c4',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '89',
   'date': 'Sun, 28 Jan 2024 04:18:49 GMT'},
  'RetryAttempts': 0}}

In [14]:
#pipeline.describe()

In [15]:
# Start a pipeline execution and keep its handle for the status cells below.
execution = pipeline.start()

In [16]:
# Show the execution's current details (status, timestamps, who started it).
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:769026081114:pipeline/DedupTrainingPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:769026081114:pipeline/DedupTrainingPipeline/execution/fcnhut4svgiu',
 'PipelineExecutionDisplayName': 'execution-1706415530334',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 1, 28, 4, 18, 50, 239000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 1, 28, 4, 18, 50, 239000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:769026081114:user-profile/d-7ozigerxo8wq/likun',
  'UserProfileName': 'likun',
  'DomainId': 'd-7ozigerxo8wq'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:769026081114:user-profile/d-7ozigerxo8wq/likun',
  'UserProfileName': 'likun',
  'DomainId': 'd-7ozigerxo8wq'},
 'ResponseMetadata': {'RequestId': '99f906e4-448a-4e05-a544-12c68c7bfb6f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '99f906e4-448a-4e0

In [17]:
# Wait for the pipeline execution to finish before inspecting the model registry
# in the cells below.
execution.wait()

# Check results

In [18]:
# Build the low-level SageMaker client from the notebook's own boto3 session so
# that a profile configured on `boto3_session` also applies to these direct
# API calls.
client = boto3_session.client("sagemaker")

# Versions in the package group, newest first. The approval filter is applied
# by the service so approved versions are not lost beyond the first result page.
model_packages = client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    ModelApprovalStatus="Approved",
    SortBy="CreationTime",
    SortOrder="Descending",
)

In [19]:
# Pick the first approved entry; the listing is sorted newest-first, so this is
# the most recently created approved version.
model_package = next(
    (
        pk
        for pk in model_packages["ModelPackageSummaryList"]
        if pk["ModelApprovalStatus"] == "Approved"
    ),
    None,
)

# Fail with an explicit message (same exception type as the old `[...][0]`)
# when the group holds no approved version.
if model_package is None:
    raise IndexError(
        f"No approved model package found in group {model_package_group_name!r}"
    )

In [20]:
# Display the selected model package summary.
model_package

{'ModelPackageGroupName': 'dedup-model-package-group-gradient',
 'ModelPackageVersion': 1,
 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:769026081114:model-package/dedup-model-package-group-gradient/1',
 'CreationTime': datetime.datetime(2024, 1, 28, 4, 22, 15, 607000, tzinfo=tzlocal()),
 'ModelPackageStatus': 'Completed',
 'ModelApprovalStatus': 'Approved'}

In [21]:
# ARN of the selected model package version; used below to create and look up models.
model_package_arn = model_package["ModelPackageArn"]
model_package_arn

'arn:aws:sagemaker:us-east-1:769026081114:model-package/dedup-model-package-group-gradient/1'

In [23]:
# Create the model
model_name = "dedup-model-name-gradient"
create_model_response = client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        'ModelPackageName': model_package_arn,
    }
)

In [24]:
# Look up every SageMaker Model whose container references the selected
# model package version.
package_filter = {
    "Name": "Model.Containers.ModelPackageName",
    "Operator": "Equals",
    "Value": model_package_arn,
}
search_response = client.search(
    Resource="Model",
    SearchExpression={"Filters": [package_filter]},
)
models = search_response["Results"]

models

[{'Model': {'Model': {'ModelName': 'dedup-model-package-group-gradient-2024-01-28-04-26-16-079',
    'Containers': [{'Mode': 'SingleModel',
      'ModelPackageName': 'arn:aws:sagemaker:us-east-1:769026081114:model-package/dedup-model-package-group-gradient/1'}],
    'ExecutionRoleArn': 'arn:aws:iam::769026081114:role/SageMaker',
    'CreationTime': datetime.datetime(2024, 1, 28, 4, 26, 16, tzinfo=tzlocal()),
    'ModelArn': 'arn:aws:sagemaker:us-east-1:769026081114:model/dedup-model-package-group-gradient-2024-01-28-04-26-16-079',
    'EnableNetworkIsolation': False},
   'LastBatchTransformJob': {'TransformJobName': 'dedup-model-package-group-gradient-2024-01-28-04-26-16-891',
    'TransformJobArn': 'arn:aws:sagemaker:us-east-1:769026081114:transform-job/dedup-model-package-group-gradient-2024-01-28-04-26-16-891',
    'TransformJobStatus': 'InProgress',
    'ModelName': 'dedup-model-package-group-gradient-2024-01-28-04-26-16-079',
    'MaxConcurrentTransforms': 4,
    'MaxPayloadInMB':