In [1]:
%pip install sagemaker

Collecting sagemaker
  Downloading sagemaker-2.243.3-py3-none-any.whl.metadata (16 kB)
Collecting boto3<2.0,>=1.35.75 (from sagemaker)
  Downloading boto3-1.38.1-py3-none-any.whl.metadata (6.6 kB)
Collecting docker (from sagemaker)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastapi (from sagemaker)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting google-pasta (from sagemaker)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting importlib-metadata<7.0,>=1.4.0 (from sagemaker)
  Using cached importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting jsonschema (from sagemaker)
  Using cached jsonschema-4.23.0-py3-none-any.whl.metadata (7.9 kB)
Collecting omegaconf<=2.3,>=2.2 (from sagemaker)
  Using cached omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pathos (from sagemaker)
  Downloading pathos-0.3.4-py3-none-any.whl.metadata (11 kB)
Collecting protobuf<6.0,>=3.12 (from s

In [2]:
import json
import yaml
import boto3

# import sagemaker
import sagemaker.session


from sagemaker.sklearn.estimator import SKLearn
from sagemaker.processing import (
    FrameworkProcessor,
    ProcessingInput,
    ProcessingOutput)
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import ProcessingStep, CacheConfig
from sagemaker.workflow.functions import Join
from sagemaker.workflow.pipeline import Pipeline



sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/julie.fisher/Library/Application Support/sagemaker/config.yaml


In [3]:
# Plain SDK session: supplies the region and account id read below.
session = sagemaker.session.Session()
# Pipeline session handed to the processor and to the Pipeline object.
pipe_session = PipelineSession()

region = session.boto_region_name
role = sagemaker.get_execution_role()
account_id = session.account_id()

# Tags passed to the pipeline upsert call.
tags = [{"Key": "Test", "Value": "me"}]

In [None]:
# Optional suffix appended to the model description; None skips it.
special_name = "debug"
# YAML file the pipeline parameters are read from.
config_file = "configs/param_config_simp.yaml"

In [None]:
# Load the pipeline parameters from YAML. The file handle gets its own name so
# that `config_file` keeps holding the path; binding the handle to the same
# name made this cell fail when re-run without restarting the kernel.
with open(config_file, "r", encoding="utf-8") as config_handle:
    config_params = yaml.safe_load(config_handle)

# First 10 characters of the timestamp (presumably YYYY-MM-DD; confirm
# against the config file).
date_ymd = config_params["date_ts"][:10]
pipeline_name = config_params["use_case"] + "Prod"
model_package_group_name = config_params["use_case"].lower() + "Prod"
model_name = model_package_group_name + "-" + config_params["date_ts"]
prod_uri = "s3://{}".format(config_params["bucket"])

# Identifier used both as an S3 prefix and as the execution display name:
# <target prefix><Filter value><interval>Day<yyyymmdd>[special_name]
model_description = (config_params["target"].split("_")[0]
                     + config_params["filter_value"].capitalize()
                     + str(config_params["day_interval"])
                     + "Day"
                     + date_ymd.replace("-", "")
                    )
if special_name is not None:
    model_description = model_description + special_name

# Paths the processing step uses for its input and output.
process_input_path = "/opt/ml/processing/input"
process_output_path = "/opt/ml/processing/output"

source_dir_path = "src/source_dir/mobility-" + config_params["date_ts"]

pipe_prefix = ["mobility_models", model_description]

# Path segments of the data location; joined with "/" where they are used.
data_filepath = [prod_uri, *pipe_prefix, "data"]

# Bookkeeping for the steps assembled in later cells.
step_counter = 1
step_list = []

step_cache_config = CacheConfig(
    enable_caching=True,
    expire_after=config_params["step_cache_config"])

In [None]:
s3_resource = boto3.resource('s3')
# `Key` is the object name inside the bucket, so it must not repeat the
# "s3://<bucket>/" prefix. With the prefix included, the object was stored
# under a literal "s3://..." key and the processing input, which reads
# s3://<bucket>/<model_description>/sql_queries/base_query.txt, could not
# find it.
s3_resource.Bucket(config_params['bucket']).upload_file(
    'sql_queries/base_query.txt',
    f"{model_description}/sql_queries/base_query.txt")

In [None]:
# Processor used to run the query script; framework version and instance
# sizing come from the YAML config.
query_processor = FrameworkProcessor(
    estimator_cls=SKLearn,
    framework_version=config_params["framework_version"],
    instance_type=config_params["instance_type_processing"],
    instance_count=config_params["instance_count_processing"],
    role=role,
    sagemaker_session=pipe_session,
    base_job_name="data_query",
)

# Collects the query steps as they are created.
query_list = []

In [None]:
# Query one table
# Set arguments
table = "WEB_DATA"
# Container arguments are passed as a list of strings, so the interval is
# stringified explicitly (a no-op if the config already stores it as text).
# NOTE(review): "--output_path" uses an underscore while the other flags use
# hyphens; left unchanged because feat_query.py is not visible here. Confirm
# it matches the script's argument parser.
arguments = [
    "--table", table,
    "--input-path", process_input_path,
    "--output_path", process_output_path,
    "--db-type", "mysql",
    "--dbname", "machinelearning",
    "--folder-date", date_ymd,
    "--interval", str(config_params["day_interval"])
]

query_args = query_processor.run(
    code="feat_query.py",
    source_dir=source_dir_path,
    inputs=[
        ProcessingInput(
            # Location the base query file is read from. The "{}" placeholder
            # is required: without it .format() discarded the bucket name and
            # the joined URI came out as "s3:///<model_description>/...".
            source=Join(
                on="/",
                values=[
                    "s3://{}".format(config_params['bucket']),
                    model_description,
                    "sql_queries",
                    "base_query.txt"]),
            destination=process_input_path,
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="features",
            source=process_output_path,
            # Destination is the data location plus a "features" segment.
            destination=Join(
                on="/",
                values=[
                    *data_filepath,
                    "features"])
        )
    ],
    arguments=arguments
)

# Wrap the run arguments in a pipeline step named "<counter>-<table>".
query_dbs = ProcessingStep(
    name="{}-{}".format(step_counter, table),
    step_args=query_args,
    cache_config=step_cache_config
)
step_list.append(query_dbs)
query_list.append(query_dbs)

In [None]:
# Assemble the pipeline from the collected steps.
pipeline = Pipeline(
    name=pipeline_name,
    steps=step_list,
    sagemaker_session=pipe_session,
)

# Write an indented copy of the pipeline definition to disk.
with open("pipeline_definition_simp.json", "w", encoding="utf-8") as definition_out:
    definition_out.write(
        json.dumps(json.loads(pipeline.definition()), indent=4))

# Upload the pipeline and start an execution.
pipeline.upsert(role_arn=role, tags=tags)

pipeline.start(execution_display_name=model_description)