In [None]:
import sagemaker
# SageMaker session object created for the cells below.
sagemaker_session = sagemaker.Session()

# IAM execution role ARN handed to the processor defined later in the notebook.
# NOTE(review): account-specific ARN is hard-coded here — consider reading it from configuration.
role = "arn:aws:iam::282698011778:role/service-role/AmazonSageMaker-ExecutionRole-20250701T225193"

In [None]:
from sagemaker.workflow.parameters import ParameterString, ParameterInteger

# Pipeline parameter definitions: the input data location and the two instance counts.
# NOTE(review): the default bucket "mlops-testing-buck" differs from the
# "mlops-testing-bucket-kulsin" bucket used for the training data later in this
# notebook — confirm the name is not truncated.
input_data = ParameterString(name="InputDataUrl", default_value="s3://mlops-testing-buck")
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
training_instance_count = ParameterInteger(name="TrainingInstanceCount", default_value=1)

In [None]:
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

# Scikit-learn processor that executes the preprocessing script.
sklearn_processor = SKLearnProcessor(
    base_job_name='sklearn-preprocessing',  # optional, helps identify the processing job
    framework_version="0.23-1",
    instance_type="ml.m5.large",
    instance_count=1,  # optional, defaults to 1
    role=role,
)

# Processing step: mounts the input data under /opt/ml/processing/input and
# declares the "train" and "validation" outputs written by the script.
step_process = ProcessingStep(
    name="SampleProcessing",
    processor=sklearn_processor,
    code="processing_script.py",  # this script should exist in your directory
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/output/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/output/validation"),
    ],
)


In [None]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.workflow.steps import TrainingStep
from sagemaker.inputs import TrainingInput
from sagemaker.session import Session

# Session and execution role used by the training job.
sagemaker_session = Session()
role = "arn:aws:iam::282698011778:role/service-role/AmazonSageMaker-ExecutionRole-20250701T225193"

# Number of training instances.
# NOTE(review): this rebinds `training_instance_count`, which the parameters cell
# defined as a ParameterInteger, to a plain int — confirm that is intended.
training_instance_count = 1

# S3 object holding the training CSV.
train_data_path = "s3://mlops-testing-bucket-kulsin/sample-data.csv"

# XGBoost 1.7-1 container image for the region of the current session.
xgboost_image_uri = sagemaker.image_uris.retrieve(
    "xgboost",
    sagemaker_session.boto_region_name,
    version="1.7-1",
)

# Estimator that trains with the XGBoost image and writes model output to the
# session's default bucket under "output".
xgboost_estimator = Estimator(
    image_uri=xgboost_image_uri,
    role=role,
    instance_count=training_instance_count,
    instance_type='ml.m5.xlarge',
    volume_size=30,
    max_run=3600,
    input_mode='File',
    output_path=f"s3://{sagemaker_session.default_bucket()}/output",
)

# Training step fed by a single "train" channel (CSV).
train_channel = TrainingInput(s3_data=train_data_path, content_type="csv")
step_train = TrainingStep(
    name="SampleTraining",
    estimator=xgboost_estimator,
    inputs={"train": train_channel},
)

In [None]:
# Imports
import os
import json
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.parameters import ParameterString
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.inputs import TrainingInput
from sagemaker.sklearn import SKLearn
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Load the pipeline configuration from the JSON file named by CONFIG_FILE_PATH.
config_file_path = os.environ.get("CONFIG_FILE_PATH", "")
if not config_file_path:
    raise ValueError("CONFIG_FILE_PATH environment variable is not set.")

print(f"Reading config from: {config_file_path}")
# Read as UTF-8 explicitly so decoding does not depend on the platform's locale encoding.
with open(config_file_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Report every missing key in one error instead of failing on the first lookup.
required_config_keys = ("role", "input_data", "pipeline_name", "region")
missing_config_keys = [key for key in required_config_keys if key not in config]
if missing_config_keys:
    raise KeyError(
        f"Config file {config_file_path} is missing required keys: "
        f"{', '.join(missing_config_keys)}"
    )

role = config["role"]
input_data = config["input_data"]
pipeline_name = config["pipeline_name"]
region = config["region"]
print(f"Loaded config: role={role}, input_data={input_data}, pipeline_name={pipeline_name}, region={region}")


# Session that is later passed to the Pipeline object.
sagemaker_session = sagemaker.Session()

# Both scripts are looked up one directory above the folder that holds the config file.
base_dir = os.path.dirname(config_file_path)
processing_script_path, training_script_path = (
    os.path.join(base_dir, "..", script_name)
    for script_name in ("processing_script.py", "train.py")
)

# Stop before building any step if either script is absent.
if not os.path.exists(processing_script_path):
    raise FileNotFoundError(f"Processing script not found: {processing_script_path}")
if not os.path.exists(training_script_path):
    raise FileNotFoundError(f"Training script not found: {training_script_path}")

# Scikit-learn processor that runs the preprocessing script.
processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=1,
    instance_type="ml.m5.large",
)

# Input mount point and the two named outputs of the preprocessing job.
raw_data_input = ProcessingInput(source=input_data, destination="/opt/ml/processing/input")
train_output = ProcessingOutput(output_name="train", source="/opt/ml/processing/train")
test_output = ProcessingOutput(output_name="test", source="/opt/ml/processing/test")

# Preprocessing step of the pipeline.
step_process = ProcessingStep(
    name="PreprocessData",
    processor=processor,
    code=processing_script_path,
    inputs=[raw_data_input],
    outputs=[train_output, test_output],
)

# Scikit-learn estimator that runs the training script.
estimator = SKLearn(
    role=role,
    entry_point=training_script_path,
    framework_version="0.23-1",
    instance_count=1,
    instance_type="ml.m5.large",
)

# The training channels read the S3 URIs of the preprocessing step's outputs,
# which makes this step consume what the PreprocessData step produces.
processing_outputs = step_process.properties.ProcessingOutputConfig.Outputs
step_train = TrainingStep(
    name="TrainModel",
    estimator=estimator,
    inputs={
        "training": TrainingInput(s3_data=processing_outputs["train"].S3Output.S3Uri),
        "testing": TrainingInput(s3_data=processing_outputs["test"].S3Output.S3Uri),
    },
)

# Assemble the pipeline from the preprocessing and training steps.
pipeline_steps = [step_process, step_train]
pipeline = Pipeline(
    name=pipeline_name,
    steps=pipeline_steps,
    parameters=[],
    sagemaker_session=sagemaker_session,
)

# Upsert the pipeline with the configured role, then start an execution right away.
pipeline.upsert(role_arn=role)
execution = pipeline.start()
print(f"Pipeline execution started with ARN: {execution.arn}")

In [None]:
"""
import boto3
from typing import Dict, List, Optional
#role = "arn:aws:iam::282698011778:role/service-role/AmazonSageMaker-ExecutionRole-20250701T225193"
role = config["role"]
input_data = config["input_data"]
pipeline_name = config["pipeline_name"]
region = config["region"]

# New fields from config
schedule_name = config["schedule_name"]
kms_key_arn = config["kms_key_arn"]
schedule_expression = config["schedule_expression"]

def configure_sagemaker_schedule(
    schedule_name: str,
    pipeline_name: str,
    role_arn: str,
    kms_key_arn: str,
    region_name: str,
    schedule_expression: str,
    pipeline_parameters: Optional[List[Dict]] = None
) -> Dict:

    Configure a compliant EventBridge schedule for SageMaker pipeline with KMS encryption.

    Args:
        schedule_name (str): Name of the schedule
        pipeline_name (str): Name of the SageMaker pipeline
        role_arn (str): IAM role ARN for execution
        kms_key_arn (str): KMS key ARN for encryption
        region_name (str): AWS region name
        schedule_expression (str): Cron expression for scheduling
        pipeline_parameters (List[Dict], optional): Pipeline parameters

    Returns:
        Dict: Response from EventBridge Scheduler

    # Initialize AWS clients
    eventbridge = boto3.client('scheduler')
    sts = boto3.client('sts')

    # Get account ID and construct pipeline ARN
    account_id = sts.get_caller_identity()['Account']
    pipeline_arn = f"arn:aws:sagemaker:{region_name}:{account_id}:pipeline/{pipeline_name}"

    # Prepare schedule configuration
    schedule_config = {
        'Name': schedule_name,
        'FlexibleTimeWindow': {
            'Mode': 'OFF'
        },
        'ScheduleExpression': schedule_expression,
        'KmsKeyArn': kms_key_arn,
        'Target': {
            'RoleArn': role_arn,
            'Arn': pipeline_arn,
            'Input': '{}',
            'SageMakerPipelineParameters': {
                'PipelineParameterList': pipeline_parameters or []
            }
        },
        'State': 'ENABLED'
    }

    try:
        # Check if schedule exists
        try:
            existing_schedule = eventbridge.get_schedule(Name=schedule_name)
            print(f"Found existing schedule: {schedule_name}")

            # Update if KMS key is different or missing
            if ('KmsKeyArn' not in existing_schedule or 
                existing_schedule['KmsKeyArn'] != kms_key_arn):
                print("Updating schedule with compliant KMS key...")
                response = eventbridge.update_schedule(**schedule_config)
                print(f"Schedule updated with KMS key: {kms_key_arn}")
            else:
                print("Schedule already compliant with specified KMS key")
                response = existing_schedule

        except eventbridge.exceptions.ResourceNotFoundException:
            print(f"Creating new schedule: {schedule_name}")
            response = eventbridge.create_schedule(**schedule_config)
            print(f"New schedule created with KMS key: {kms_key_arn}")

        return response

    except Exception as e:
        print(f"Error configuring schedule: {str(e)}")
        raise

# Basic schedule configuration
response = configure_sagemaker_schedule(
    schedule_name=schedule_name,
    pipeline_name=pipeline_name,
    role_arn=role,
    kms_key_arn=kms_key_arn,
    region_name=region,
    schedule_expression=schedule_expression
)
"""
import boto3
import json
from typing import Dict, List, Optional

# Values taken from the `config` dictionary loaded from the config file.
input_data, pipeline_name, region = (
    config["input_data"],
    config["pipeline_name"],
    config["region"],
)

# Scheduling fields from the same config.
schedule_name, schedule_expression = (
    config["schedule_name"],
    config["schedule_expression"],
)

# Fetch role_arn and cmk_key_arn from Secrets Manager
def get_scheduler_secrets(secret_name: str, region_name: str) -> Dict[str, str]:
    """
    Fetch the scheduler role ARN and CMK ARN from AWS Secrets Manager.

    The secret is expected to be stored as a JSON object string containing
    the keys ``role_arn`` and ``cmk_key_arn``.

    Args:
        secret_name (str): The name of the secret in Secrets Manager.
        region_name (str): The AWS region.

    Returns:
        Dict[str, str]: Dictionary containing role_arn and cmk_key_arn.

    Raises:
        ValueError: If the secret has no string payload, is not a JSON object,
            or lacks a non-empty value for either required key.
    """
    secret_manager = boto3.client('secretsmanager', region_name=region_name)
    response = secret_manager.get_secret_value(SecretId=secret_name)

    # A secret stored as binary has no 'SecretString' entry; fail with a clear message.
    secret_string = response.get('SecretString')
    if not secret_string:
        raise ValueError(f"Secret '{secret_name}' has no SecretString payload.")

    secret_data = json.loads(secret_string)
    if not isinstance(secret_data, dict):
        raise ValueError(f"Secret '{secret_name}' must contain a JSON object.")

    # Returning None for a missing ARN would only fail later, inside the scheduler call.
    required_keys = ("role_arn", "cmk_key_arn")
    missing_keys = [key for key in required_keys if not secret_data.get(key)]
    if missing_keys:
        raise ValueError(
            f"Secret '{secret_name}' is missing required keys: {', '.join(missing_keys)}"
        )

    return {key: secret_data[key] for key in required_keys}

# Look up the scheduler role ARN and CMK ARN in Secrets Manager.
secret_name = "sagemaker/scheduler-iam-role-credentials"
secrets = get_scheduler_secrets(secret_name, region_name=region)
role_arn, kms_key_arn = secrets["role_arn"], secrets["cmk_key_arn"]

# NOTE(review): these values come from a secret store and are echoed into the
# cell output below — confirm that is acceptable before sharing the notebook.
print(f"Fetched Role ARN from Secrets Manager: {role_arn}")
print(f"Fetched KMS Key ARN from Secrets Manager: {kms_key_arn}")

# Configure schedule function
def configure_sagemaker_schedule(
    schedule_name: str,
    pipeline_name: str,
    role_arn: str,
    kms_key_arn: str,
    region_name: str,
    schedule_expression: str,
    pipeline_parameters: Optional[List[Dict]] = None
) -> Dict:
    """
    Create or update the EventBridge Scheduler schedule for a SageMaker pipeline.

    The schedule is created when it does not exist. An existing schedule is
    updated whenever any caller-controlled setting (KMS key, schedule
    expression, target role, target pipeline ARN, pipeline parameters) differs
    from the requested value; otherwise it is left untouched.

    Args:
        schedule_name (str): Name of the schedule.
        pipeline_name (str): Name of the SageMaker pipeline to start.
        role_arn (str): IAM role ARN set on the schedule target.
        kms_key_arn (str): KMS key ARN set on the schedule.
        region_name (str): AWS region used for the scheduler client and the pipeline ARN.
        schedule_expression (str): Schedule expression passed to the scheduler.
        pipeline_parameters (List[Dict], optional): Pipeline parameter list; defaults to empty.

    Returns:
        Dict: The create/update response, or the existing schedule description
        when no change was needed.

    Raises:
        Exception: Any error from the AWS calls is printed and re-raised.
    """
    eventbridge = boto3.client('scheduler', region_name=region_name)
    sts = boto3.client('sts')
    account_id = sts.get_caller_identity()['Account']
    pipeline_arn = f"arn:aws:sagemaker:{region_name}:{account_id}:pipeline/{pipeline_name}"

    schedule_config = {
        'Name': schedule_name,
        'FlexibleTimeWindow': {'Mode': 'OFF'},
        'ScheduleExpression': schedule_expression,
        'KmsKeyArn': kms_key_arn,
        'Target': {
            'RoleArn': role_arn,
            'Arn': pipeline_arn,
            'Input': '{}',
            'SageMakerPipelineParameters': {
                'PipelineParameterList': pipeline_parameters or []
            }
        },
        'State': 'ENABLED'
    }

    try:
        try:
            existing_schedule = eventbridge.get_schedule(Name=schedule_name)
            print(f"Found existing schedule: {schedule_name}")

            # Checking only the KMS key would silently ignore a changed
            # expression, role, pipeline or parameter list.
            drifted_settings = _schedule_drift(existing_schedule, schedule_config)
            if drifted_settings:
                print(f"Updating schedule; out-of-date settings: {', '.join(drifted_settings)}")
                response = eventbridge.update_schedule(**schedule_config)
                print(f"Schedule updated with KMS key: {kms_key_arn}")
            else:
                print("Schedule already compliant")
                response = existing_schedule

        except eventbridge.exceptions.ResourceNotFoundException:
            print(f"Creating new schedule: {schedule_name}")
            response = eventbridge.create_schedule(**schedule_config)
            print(f"New schedule created with KMS key: {kms_key_arn}")

        return response

    except Exception as e:
        print(f"Error configuring schedule: {str(e)}")
        raise


def _schedule_drift(existing: Dict, desired: Dict) -> List[str]:
    """Return the names of caller-controlled settings whose stored value differs from the desired one."""
    existing_target = existing.get('Target') or {}
    desired_target = desired['Target']

    # Treat a missing parameter section in the stored schedule as an empty list.
    existing_parameters = (
        (existing_target.get('SageMakerPipelineParameters') or {}).get('PipelineParameterList') or []
    )
    desired_parameters = desired_target['SageMakerPipelineParameters']['PipelineParameterList']

    comparisons = {
        'KmsKeyArn': (existing.get('KmsKeyArn'), desired['KmsKeyArn']),
        'ScheduleExpression': (existing.get('ScheduleExpression'), desired['ScheduleExpression']),
        'Target.RoleArn': (existing_target.get('RoleArn'), desired_target['RoleArn']),
        'Target.Arn': (existing_target.get('Arn'), desired_target['Arn']),
        'Target.PipelineParameterList': (existing_parameters, desired_parameters),
    }
    return [name for name, (current, wanted) in comparisons.items() if current != wanted]

# Apply the schedule using config.json values plus the ARNs fetched from Secrets Manager.
schedule_settings = {
    "schedule_name": schedule_name,
    "pipeline_name": pipeline_name,
    "role_arn": role_arn,
    "kms_key_arn": kms_key_arn,
    "region_name": region,
    "schedule_expression": schedule_expression,
}
response = configure_sagemaker_schedule(**schedule_settings)
