In [None]:
!pip install s3fs
!pip install sagemaker

#### Introduction
This notebook outlines a complete SageMaker pipeline for building, training, and registering a machine learning model using the Scikit-learn framework and SageMaker's Linear Learner algorithm. Here’s a summary of its components and flow:

#### Initialization
Initializes a SageMaker session and the role required to execute the pipeline.
Defines pipeline parameters (S3 data paths, instance type) for flexibility in configurations.

#### Data Processing

Uses SKLearnProcessor to preprocess data (e.g., test-data.csv). A ProcessingStep is defined to execute the preprocessing with outputs for training and validation datasets.
Data is processed and stored in S3 for further use.

#### Model Training

Defines a TrainingStep to train a binary classification model using the SageMaker Linear Learner algorithm.
The training step takes in processed training and validation data from the previous step.

#### Model Registration

Registers the trained model in the SageMaker model registry using RegisterModel. This makes the model available for deployment and further evaluations.

#### Pipeline Definition

Combines the steps (data processing, training, and model registration) into a single Pipeline. The pipeline is created or updated and executed using SageMaker's pipeline capabilities.

#### Model Deployment

Uses the SageMaker client to look up the latest model package in the model registry and deploys it to a SageMaker endpoint.

#### Conclusion
The pipeline automates the entire machine learning workflow, from preprocessing and training through model registration to preparation for deployment.

# Import libraries

In [None]:
import os
import time
import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.model import ModelPackage


# A single boto3 session backs both the low-level client and the SageMaker SDK
# session, so everything below resolves the same credentials and region.
sess = boto3.Session()
sm = sess.client("sagemaker")
sagemaker_session = sagemaker.Session(boto_session=sess)

region = sess.region_name

pipeline_name = "linear-linear"  # SageMaker Pipeline name

# Define bucket, prefix, role and the model package group.
# The pipeline and deployment cells further down read `bucket`, `prefix` and
# MODEL_PACKAGE_GROUP_NAME; without these assignments they fail with NameError
# on a fresh kernel.
# NOTE(review): the values below are placeholders -- adjust them to your account.
# The raw input is expected at s3://{bucket}/{prefix}/input/test-data.csv.
bucket = sagemaker_session.default_bucket()
prefix = "linear-learner-pipeline"
MODEL_PACKAGE_GROUP_NAME = "LinearLearnerModelPackageGroup"
role = get_execution_role()

# Define a Processing Step for Feature Engineering <a class="anchor" id="training"></a>


In [None]:
%%writefile preprocessing.py

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Container-side file locations for the processing job.
_input_dir = '/opt/ml/processing/input'  # S3 input gets mounted here
_train_dir = '/opt/ml/processing/output/train'
_validation_dir = '/opt/ml/processing/output/validation'

input_data_path = os.path.join(_input_dir, 'test-data.csv')
output_train_path = os.path.join(_train_dir, 'train.csv')
output_validation_path = os.path.join(_validation_dir, 'validation.csv')

def preprocess_data(data):
    """Clean the raw frame and separate features from the label.

    Rows containing any missing value are dropped, and the ``gender`` and
    ``cancer_type`` columns are label-encoded to integers. When an
    ``outcome`` column is present it is mapped to 1 for ``'survived'`` and
    0 for every other value, and returned separately as the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input; must contain ``gender`` and ``cancer_type`` columns.
        The caller's frame is not modified.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the feature frame without ``outcome`` and
        ``y`` is the binary label Series, or ``None`` when the input has no
        ``outcome`` column.
    """
    # Explicit copy: the column assignments below must not write into (or
    # warn about) a view of the caller's frame.
    data = data.dropna().copy()

    # Encode categorical features
    data['gender'] = LabelEncoder().fit_transform(data['gender'])
    data['cancer_type'] = LabelEncoder().fit_transform(data['cancer_type'])

    # Unlabelled input is allowed. Check for the column *before* touching it,
    # otherwise this branch can never be reached (KeyError comes first).
    if 'outcome' not in data.columns:
        return data, None

    y = data['outcome'].eq('survived').astype('int64')
    X = data.drop('outcome', axis=1)
    return X, y

if __name__ == "__main__":
    # Load the dataset
    print("Reading input data from:", input_data_path)
    data = pd.read_csv(input_data_path)

    # Preprocess the data
    print("Preprocessing data...")
    X, y = preprocess_data(data)

    # Split BEFORE scaling. Same seed and stratification as before, so the
    # row assignment to train/validation is unchanged.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training rows only and reuse its statistics for
    # the validation rows; fitting on the full dataset would leak validation
    # information into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Ensure the label is the first column in the dataset.
    # The scaled features are plain arrays, so the labels are re-indexed
    # from 0 to line up with them row by row.
    train_data = pd.concat([y_train.reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
    print('train_data', train_data.head(2))
    validation_data = pd.concat([y_test.reset_index(drop=True), pd.DataFrame(X_test)], axis=1)
    print('validation_data', validation_data.head(2))

    print("Saving train and validation data", output_train_path, ' + ', output_validation_path)

    # Make sure the target directories exist before writing.
    os.makedirs(os.path.dirname(output_train_path), exist_ok=True)
    os.makedirs(os.path.dirname(output_validation_path), exist_ok=True)

    # Save the datasets as CSV files without headers
    train_data.to_csv(output_train_path, index=False, header=False)
    validation_data.to_csv(output_validation_path, index=False, header=False)




In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import CreateModelStep, TransformStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker import Model
from sagemaker.inputs import CreateModelInput
from sagemaker import ModelPackage


# SageMaker session and execution role used by every step in this cell
sagemaker_session = sagemaker.Session()
role = get_execution_role()


# Default values for the pipeline parameters
_default_train_data_uri = f's3://{bucket}/{prefix}/input/test-data.csv'
_default_output_prefix = f's3://{bucket}/{prefix}/processed/'
_default_instance_type = 'ml.m5.xlarge'

# Pipeline parameters (optional, for flexibility)
train_data_param = ParameterString(name='TrainData', default_value=_default_train_data_uri)
output_prefix_param = ParameterString(name='OutputPrefix', default_value=_default_output_prefix)
instance_type_param = ParameterString(name='InstanceType', default_value=_default_instance_type)

from sagemaker.workflow.functions import Join

# Define the Scikit-learn processor for data preprocessing
sklearn_processor = SKLearnProcessor(
    framework_version='1.0-1',  # Adjust based on your sklearn version
    role=role,
    instance_type=instance_type_param,
    instance_count=1,
    sagemaker_session=sagemaker_session
)

# Build the output destinations from the OutputPrefix pipeline parameter so
# that overriding it for an execution actually moves the outputs. With the
# parameter's default value these resolve to the same
# .../processed/train/ and .../processed/validation/ locations as a
# hard-coded path would.
# NOTE: the prefix is concatenated as-is, so it must end with "/".
train_output_uri = Join(on="", values=[output_prefix_param, "train/"])
validation_output_uri = Join(on="", values=[output_prefix_param, "validation/"])

# Define the ProcessingStep
processing_step = ProcessingStep(
    name="DataProcessingStep",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(
            source=train_data_param,  # S3 path to test-data.csv
            destination='/opt/ml/processing/input'  # Container path
        )
    ],
    outputs=[
        ProcessingOutput(
            source='/opt/ml/processing/output/train/',  # Container path for train.csv
            destination=train_output_uri,  # S3 path
            output_name='train_data'  # Referenced by the training step
        ),
        ProcessingOutput(
            source='/opt/ml/processing/output/validation/',  # Container path for validation.csv
            destination=validation_output_uri,  # S3 path
            output_name='validation_data'  # Referenced by the training step
        )
    ],
    code="preprocessing.py"  # Ensure this script is in your working directory
)


# Linear Learner estimator, using the algorithm image for the current region
_linear_learner_image = sagemaker.image_uris.retrieve('linear-learner', boto3.Session().region_name)
linear_estimator = Estimator(
    image_uri=_linear_learner_image,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    volume_size=30,  # in GB
    max_run=3600,  # in seconds
    output_path=f's3://{bucket}/{prefix}/output/',
    sagemaker_session=sagemaker_session
)

# Hyperparameters for the Linear Learner
_linear_learner_hyperparameters = {
    'predictor_type': 'binary_classifier',
    'mini_batch_size': 10,
}
linear_estimator.set_hyperparameters(**_linear_learner_hyperparameters)

# Training channels read the S3 locations produced by the processing step,
# looked up by the output names declared there.
_processing_outputs = processing_step.properties.ProcessingOutputConfig.Outputs
_train_channel = TrainingInput(
    s3_data=_processing_outputs["train_data"].S3Output.S3Uri,
    content_type='text/csv'
)
_validation_channel = TrainingInput(
    s3_data=_processing_outputs["validation_data"].S3Output.S3Uri,
    content_type='text/csv'
)

training_step = TrainingStep(
    name="TrainLinearLearnerModel",
    estimator=linear_estimator,
    inputs={'train': _train_channel, 'validation': _validation_channel}
)


# (Optional) Model registration: register the artifact produced by the
# training step under MODEL_PACKAGE_GROUP_NAME.
_register_model_kwargs = dict(
    name="RegisterLinearLearnerModel",
    estimator=linear_estimator,
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.large"],
    transform_instances=["ml.m5.large"],
    model_package_group_name=MODEL_PACKAGE_GROUP_NAME,
)
model_register_step = RegisterModel(**_register_model_kwargs)


# Define the Pipeline from the processing, training and registration steps
pipeline = Pipeline(
    name="LinearLearnerPipeline",
    steps=[processing_step, training_step, model_register_step],
    parameters=[
        train_data_param,
        output_prefix_param,
        instance_type_param
    ],
    sagemaker_session=sagemaker_session
)


# Create or update the pipeline
pipeline.upsert(role_arn=role)

# Execute the pipeline
pipeline_execution = pipeline.start()

# The closing brace of the replacement field was missing, which made the
# whole cell a SyntaxError.
print(f"Pipeline execution started with execution ARN: {pipeline_execution.arn}")

# Wait for the pipeline execution to complete before deploying the endpoint
pipeline_execution.wait()




# Deploy the latest model to Endpoint

In [None]:

# Initialize SageMaker client
sm_client = boto3.client('sagemaker')

# List model packages in the group, newest first, and keep only the first hit.
# NOTE(review): no approval-status filter is applied, so the newest package is
# taken regardless of its approval state -- confirm that is intended.
response = sm_client.list_model_packages(
    ModelPackageGroupName=MODEL_PACKAGE_GROUP_NAME,
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=1
)

# Fail with an actionable message instead of a bare IndexError when the
# group has no registered versions yet.
model_package_summaries = response['ModelPackageSummaryList']
if not model_package_summaries:
    raise RuntimeError(
        f"No model packages found in group {MODEL_PACKAGE_GROUP_NAME!r}; "
        "run the pipeline to completion before deploying."
    )

# Get the latest ModelPackageArn
latest_model_package_arn = model_package_summaries[0]['ModelPackageArn']

print(f"Latest Model Package ARN: {latest_model_package_arn}")


In [None]:

# The endpoint name carries the last path segment of the model package ARN.
version_number = latest_model_package_arn.rsplit('/', 1)[-1]
endpoint_name = f"linear-learner-endpoint-{version_number}"  # Choose your desired endpoint name

# Wrap the registered model package so it can be deployed
model = ModelPackage(
    role=role,
    model_package_arn=latest_model_package_arn,
    sagemaker_session=sagemaker_session
)

# Deploy the model to a SageMaker endpoint
_deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.m5.large",  # Adjust the instance type as needed
    "endpoint_name": endpoint_name,
}
model.deploy(**_deploy_settings)

# Print the deployed endpoint name
print(f"Model deployed to endpoint: {endpoint_name}")