In [None]:
!pip install s3fs

# Import libraries

In [None]:
import os
import time
import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.workflow.pipeline_context import PipelineSession

import boto3
import sagemaker
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Split data into train and test
from sklearn.model_selection import train_test_split

from sagemaker.sklearn.estimator import SKLearn

# --- AWS sessions ---------------------------------------------------------
# A single boto3 session backs every client so credentials and region agree.
sess = boto3.Session()
sm = sess.client("sagemaker")
region = sess.region_name

# SageMaker session used by the estimator; created exactly once so later cells
# cannot silently pick up a different, default-constructed session.
sagemaker_session = sagemaker.Session(boto_session=sess)
pipeline_session = PipelineSession()

# IAM role passed to the estimator in the training cell.
role = get_execution_role()

# --- Project configuration --------------------------------------------------
# Bucket and key prefix of the project data; these match the S3 URIs that the
# preprocessing and training cells below refer to.
bucket = "mysagemakerprojects"
prefix = "cancer-prediction-ml-model"

model_package_group_name = "PipelineModelPackageGroup"
pipeline_name = "serial-inference-pipeline"  # SageMaker Pipeline name

# Import input data from S3

In [None]:

# Object key and full S3 URI of the raw input CSV.
data_key = '{}/test_data.csv'.format(prefix)
train_input_dir = 's3://{}/{}'.format(bucket, data_key)

# Load the CSV straight from S3 into a DataFrame.
df = pd.read_csv(train_input_dir)

# Preview the first ten rows.
df.head(10)

# Define a Processing Step for Feature Engineering <a class="anchor" id="training"></a>


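The preprocessing script below encodes the categorical columns, standardizes the features with a scaler, splits the data into training and test sets, and uploads both sets to S3 as header-less CSV files. Note: to deploy the scaler as part of a model, the script would additionally need to define the inference functions for it; the version below does not define them yet.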



In [None]:
%%writefile preprocess_input_data.py

import os
import boto3
import argparse
import sagemaker
import pandas as pd

from sagemaker import get_execution_role
from sklearn.preprocessing import OneHotEncoder

# split data into train and test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

def _split_s3_uri(s3_uri):
    """Return ``(bucket, key_prefix)`` for an ``s3://bucket/prefix/.../file`` URI.

    ``key_prefix`` is everything between the bucket name and the file name, so
    nested prefixes such as ``a/b/c`` are kept in full.

    Raises:
        ValueError: if the string is not an ``s3://`` URI or has no object key.
    """
    scheme = 's3://'
    if not s3_uri.startswith(scheme):
        raise ValueError(f'Expected an s3:// URI, got: {s3_uri!r}')
    bucket, _, key = s3_uri[len(scheme):].partition('/')
    if not bucket or not key:
        raise ValueError(f'S3 URI must contain a bucket and an object key: {s3_uri!r}')
    # NOTE(review): for an object at the bucket root the prefix is the empty
    # string -- confirm upload_data handles an empty key_prefix as intended.
    return bucket, key.rpartition('/')[0]


def preprocessing_data(input_data_dir):
    """Encode, split and scale the input CSV, then upload train/test CSVs to S3.

    Args:
        input_data_dir: ``s3://`` URI of the raw CSV. The file must contain the
            columns ``outcome``, ``gender`` and ``cancer_type``; every other
            column is treated as a feature and must be numeric for scaling.

    Returns:
        Tuple ``(train_data_s3_path, test_data_s3_path)`` with the values
        returned by ``upload_data`` for ``train.csv`` and ``test.csv``. Both
        files are uploaded to the bucket and key prefix of ``input_data_dir``.

    Raises:
        ValueError: if ``input_data_dir`` is not a usable ``s3://`` URI.

    Side effects:
        Writes ``train.csv`` and ``test.csv`` to the current working directory.
    """
    bucket, prefix = _split_s3_uri(input_data_dir)

    df = pd.read_csv(input_data_dir)

    # Binary target: 1 for 'survived', 0 for every other value.
    df['outcome'] = (df['outcome'] == 'survived').astype(int)

    # Integer-encode the categorical feature columns.
    for column in ('gender', 'cancer_type'):
        df[column] = LabelEncoder().fit_transform(df[column])

    # Separate features and labels
    X = df.drop('outcome', axis=1)
    y = df['outcome']

    # Split first, then fit the scaler on the training rows only, so that
    # test-set statistics never leak into the training features.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Label goes in the first column; the scaled features follow. The label
    # index is reset so it lines up with the freshly built feature frames.
    train_data = pd.concat(
        [y_train.reset_index(drop=True), pd.DataFrame(X_train_scaled)], axis=1
    )
    test_data = pd.concat(
        [y_test.reset_index(drop=True), pd.DataFrame(X_test_scaled)], axis=1
    )

    # Save the datasets as CSV files without headers
    train_data.to_csv('train.csv', index=False, header=False)
    test_data.to_csv('test.csv', index=False, header=False)

    # Upload the data to S3, under the same bucket/prefix as the input file.
    sagemaker_session = sagemaker.Session()
    train_data_s3_path = sagemaker_session.upload_data(path='train.csv', bucket=bucket, key_prefix=prefix)
    print('Saved Train data', train_data_s3_path)
    test_data_s3_path = sagemaker_session.upload_data(path='test.csv', bucket=bucket, key_prefix=prefix)
    print('Saved Test data', test_data_s3_path)

    return train_data_s3_path, test_data_s3_path

    
if __name__ == "__main__":
    # Command-line entry point: the only (positional) argument is the S3 URI
    # of the raw CSV, which is handed straight to preprocessing_data.
    cli = argparse.ArgumentParser()
    cli.add_argument('input_data_dir')
    preprocessing_data(cli.parse_args().input_data_dir)







In [None]:
# Run the script written by the previous cell against the raw CSV. It writes
# train.csv / test.csv locally and uploads both to the bucket and prefix of
# the input URI.
! python preprocess_input_data.py 's3://mysagemakerprojects/cancer-prediction-ml-model/test_data.csv'

In [None]:
# S3 URIs of the train/test CSVs; both live under the same project prefix.
train_data_s3_path, test_data_s3_path = (
    f's3://mysagemakerprojects/cancer-prediction-ml-model/{split}.csv'
    for split in ('train', 'test')
)

In [None]:
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput

# Built-in XGBoost container, resolved with the SDK v2 `image_uris.retrieve`
# API and pinned to an explicit version so reruns use the same algorithm
# image instead of whatever the floating 'latest' tag points to.
container = image_uris.retrieve(framework='xgboost', region=region, version='1.7-1')

# Define the XGBoost model. Model artifacts are written under `output_path`.
xgboost = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/sagemaker/cancer-output',
    sagemaker_session=sagemaker_session
)

# Set hyperparameters: binary classifier evaluated by AUC on the validation channel.
xgboost.set_hyperparameters(
    objective='binary:logistic',
    num_round=100,
    max_depth=5,
    eta=0.2,
    subsample=0.8,
    eval_metric='auc'
)

# Define data channels (header-less CSV with the label in the first column).
train_input = TrainingInput(s3_data=train_data_s3_path, content_type='csv')
test_input = TrainingInput(s3_data=test_data_s3_path, content_type='csv')

# Train the model; the held-out split is supplied as the 'validation' channel.
xgboost.fit({'train': train_input, 'validation': test_input})


In [None]:
# Host the trained estimator on a single-instance real-time endpoint and keep
# the returned predictor for later use.
xgboost_predictor = xgboost.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)


In [None]:
from sagemaker.model import ModelPackage

# Model package group under which this model is registered.
# This reassigns the group name set in the setup cell.
model_package_group_name = 'CancerPredictionModelPackageGroup'

# Register the trained estimator: CSV in / CSV out, with ml.m5.large allowed
# for both real-time inference and batch transform.
model_package = xgboost.register(
    model_package_group_name=model_package_group_name,
    inference_instances=["ml.m5.large"],
    transform_instances=["ml.m5.large"],
    content_types=["text/csv"],
    response_types=["text/csv"],
)

print(f"Model package ARN: {model_package.model_package_arn}")

In [None]:
import boto3

# Initialize a SageMaker client
sagemaker_client = boto3.client('sagemaker')

# list_endpoints returns results one page at a time, so a single call can miss
# endpoints; the paginator follows the continuation token through every page.
endpoint_pages = sagemaker_client.get_paginator('list_endpoints').paginate()

# Print every endpoint in the account/region together with its status
for page in endpoint_pages:
    for endpoint in page['Endpoints']:
        print(f"Endpoint Name: {endpoint['EndpointName']}, Status: {endpoint['EndpointStatus']}")
