In [None]:
!pip install s3fs

# Import libraries

In [None]:
import os
import time
import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.model import ModelPackage


# One boto3 session backs the low-level client and the SageMaker session, so
# everything below resolves the same credentials and region.
sess = boto3.Session()
sm = sess.client("sagemaker")
sagemaker_session = sagemaker.Session(boto_session=sess)

region = sess.region_name

model_package_group_name = "PipelineModelPackageGroup"
pipeline_name = "linear-linear"  # SageMaker Pipeline name

# Define bucket and role
# NOTE(review): `bucket` is used by later cells but was never assigned. The
# session's default bucket is assumed here - replace it with the bucket that
# actually holds cancer-prediction-ml-model/test-data.csv if that differs.
bucket = sagemaker_session.default_bucket()
role = get_execution_role()

# Import input data from S3

In [None]:

# Object key of the raw CSV inside `bucket` (plain string: nothing to interpolate).
data_key = 'cancer-prediction-ml-model/test-data.csv'

# Full S3 URI of the input file; reused later as the preprocessing input.
train_input_dir = f's3://{bucket}/{data_key}'

data = pd.read_csv(train_input_dir)

# Keep a local copy. index=False stops pandas from writing the RangeIndex as an
# extra unnamed first column, so the copy has the same columns as the S3 file.
data.to_csv('input-data.csv', index=False)
data.head(10)

In [None]:
# Display the S3 URI of the input CSV built in the previous cell.
train_input_dir

# Define a Processing Step for Feature Engineering <a class="anchor" id="training"></a>


In [None]:
%%writefile preprocess.py

import os
import boto3
import argparse
import sagemaker
import pandas as pd

from sagemaker import get_execution_role
from sklearn.preprocessing import OneHotEncoder

# split data into train and test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocessing_data(input_data_dir):
    """Encode, split and scale the input CSV, then upload both splits to S3.

    The CSV is expected to contain the columns ``outcome``, ``gender`` and
    ``cancer_type``; every remaining column is passed to ``StandardScaler``
    and therefore must be numeric.

    Steps:
      1. ``outcome`` becomes 1 for ``'survived'`` and 0 for anything else;
         ``gender`` and ``cancer_type`` are integer-encoded.
      2. The rows are split 80/20 (stratified on the label, fixed seed).
      3. The scaler is fit on the training rows only and then applied to the
         validation rows, so no validation statistics leak into training.
      4. Both splits are written locally as header-less CSVs with the label in
         the first column and uploaded under the ``ll-ml-model`` prefix.

    Args:
        input_data_dir: Location of the input CSV. For an ``s3://bucket/key``
            URI the outputs are uploaded to that same bucket. For any other
            path no bucket is passed to ``upload_data``, which then uses the
            session's default bucket.

    Returns:
        A ``(train, validation)`` tuple holding the values returned by
        ``upload_data`` for the two uploaded files.
    """
    from urllib.parse import urlparse

    # Only an s3:// URI carries a bucket name. Blindly taking the third
    # '/'-separated piece raised IndexError for 'file.csv' and produced a
    # bogus bucket for paths such as '/home/user/file.csv'.
    parsed = urlparse(input_data_dir)
    bucket = (parsed.netloc or None) if parsed.scheme == 's3' else None
    prefix = "ll-ml-model"

    # SageMaker session
    sagemaker_session = sagemaker.Session()
    df = pd.read_csv(input_data_dir)

    # Encode the label and the categorical features as integers
    df['outcome'] = (df['outcome'] == 'survived').astype(int)
    df['gender'] = LabelEncoder().fit_transform(df['gender'])
    df['cancer_type'] = LabelEncoder().fit_transform(df['cancer_type'])

    # Separate features and labels
    X = df.drop('outcome', axis=1)
    y = df['outcome']

    # Split first so the scaler below never sees the validation rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training rows only, then reuse it for validation
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Label goes in the first column. The label index is reset so that it lines
    # up with the fresh RangeIndex of the scaled feature frames.
    train_data = pd.concat(
        [y_train.reset_index(drop=True), pd.DataFrame(X_train_scaled)], axis=1
    )
    test_data = pd.concat(
        [y_test.reset_index(drop=True), pd.DataFrame(X_test_scaled)], axis=1
    )

    # Save the datasets as CSV files without headers
    train_data.to_csv('train.csv', index=False, header=False)
    test_data.to_csv('validation.csv', index=False, header=False)

    # Upload the data to S3
    train_data_s3_path = sagemaker_session.upload_data(
        path='train.csv', bucket=bucket, key_prefix=f"{prefix}/train"
    )
    print('Saved Train data', train_data_s3_path)
    test_data_s3_path = sagemaker_session.upload_data(
        path='validation.csv', bucket=bucket, key_prefix=f"{prefix}/validation"
    )
    print('Saved Validation data', test_data_s3_path)
    return train_data_s3_path, test_data_s3_path

    
if __name__ == "__main__":
    # Command-line entry point: one positional argument, the input CSV location,
    # which is handed straight to preprocessing_data().
    cli = argparse.ArgumentParser()
    cli.add_argument('input_data_dir')
    preprocessing_data(cli.parse_args().input_data_dir)




In [None]:
# Re-display the S3 URI of the input CSV.
train_input_dir

In [None]:
# Run the preprocessing defined in preprocess.py so the train/validation CSVs
# that the training cell reads actually exist in S3. The import lives here
# rather than with the other imports because preprocess.py is only created by
# the %%writefile cell above.
from preprocess import preprocessing_data

preprocessing_data(train_input_dir)

In [None]:

# `sagemaker_session`, `role` and `region` come from the setup cell; they are
# reused here instead of being created a second time.
prefix = "ll-ml-model"

# Set the location of the train and validation data in S3
s3_train_data = f's3://{bucket}/{prefix}/train/train.csv'
s3_validation_data = f's3://{bucket}/{prefix}/validation/validation.csv'

# Set up the Linear Learner container.
# image_uris.retrieve replaces the deprecated get_image_uri; version '1' is the
# version get_image_uri requested by default.
container = sagemaker.image_uris.retrieve(
    framework='linear-learner',
    region=region,
    version='1',
)

# Create the Linear Learner estimator
linear = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session
)

# Set the hyperparameters
linear.set_hyperparameters(
    predictor_type='binary_classifier',
    mini_batch_size=10
)

# Create input channels (header-less CSV, label in the first column)
train_input = sagemaker.inputs.TrainingInput(s3_train_data, content_type='text/csv')
validation_input = sagemaker.inputs.TrainingInput(s3_validation_data, content_type='text/csv')

# Train the model
linear.fit({'train': train_input, 'validation': validation_input})


In [None]:
# Deploy the trained estimator to an endpoint backed by a single ml.m5.large
# instance; the returned predictor is kept so the endpoint can be deleted later.
linear_predictor = linear.deploy(initial_instance_count=1, instance_type='ml.m5.large')


In [None]:
# Delete the endpoint created by the deploy cell
linear_predictor.delete_endpoint()


In [None]:

# Add the trained model to the SageMaker Model Registry.
# Versions are collected under `model_package_group_name`; requests and
# responses are CSV, and ml.m5.large is declared for both real-time inference
# and batch transform jobs.
model_package = linear.register(
    model_package_group_name=model_package_group_name,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.large"],
    transform_instances=["ml.m5.large"],
)

# Show the ARN of the newly created model package.
print(f"Model package ARN: {model_package.model_package_arn}")