# Breast Cancer Preprocessing Job

In [6]:
%%writefile breast_cancer.py

# Execute this cell first to write this script to your local directory. 

import pandas

# Reads a CSV file and returns it with one column removed (index 1 by default).
def filter_crime_data(input_data_path, column_index=1):
    """Load a CSV file and drop a single column selected by position.

    Note: despite its name, this helper is applied to the breast-cancer
    train/validation CSVs in this script; the name is kept so existing
    callers keep working.

    Args:
        input_data_path: Path of the CSV file to read.
        column_index: Zero-based position of the column to drop. Defaults to 1
            (the second column), which is the original behaviour.

    Returns:
        pandas.DataFrame: The file contents without the selected column.

    Raises:
        IndexError: If ``column_index`` is out of range for the file's columns.
    """
    with open(input_data_path, 'r') as f:
        df = pandas.read_csv(f)
    # Resolve the column label from its position, then drop it by label.
    return df.drop(columns=[df.columns[column_index]])

# Entry point: for the train and validation CSVs found under
# '/opt/ml/processing/input/', filter each file with filter_crime_data and
# write the result (without the index column) to '/opt/ml/processing/output/'.

if __name__ == "__main__":

    # (input path, output path) pairs, processed in order: train, then valid.
    io_paths = (
        ('/opt/ml/processing/input/BCdata1/train.csv',
         '/opt/ml/processing/output/train.csv'),
        ('/opt/ml/processing/input/BCdata2/valid.csv',
         '/opt/ml/processing/output/valid.csv'),
    )
    for source_path, target_path in io_paths:
        filtered_data = filter_crime_data(source_path)
        filtered_data.to_csv(target_path, index=False)
    



Writing breast_cancer.py


In [21]:
import boto3

from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

role = get_execution_role()

# Processor that will run the preprocessing script on a single ml.m5.large instance.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
)

# Files will be copied from the S3 bucket to the local path of the processor.
# Each file gets its own local folder (BCdata1 / BCdata2).
inputs = [
    ProcessingInput(
        source="s3://breast-cancer-2024/train.csv",
        destination="/opt/ml/processing/input/BCdata1/",
    ),
    ProcessingInput(
        source="s3://breast-cancer-2024/valid.csv",
        destination="/opt/ml/processing/input/BCdata2/",
    ),
]

# The output folder in the processor where the outputs will be saved, and the
# S3 destination they are stored under.
outputs = [
    ProcessingOutput(
        source="/opt/ml/processing/output/",
        destination="s3://breast-cancer-2024/SKLearnProcessor Output/",
    ),
]

# Launch the processing job with the script written by the earlier cell.
sklearn_processor.run(
    code='breast_cancer.py',
    inputs=inputs,
    outputs=outputs,
)


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-08-02-19-25-34-869


...............
..

### The following cells train an XGBoost estimator to classify the tumor, similar to the previous session (#16)
However, we will use the training and validation datasets that were output by the preceding processing job.

In [22]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris
#from sagemaker.predictor import csv_serializer

session = sagemaker.Session()  # TODO: retrieve the session with boto3

role = get_execution_role()

# S3 locations of the training and validation CSVs. They sit under the
# "SKLearnProcessor Output/" prefix of the breast-cancer-2024 bucket.
train_location = "s3://breast-cancer-2024/SKLearnProcessor Output/train.csv"
val_location = "s3://breast-cancer-2024/SKLearnProcessor Output/valid.csv"

# Bucket and key prefix that determine where the training output will go.
prefix = "Output"
Bucket = "breast-cancer-2024"

# Look up the XGBoost container image for the session's region.
container = image_uris.retrieve('xgboost', session.boto_region_name, version='latest')

# Now that we know which container to use, construct the estimator object.
xgb = sagemaker.estimator.Estimator(
    container,                              # image of the training container
    role,                                   # IAM role to use (our current role)
    instance_count=1,                       # number of training instances
    instance_type='ml.m4.xlarge',           # type of training instance
    output_path=f's3://{Bucket}/{prefix}',  # where the model artifacts are saved
    sagemaker_session=session,              # the current SageMaker session
)

# These hyperparameters are beyond the scope of this course, but you can
# research the algorithm here:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html
hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 200,
}
xgb.set_hyperparameters(**hyperparameters)

# Wrap the S3 files as CSV input channels for the training job.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=train_location,
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=val_location,
    content_type='csv',
)

# The fit method launches the training job.
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-2024-08-02-19-32-32-785


2024-08-02 19:32:32 Starting - Starting the training job...
2024-08-02 19:32:49 Starting - Preparing the instances for training...
2024-08-02 19:33:22 Downloading - Downloading input data...
2024-08-02 19:33:57 Downloading - Downloading the training image......
2024-08-02 19:34:48 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2024-08-02:19:35:00:INFO] Running standalone xgboost training.[0m
[34m[2024-08-02:19:35:00:INFO] File size need to be processed in the node: 0.11mb. Available memory size in the node: 8443.02mb[0m
[34m[2024-08-02:19:35:00:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:35:00] S3DistributionType set as FullyReplicated[0m
[34m[19:35:00] 343x30 matrix with 10290 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-08-02:19:35:00:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:35:00] S3DistributionType set as FullyReplicated[0m
[34m[19:35:

### Create a Lambda function using Boto3

In [30]:
%%writefile lambda_function.py

import json

def lambda_handler(event, context):
    """Placeholder Lambda entry point that returns a fixed response.

    Args:
        event: Invocation payload (not used by this placeholder).
        context: Runtime context object (not used by this placeholder).

    Returns:
        dict: ``statusCode`` 200 and a JSON-encoded greeting under ``body``.
    """
    # TODO implement
    greeting = json.dumps('Hello from Lambda!')
    reply = {'statusCode': 200, 'body': greeting}
    return reply


Writing lambda_function.py


In [32]:
from zipfile import ZipFile

# Package lambda_function.py into code.zip; the archive is used as the code
# input of the Lambda function.
with ZipFile('code.zip', mode='w') as archive:
    archive.write('lambda_function.py')

# Read the archive back as raw bytes.
with open('code.zip', mode='rb') as zipped:
    b_code = zipped.read()
    

In [34]:
import boto3
from sagemaker import get_execution_role

# Notebook execution role. It is not passed to create_function below, which
# uses an explicit Lambda execution role ARN instead.
role = get_execution_role()
client = boto3.client('lambda')

# Create (and publish) the function from the zipped handler bytes in b_code.
client.create_function(
    FunctionName="Lambda_Notebook",
    Runtime="python3.9",
    Role="arn:aws:iam::091788420555:role/Lambda-Execution-role",
    Handler="lambda_function.lambda_handler",  # <module>.<function> inside the zip
    Code={"ZipFile": b_code},
    PackageType='Zip',
    Timeout=60,
    MemorySize=1024,
    Publish=True,
)

{'ResponseMetadata': {'RequestId': '41186533-ff00-4634-a3a7-0241e7103ecb',
  'HTTPStatusCode': 201,
  'HTTPHeaders': {'date': 'Fri, 02 Aug 2024 20:01:19 GMT',
   'content-type': 'application/json',
   'content-length': '1332',
   'connection': 'keep-alive',
   'x-amzn-requestid': '41186533-ff00-4634-a3a7-0241e7103ecb'},
  'RetryAttempts': 0},
 'FunctionName': 'Lambda_Notebook',
 'FunctionArn': 'arn:aws:lambda:us-east-1:091788420555:function:Lambda_Notebook',
 'Runtime': 'python3.9',
 'Role': 'arn:aws:iam::091788420555:role/Lambda-Execution-role',
 'Handler': 'lambda_function.lambda_handler',
 'CodeSize': 300,
 'Description': '',
 'Timeout': 60,
 'MemorySize': 1024,
 'LastModified': '2024-08-02T20:01:19.150+0000',
 'CodeSha256': 'unkpH0nA/3VvInfjZlCbpIHk6HWFLQ9K8UD+nWV72aE=',
 'Version': '1',
 'TracingConfig': {'Mode': 'PassThrough'},
 'RevisionId': '2d3dc091-b0ab-4f53-9fb8-0d19e0b9ee03',
 'State': 'Pending',
 'StateReason': 'The function is being created.',
 'StateReasonCode': 'Creatin

In [25]:
# Display the value returned by get_execution_role() (the notebook's role ARN).
role

'arn:aws:iam::091788420555:role/service-role/AmazonSageMaker-ExecutionRole-20240726T183496'

## Invoking Lambda Functions

Don't forget to attach the Lambda full-access policy to the notebook's IAM role.

In [39]:

import boto3
from sagemaker import get_execution_role

# Boto3 client for the AWS Lambda API, used by the invocation cell.
client = boto3.client('lambda')

# ARN of the IAM role this notebook runs under.
role = get_execution_role()



In [40]:
import json
# Event sent to the function; it carries a single "s3URI" entry.
payload = {"s3URI": "toys-reviews/reviews_Patio_Lawn_and_Garden_5.json.zip"}

# Serialize the event to JSON, then encode it to bytes for the Payload argument.
payload_json = json.dumps(payload)
payload_bytes = payload_json.encode('utf-8')

# Invoke with InvocationType "Event" and keep the API response for inspection.
response = client.invoke(
    FunctionName="Toys-Reviews",
    InvocationType="Event",
    Payload=payload_bytes,
)

In [37]:
# Display the response returned by client.invoke.
response

{'ResponseMetadata': {'RequestId': '0d1920fd-651f-43f6-97ad-09f57c3740c1',
  'HTTPStatusCode': 202,
  'HTTPHeaders': {'date': 'Fri, 02 Aug 2024 20:42:11 GMT',
   'content-length': '0',
   'connection': 'keep-alive',
   'x-amzn-requestid': '0d1920fd-651f-43f6-97ad-09f57c3740c1',
   'x-amzn-remapped-content-length': '0',
   'x-amzn-trace-id': 'root=1-66ad44a3-2afa0b6f427b37d57af2c824;parent=0aa5bee1fe3f1e8c;sampled=0'},
  'RetryAttempts': 0},
 'StatusCode': 202,
 'Payload': <botocore.response.StreamingBody at 0x7fdbb1b3d060>}