## Preparing the dataset.


In [1]:
# Load the raw data file into memory as one string.
with open(r'wdbc.data', 'r') as source_handle:
    data = source_handle.read()

# Swap the field delimiter: splitting on ',' and re-joining the pieces with
# ';' turns every comma in the text into a semicolon.
csv_data = ';'.join(data.split(','))

# Write the semicolon-delimited text out as the CSV file.
with open('breast-cancer-wisconsin.csv', 'w') as target_handle:
    target_handle.write(csv_data)

## Training an XGBoost model for binary classification

In [5]:

import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris
#from sagemaker.predictor import csv_serializer

# Open a SageMaker session and look up the execution role of this notebook.
session = sagemaker.Session()  # TODO: retrieve the session with boto3
role = get_execution_role()

# Bucket that holds the datasets, and the key prefix under which the training
# output is written. If you're following along, upload the datasets to your
# own S3 bucket and change these values.
Bucket = "breast-cancer-wisconsin"
prefix = "Output"

# S3 locations of the three dataset splits.
train_location = f"s3://{Bucket}/train.csv"
val_location = f"s3://{Bucket}/valid.csv"
test_location = f"s3://{Bucket}/test.csv"

# Resolve the URI of the XGBoost training container for the session's region.
# NOTE(review): version='latest' does not pin a specific XGBoost release --
# consider an explicit version; confirm against the SageMaker XGBoost docs.
container = image_uris.retrieve('xgboost', session.boto_region_name, version='latest')

# With the container known, build the estimator that describes the training job.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,                   # image of the training container
    role=role,                             # IAM role (our current role here)
    instance_count=1,                      # number of training instances
    instance_type='ml.m4.xlarge',          # type of training instance
    output_path=f's3://{Bucket}/{prefix}', # where the model artifacts are saved
    sagemaker_session=session,             # the current SageMaker session
)

# These hyperparameters are beyond the scope of this course, but you can
# research the algorithm here:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 200,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Wrap the training and validation CSV locations as training inputs.
s3_input_train, s3_input_validation = (
    sagemaker.inputs.TrainingInput(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

# Calling fit launches the training job with one input per channel.
training_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(training_channels)
 

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-2024-07-20-15-47-24-706


2024-07-20 15:47:24 Starting - Starting the training job...
2024-07-20 15:47:50 Starting - Preparing the instances for training...
2024-07-20 15:48:24 Downloading - Downloading input data...
2024-07-20 15:48:49 Downloading - Downloading the training image......
2024-07-20 15:49:55 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2024-07-20:15:50:05:INFO] Running standalone xgboost training.[0m
[34m[2024-07-20:15:50:05:INFO] File size need to be processed in the node: 0.1mb. Available memory size in the node: 8451.19mb[0m
[34m[2024-07-20:15:50:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:50:05] S3DistributionType set as FullyReplicated[0m
[34m[15:50:05] 343x31 matrix with 10633 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-07-20:15:50:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:50:05] S3DistributionType set as FullyReplicated[0m
[34m[15:50:

In [4]:
import sagemaker 
from sagemaker import get_execution_role, image_uris
from sagemaker.model import Model
# One SageMaker session, used both for the region lookup and by the model.
session = sagemaker.Session()  # TODO: retrieve the session with boto3

# Resolve the XGBoost container image for the session's region (the same
# lookup the training cell performs).
container = image_uris.retrieve('xgboost', session.boto_region_name, version='latest')

# Execution role of this notebook, passed on to the model.
role = get_execution_role()

# Model artifact produced by training job xgboost-2024-07-20-15-47-24-706.
model_data_uri = "s3://breast-cancer-wisconsin/Output/xgboost-2024-07-20-15-47-24-706/output/model.tar.gz"

# Create a SageMaker model from the trained artifact. No inference script is
# supplied; only the container image is given. The session is passed
# explicitly so the model uses the same session that resolved the image,
# rather than constructing its own.
model = Model(
    model_data=model_data_uri,
    role=role,
    image_uri=container,
    sagemaker_session=session,
)

<sagemaker.model.Model at 0x7fdef5019420>