In [None]:
import sagemaker

# IAM role that SageMaker uses on behalf of this notebook for training and hosting.
role = sagemaker.get_execution_role()

# One shared session object for every SDK call in this notebook.
sess = sagemaker.Session()

# Default bucket of the session created above.
# (Fix: this previously referenced the undefined name `sagemaker_session`,
# which raises NameError on a fresh kernel.)
bucket = sess.default_bucket()

# S3 key prefix under which the notebook stores its data and model artifacts
prefix = 'scikit-iris'

In [2]:
import numpy as np
import pandas as pd
import os
from sklearn import datasets

# Load the Iris dataset and place the label in column 0, followed by the
# four feature columns.
iris = datasets.load_iris()
joined_iris = np.insert(iris.data, 0, iris.target, axis=1)

df = pd.DataFrame(
    joined_iris,
    columns=['label', 'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'],
)

# Shuffle once with a fixed seed, then cut 60% / 20% / 20% into
# train / validation / test. Positional slicing selects exactly the rows that
# np.split(shuffled, [train_end, validation_end]) would, but does not route a
# DataFrame through NumPy's split (which relies on DataFrame.swapaxes,
# deprecated in recent pandas releases).
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
validation_end = int(0.8 * len(df))
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Write each split as a header-less CSV with the label in the first column.
os.makedirs('./data', exist_ok=True)
train_data.to_csv('data/train.csv', index=False, header=False)
validation_data.to_csv('data/validation.csv', index=False, header=False)
test_data.to_csv('data/test.csv', index=False, header=False)

In [21]:
# Separate the held-out split into labels (first column) and features
# (all remaining columns) for checking the deployed model later.
test_y, test_X = test_data.iloc[:, 0], test_data.iloc[:, 1:]

In [None]:
# Stage the three CSV splits in S3. Train and validation share the
# '<prefix>/data' key prefix; the test split goes under '<prefix>/test'.
s3_input_train = sess.upload_data(
    path='data/train.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/data',
)
s3_input_validation = sess.upload_data(
    path='data/validation.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/data',
)
s3_input_test = sess.upload_data(
    path='data/test.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/test',
)

## Training

The training script (`sklearn_iris.py`) contains both the training and the inference code, and it can run either on SageMaker training instances or locally (desktop, SageMaker notebook, on-premises, etc.). For detailed guidance, see https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html#train-a-model-with-scikit-learn

In [9]:
from sagemaker.sklearn.estimator import SKLearn

# Configure the scikit-learn training job. This cell only builds the
# estimator; training is started by the fit() call in the next cell.
# (Fix: fit() used to be called here as well, so running the notebook top to
# bottom launched two identical training jobs.)
sklearn_estimator = SKLearn(
    entry_point='sklearn_iris.py',  # /opt/ml/code/<script file>
    instance_type="ml.c5.xlarge",
    instance_count=1,
    framework_version='0.20.0',
    py_version='py3',
    role=role,
    sagemaker_session=sess,
    # Extract the accuracy that the training script logs so it is recorded as
    # a job metric named "model_accuracy".
    metric_definitions=[
        {"Name": "model_accuracy", "Regex": "Model Accuracy: ([0-9.]+).*$"},
    ],
    hyperparameters={'max_leaf_nodes': 30},  # /opt/ml/input/config/hyperparameters.json
)

NameError: name 'role' is not defined

In [84]:
# Launch the training job. The training split feeds the "train" channel and
# the validation split feeds the channel named "test".
sklearn_estimator.fit(
    inputs={
        "train": s3_input_train,
        "test": s3_input_validation,
    }
)

2022-06-01 05:39:34 Starting - Starting the training job...
2022-06-01 05:39:59 Starting - Preparing the instances for trainingProfilerReport-1654061974: InProgress
......
2022-06-01 05:41:00 Downloading - Downloading input data...
2022-06-01 05:41:20 Training - Downloading the training image.[34m2022-06-01 05:41:37,456 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-06-01 05:41:37,458 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-01 05:41:37,471 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-06-01 05:41:37,809 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-01 05:41:37,823 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-01 05:41:37,835 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-

## Deploy an Endpoint 

Now that we've trained the algorithm on our data, let's deploy a model that's hosted behind a real-time endpoint.

In [None]:
# Host the trained model behind a single-instance real-time endpoint.
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
)

In [94]:
# Send the held-out features to the endpoint and show the predictions
# followed by the ground-truth labels.
predicted_labels = predictor.predict(test_X.values)
print("Predictions: {}".format(predicted_labels))
actual_labels = test_y.values
print("Actual: {}".format(actual_labels))

In [32]:
# Delete the real-time endpoint together with its endpoint configuration.
predictor.delete_endpoint(delete_endpoint_config=True)

## Deploy an Endpoint from Model Data (Optional)

In [87]:
# Download the trained model archive from S3 into the current directory
from sagemaker.s3 import S3Downloader

# S3 URI of the model artifact produced by the estimator's training job
s3_model_path = sklearn_estimator.model_data

S3Downloader.download(s3_uri=s3_model_path,
                          local_path='./',
                          sagemaker_session=sess)

# Alternative (not executed): fetch the same archive with boto3

#import boto3

#s3_client = boto3.client('s3')
#training_job_name = sklearn_estimator.latest_training_job.name

#with open('model.tar.gz', 'wb') as data:
#    s3_client.download_fileobj(Bucket=bucket, Key=f'{training_job_name}/output/model.tar.gz', Fileobj=data)

# Unpack the downloaded archive; it is expected to contain model.joblib
!tar -zxvf model.tar.gz

model.joblib


### Model Directory Structure

The contents of model.tar.gz should be organized as follows:

- Model files in the top-level directory
- Inference script (and any other source files) in a directory named code/ (for more about the inference script, see The SageMaker Scikit-learn Model Server)
- Optional requirements file located at code/requirements.txt (for more about requirements files, see Using third-party libraries)

<b>model.tar.gz</b>
- model.joblib
- code
     - inference.py
     - requirements.txt

### Inference Script function definition 
- <b>model_fn:</b> Load the model file from the model directory (e.g. /opt/ml/model/model.joblib).
- <b>input_fn:</b> Deserialize the Invoke request body into an object we can perform prediction on.
- <b>predict_fn:</b> Perform prediction on the deserialized object, with the loaded model.
- <b>output_fn:</b> Serialize the prediction result into the desired response content type.

https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html#load-a-model

In [90]:
# Package model data and inference script to model.tar.gz
!mkdir code
!cp inference.py code/
!tar -czvf sklearn-model.tar.gz model.joblib code
model_path = sess.upload_data(path='sklearn-model.tar.gz', key_prefix=f'{prefix}/models')

mkdir: cannot create directory ‘code’: File exists
model.joblib
code/
code/inference.py


In [93]:
from sagemaker.sklearn.model import SKLearnModel

# Build a deployable model from the repackaged archive in S3, using the
# inference script as the entry point.
model = SKLearnModel(
    model_data=model_path,
    role=role,
    entry_point='code/inference.py',
    framework_version='0.20.0',
    py_version='py3',
    sagemaker_session=sess,
)

# Host it behind a single-instance real-time endpoint.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
)

----!

In [None]:
# Send the held-out features to the endpoint and show the predictions
# followed by the ground-truth labels.
predicted_labels = predictor.predict(test_X.values)
print("Predictions: {}".format(predicted_labels))
actual_labels = test_y.values
print("Actual: {}".format(actual_labels))

In [None]:
# Delete the real-time endpoint together with its endpoint configuration.
predictor.delete_endpoint(delete_endpoint_config=True)

## Batch Transform

In [20]:
# Derive a batch transformer from the trained estimator, running on two
# ml.m5.xlarge instances.
transformer = sklearn_estimator.transformer(
    instance_count=2,
    instance_type='ml.m5.xlarge',
)

In [25]:
# Start a transform job over the uploaded test CSV and wait for it to finish.
# NOTE(review): data/test.csv is written with the label in its first column,
# whereas the real-time test sends features only (test_X). Confirm the
# inference code expects the label column, or filter it out of the batch input.
transformer.transform(s3_input_test, content_type='text/csv')
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
# NOTE(review): depending on the SDK version, transform() may already block
# until the job completes, which would make this wait() redundant - confirm.
transformer.wait()

.......................[34mProcessing /opt/ml/code
  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
[34mBuilding wheels for collected packages: scikit-learn-iris
  Building wheel for scikit-learn-iris (setup.py): started
  Building wheel for scikit-learn-iris (setup.py): finished with status 'done'
  Created wheel for scikit-learn-iris: filename=scikit_learn_iris-1.0.0-py2.py3-none-any.whl size=5142 sha256=12f35d26b3bd9189499788bf1ee661b32eb30abca93bbcb5fcd3dc5149f4387a
  Stored in directory: /tmp/pip-ephem-wheel-cache-ohskq4a3/wheels/3e/0f/51/2f1df833dd0412c1bc2f5ee56baac195b5be563353d111dca6[0m
[34mSuccessfully built scikit-learn-iris

In [28]:
# Download the transform job's output from S3 into the local batch_data/ directory
batch_output = transformer.output_path
# IPython substitutes $batch_output with the Python variable's value before
# handing the command to the shell.
!aws s3 cp --recursive $batch_output/ batch_data/

download: s3://sagemaker-ap-east-1-468208999430/sagemaker-scikit-learn-2022-05-31-07-43-06-362/iris.csv.out to batch_data/iris.csv.out


## Automatic Model Tuning (Optional)

In [None]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Search space for the tuning job. The key must match the hyperparameter name
# configured on the estimator ('max_leaf_nodes'); the previous key
# 'n-max_leaf_nodes' did not correspond to any estimator hyperparameter.
hyperparameter_ranges = {"max_leaf_nodes": IntegerParameter(5, 100)}

# Metric to optimize; its name must match an entry in metric_definitions below.
objective_metric_name = 'model_accuracy'

tuner = HyperparameterTuner(
    sklearn_estimator,
    objective_metric_name,
    hyperparameter_ranges,
    # Same log-scraping regex as the one configured on the estimator.
    metric_definitions=[
        {"Name": "model_accuracy", "Regex": "Model Accuracy: ([0-9.]+).*$"},
    ],
    objective_type='Maximize',
    max_jobs=9,           # total training jobs launched by the tuning job
    max_parallel_jobs=3,  # training jobs running at the same time
)

# wait=False returns immediately; a later cell waits for the tuning job.
tuner.fit({'train': s3_input_train, 'test': s3_input_validation}, wait=False)

In [None]:
# Wait for the hyperparameter tuning job to complete (it was started with
# wait=False, so this is where the notebook blocks).

tuner.wait()

# Alternatives to tuner.wait(), kept for reference and not executed:

#tuner.logs()

# or wait through the session, by tuning job name
# NOTE(review): the `tuning_job_name` assignment below is never used by the
# line after it, and it reads the estimator's training job name - confirm intent.

#tuning_job_name = sklearn_estimator.latest_training_job.name
#sess.wait_for_tuning_job(tuner.latest_tuning_job.name)

In [None]:
# Display the name of the best training job found by the tuning job
# (shown as the cell's output because it is the last expression).
tuner.best_training_job()

In [None]:
# Host the tuner's best model (or a user-specified one) behind a
# single-instance Amazon SageMaker real-time endpoint.
tuner_predictor = tuner.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
)

In [None]:
# Query the tuned model's endpoint with the held-out features and show the
# predictions followed by the ground-truth labels.
tuned_predictions = tuner_predictor.predict(test_X.values)
print("Predictions: {}".format(tuned_predictions))
actual_labels = test_y.values
print("Actual: {}".format(actual_labels))

In [None]:
# Delete the tuned model's endpoint together with its endpoint configuration.
tuner_predictor.delete_endpoint(delete_endpoint_config=True)