## Setup

---
Import the libraries and initialize the parameters for this workshop.

In [7]:
import sagemaker
import boto3
import os
import time
import datetime
import json
import numpy as np
import pprint as pp

from collections import namedtuple
from collections import defaultdict
from collections import Counter



import matplotlib.pyplot as plt

# A single SageMaker session serves every lookup in this cell.
sagemaker_session = sagemaker.Session()

# `sess` is kept as a second name for the same session instead of constructing
# another Session object; any code that refers to `sess` keeps working.
sess = sagemaker_session

default_bucket = sagemaker_session.default_bucket()  # or use your own custom bucket name
region = sagemaker_session.boto_region_name
account = sagemaker_session.account_id()
role = sagemaker.get_execution_role()

# Prefix reused for job names and S3 key prefixes throughout the notebook.
base_job_prefix = 'frame-processor'

pipeline_name = f"{base_job_prefix}-pipeline"  # SageMaker Pipeline name

# Record the SDK version this notebook was run with.
print(sagemaker.__version__)

2.94.0


## Download Dataset

In [None]:
!wget 'https://s3.amazonaws.com/fast-ai-imageclas/CUB_200_2011.tgz'
!tar xopf CUB_200_2011.tgz
!rm CUB_200_2011.tgz

s3_raw_data = f's3://{bucket}/{prefix}/full/data'
!aws s3 cp --recursive ./CUB_200_2011 $s3_raw_data

!rm -rf ./CUB_200_2011

## Build a Training Pipeline
---

The pipeline is configured with the SageMaker Pipelines Python package together with the code defined for preprocessing, training, and model evaluation, so that model training is automated. It is easy to use: you can simply drop in whatever image-classification input data you want and have it train a model automatically.

### Preprocessing Script

---
Here is the preprocessing script. We use a script processor from SageMaker Processing to split the data into train, valid, and test channels, and then build the TFRecord files for Pipe mode training.

In [None]:
# Print the preprocessing script with pygmentize so it can be read inline.
# NOTE(review): this cell shows 'pipeline/preprocess.py', while the processing
# step further down passes code='preprocessing.py' - confirm both refer to the
# same script.
!pygmentize 'pipeline/preprocess.py'

In [5]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

# S3 location of the raw dataset. The default is derived from the session's
# default bucket and the job prefix defined in the Setup cell, so the notebook
# is not tied to one AWS account's bucket name.
input_data = ParameterString(
    name="InputDataUrl",
    default_value=f"s3://{default_bucket}/{base_job_prefix}/full/data",
)

# Name of the annotation file passed to the preprocessing script.
input_annotation = ParameterString(
    name="AnnotationFileName",
    default_value="classes.txt",
)

# This is a large dataset, so we only train on a subset of the classes.
# If you use the mini dataset, make sure every class index listed here is
# available in it.
class_selection = ParameterString(
    name="ClassSelection",
    default_value="13, 17, 35, 36, 47, 68, 73, 87",
)

In [6]:
from sagemaker.workflow.steps import ProcessingStep

from sagemaker.sklearn import SKLearn, SKLearnProcessor

from sagemaker.processing import FrameworkProcessor, ProcessingInput, ProcessingOutput

from sagemaker.workflow.pipeline_context import PipelineSession

import uuid

# SKlearnProcessor for preprocessing

# Session object handed to the processor here and to the Pipeline later on.
session = PipelineSession()

# FrameworkProcessor backed by the SKLearn estimator class, used for preprocessing.
sklearn_processor = FrameworkProcessor(
    estimator_cls=SKLearn,
    framework_version="0.23-1",
    base_job_name=base_job_prefix,
    command=["python3"],
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=session,
)

# Every processing output is written below this S3 prefix.
output_s3_uri = f"s3://{default_bucket}/{base_job_prefix}/outputs"

step_args = sklearn_processor.run(
    code="preprocessing.py",
    arguments=["--classes", class_selection, "--input-data", input_annotation],
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    # One output per channel: the container directory
    # /opt/ml/processing/output/<channel> maps to <output_s3_uri>/<channel>.
    outputs=[
        ProcessingOutput(
            source=f"/opt/ml/processing/output/{channel}",
            destination=f"{output_s3_uri}/{channel}",
        )
        for channel in ("train", "valid", "test", "manifest")
    ],
)

# Wrap the processor arguments in a pipeline step (the step name is arbitrary).
step_process = ProcessingStep(name="FrameworkProcessor", step_args=step_args)


Job Name:  frame-processor-2022-06-11-15-04-45-688
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': ParameterString(name='InputDataUrl', parameter_type=<ParameterTypeEnum.STRING: 'String'>, default_value='s3://sagemaker-us-west-2-987720697751/frame-processor/full/data'), 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-987720697751/frame-processor-2022-06-11-15-04-45-688/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-987720697751/frame-processor-2022-06-11-15-04-45-688/source/runproc.sh', 'LocalPath': '/opt/ml/p



In [8]:
from sagemaker.workflow.pipeline import Pipeline

# Assemble the pipeline: the three parameters declared above plus the single
# preprocessing step, bound to the same session the processor uses.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[input_data, input_annotation, class_selection],
    steps=[step_process],
    sagemaker_session=session,
)

# Submit the pipeline definition under the execution role.
pipeline.upsert(role_arn=role)

# Start an execution; no overrides are passed, so the default parameter values are used.
execution = pipeline.start()