# Data & Model Preparation
This notebook prepares the dataset and model for the model evaluation lab. This step is optional if you have kept your artifacts from previous modules.

## Import modules and initialize parameters for this notebook

In [2]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker session used to look up the account, region, and default bucket.
sess = sagemaker.Session()

# IAM execution role handed to the processing and training jobs below.
role = get_execution_role()

# S3 bucket and key prefix under which this notebook stores its artifacts.
# Replace the default bucket with your own custom bucket name if you prefer.
bucket = sess.default_bucket()
prefix = 'postprocessing-modal-evaluation'

# Account and region of the active session.
region = sess.boto_region_name
account = sess.account_id()

## Dataset
The dataset we are using is the [Caltech Birds (CUB 200 2011)](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html) dataset, which contains 11,788 images across 200 bird species (see the dataset page for the original technical report). Each species comes with around 60 images, with a typical size of about 350 pixels by 500 pixels. Bounding boxes are provided, as are annotations of bird parts. A recommended train/test split is given, but image size data is not.

Run the cell below to download the full dataset, or download it manually [here](https://course.fast.ai/datasets). Note that the file is around 1.2 GB and can take a while to download. If you plan to complete the entire workshop, please keep the file to avoid re-downloading and re-processing the data.

In [4]:
# Download the CUB-200-2011 archive, unpack it into the working directory,
# then delete the archive itself.
!wget 'https://s3.amazonaws.com/fast-ai-imageclas/CUB_200_2011.tgz'
!tar xopf CUB_200_2011.tgz
!rm CUB_200_2011.tgz

# Copy the extracted dataset folder to S3; the preprocessing job below uses
# this URI as its input source.
s3_raw_data = f's3://{bucket}/{prefix}/full/data'
!aws s3 cp --recursive ./CUB_200_2011 $s3_raw_data
# Clean up the local copies after the upload step.
# NOTE(review): attributes.txt is presumably extracted next to the dataset
# folder by the tar step — confirm against the archive contents.
!rm -rf ./CUB_200_2011
!rm -f attributes.txt

In [5]:
from sagemaker.sklearn.processing import SKLearnProcessor

from sagemaker.processing import (
    ProcessingInput,
    ProcessingOutput,
)
import time 

# Whole-second part of the current epoch time, as a string.
# NOTE(review): the name is misspelled ("timpstamp") and is not used in the
# cells shown here; it is kept unchanged in case other cells refer to it.
timpstamp = str(time.time()).partition('.')[0]

# Values passed to preprocessing.py as --classes and --input-data.
# NOTE(review): presumably CUB class ids and the class-name annotation file —
# confirm against preprocessing.py.
class_selection = '13, 17, 35, 36, 47, 68, 73, 87'
input_annotation = 'classes.txt'

# S3 location under which the preprocessing outputs are written.
output_prefix = f'{prefix}/outputs'
output_s3_uri = f's3://{bucket}/{output_prefix}'

# Compute resources for the preprocessing job.
processing_instance_type = "ml.m5.xlarge"
processing_instance_count = 1

# SKLearnProcessor for preprocessing.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name=f"{prefix}-preprocess",  # choose any name
)

In [7]:
# Run preprocessing.py against the raw dataset in S3. Each output folder the
# script writes under /opt/ml/processing/output is uploaded to the matching
# sub-folder of output_s3_uri (train, valid, test, manifest — in that order).
sklearn_processor.run(
    code='preprocessing.py',
    arguments=[
        "--classes", class_selection,
        "--input-data", input_annotation,
    ],
    inputs=[
        ProcessingInput(
            source=s3_raw_data,
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source=f"/opt/ml/processing/output/{split}",
            destination=f"{output_s3_uri}/{split}",
        )
        for split in ("train", "valid", "test", "manifest")
    ],
)


Job Name:  postprocessing-modal-evaluation-preproc-2022-07-23-19-28-28-837
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-987720697751/postprocessing-modal-evaluation/full/data', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-987720697751/postprocessing-modal-evaluation-preproc-2022-07-23-19-28-28-837/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-987720697751/postprocessing-modal-evaluation/outputs/train', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode'

The cell below prints the S3 locations of your test images and annotation files. You will need these for this module.

In [8]:
# Show the S3 locations of the test images and of the annotation manifest.
print(f"Test dataset located here: {output_s3_uri}/test ===========")

print(f"Test annotation file is located here: {output_s3_uri}/manifest ===========")



In [9]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.tensorflow import TensorFlow

# TensorFlow version of the training container.
TF_FRAMEWORK_VERSION = '2.3.2'

# Hyperparameters forwarded to the training entry point.
hyperparameters = dict(
    initial_epochs=5,
    batch_size=8,
    fine_tuning_epochs=20,
    dropout=0.4,
    data_dir='/opt/ml/input/data',
)

# Regexes used to extract metrics from the training job's log output.
# NOTE(review): the unanchored 'loss: ' and 'accuracy: ' patterns are also
# substrings of 'val_loss: ' and 'val_accuracy: ' — confirm against the actual
# log format that the train and validation metrics cannot be mixed up.
metric_definitions = [
    {'Name': 'loss', 'Regex': 'loss: ([0-9\\.]+)'},
    {'Name': 'acc', 'Regex': 'accuracy: ([0-9\\.]+)'},
    {'Name': 'val_loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
    {'Name': 'val_acc', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
]

# Parameter-server distribution is explicitly switched off.
distribution = {'parameter_server': {'enabled': False}}
DISTRIBUTION_MODE = 'FullyReplicated'

# One input per dataset split produced by the preprocessing job.
train_in, val_in, test_in = (
    TrainingInput(s3_data=f'{output_s3_uri}/{split}', distribution=DISTRIBUTION_MODE)
    for split in ('train', 'valid', 'test')
)

# Channel name -> input, as passed to estimator.fit().
inputs = {'train': train_in, 'test': test_in, 'validation': val_in}

# Compute resources for the training job.
training_instance_type = 'ml.c5.4xlarge'
training_instance_count = 1

In [12]:
# S3 location passed to the estimator as its output path.
model_path = f"s3://{bucket}/{prefix}"

estimator = TensorFlow(
    # Training code
    entry_point='train-mobilenet.py',
    source_dir='code',
    # Container
    framework_version=TF_FRAMEWORK_VERSION,
    py_version='py37',
    script_mode=True,
    # Infrastructure
    role=role,
    instance_count=training_instance_count,
    instance_type=training_instance_type,
    distribution=distribution,
    # Job configuration
    base_job_name=prefix,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    output_path=model_path,
)

In [13]:
# Start the training job on the train/test/validation channels.
estimator.fit(inputs=inputs)

2022-07-23 19:38:25 Starting - Starting the training job...
2022-07-23 19:38:50 Starting - Preparing the instances for trainingProfilerReport-1658605105: InProgress
......
2022-07-23 19:39:50 Downloading - Downloading input data...
2022-07-23 19:40:10 Training - Downloading the training image..[34m2022-07-23 19:40:37.517885: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2022-07-23 19:40:37.523133: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2022-07-23 19:40:37.719547: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2022-07-23 19:40:40,553 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2022-07-23 19:40:40,561 sagemaker-training-toolkit INFO     No GPUs dete

In [14]:
# Name of the most recent training job; the evaluation cell below uses it to
# build the S3 path of the model artifacts.
training_job_name = estimator.latest_training_job.name

print(
    "model artifacts file is uploaded here: "
    f"{model_path}/{training_job_name}/output ========"
)





## Model Evaluation Using the TensorFlow FrameworkProcessor

In [28]:
import boto3
from sagemaker.tensorflow import TensorFlow, TensorFlowProcessor
from sagemaker import image_uris

from sagemaker.processing import FrameworkProcessor, ProcessingInput, ProcessingOutput, Processor
from sagemaker import get_execution_role

import uuid

# Look up the TensorFlow 2.3.2 (py37) training image for this region and
# instance type; the evaluation job runs in this container.
image_uri = image_uris.retrieve(
    framework="tensorflow",
    version="2.3.2",
    py_version="py37",
    image_scope="training",
    region=region,
    instance_type='ml.m5.xlarge',
)
print(image_uri)

# S3 destination for the evaluation job's output.
s3_evaluation_output = f's3://{bucket}/{prefix}/outputs/evaluation'

# Processor that runs a Python script inside the TensorFlow container.
tensorflow_processor = FrameworkProcessor(
    estimator_cls=TensorFlow,
    framework_version='2.3.2',
    py_version="py37",
    image_uri=image_uri,
    command=['python3'],
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    base_job_name=prefix,
)

763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.3.2-cpu-py37


In [None]:
# S3 inputs of the evaluation job: test images, annotation manifest, and the
# output folder of the training job.
s3_images = output_s3_uri + '/test'
s3_manifest = output_s3_uri + '/manifest'
s3_model = model_path + '/' + training_job_name + '/output'

print(s3_images, s3_manifest, s3_model)

# Run evaluation.py from src_dir; each S3 input is made available at the paired
# container path, and /opt/ml/processing/evaluation is uploaded to
# s3_evaluation_output.
tensorflow_processor.run(
    code='evaluation.py',
    source_dir='src_dir',
    arguments=["--model-file", "model.tar.gz"],
    inputs=[
        ProcessingInput(source=s3_source, destination=container_path)
        for s3_source, container_path in (
            (s3_images, "/opt/ml/processing/input/test"),
            (s3_manifest, "/opt/ml/processing/input/manifest"),
            (s3_model, "/opt/ml/processing/model"),
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
            destination=s3_evaluation_output,
        ),
    ],
)

s3://sagemaker-us-east-1-987720697751/postprocessing-modal-evaluation/outputs/test s3://sagemaker-us-east-1-987720697751/postprocessing-modal-evaluation/outputs/manifest s3://sagemaker-us-east-1-987720697751/postprocessing-modal-evaluation/postprocessing-modal-evaluation-2022-07-23-19-38-24-988/output

Job Name:  postprocessing-modal-evaluation-2022-07-23-20-16-30-517
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-987720697751/postprocessing-modal-evaluation/outputs/test', 'LocalPath': '/opt/ml/processing/input/test', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-987720697751/postprocessing-modal-evaluation/outputs/manifest', 'LocalPath': '/opt/ml/processing/input/manifest', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplica