<h1>C4 Solution</h1>

<h3>Get the data and copy it to S3</h3>

In [1]:
%%capture
!wget https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip

In [2]:
%%capture
!unzip dogImages.zip

In [9]:
%%capture
!aws s3 cp dogImages s3://sagemaker-us-east-1-113018293535/ --recursive

<h3>Install and import</h3>

In [1]:
%%capture
!pip install smdebug torch torchvision tqdm

In [2]:
import sagemaker
import boto3
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, ProfilerRule, rule_configs
from sagemaker.debugger import ProfilerConfig, FrameworkProfile
import os

<h3>Set up parameters, estimator, and tuner</h3>

In [5]:
hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(0.001, 0.1),
    "batch_size": CategoricalParameter([32, 64, 128, 256, 512]),
}

role = sagemaker.get_execution_role()

objective_metric_name = "Test Loss"
objective_type = "Minimize"
metric_definitions = [{"Name": "Test Loss", "Regex": "Testing Loss: ([0-9\\.]+)"}]

In [6]:
estimator = PyTorch(
    entry_point="hpo.py",
    base_job_name='pytorch_dog_hpo',
    role=role,
    framework_version="1.4.0",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    py_version='py3'
)

tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=2,
    max_parallel_jobs=2,
    objective_type=objective_type
)

<h3>Fit the tuner</h3>

In [7]:
os.environ['SM_CHANNEL_TRAINING']='s3://sagemaker-us-east-1-113018293535/'
os.environ['SM_MODEL_DIR']='s3://sagemaker-us-east-1-113018293535/model/'
os.environ['SM_OUTPUT_DATA_DIR']='s3://sagemaker-us-east-1-113018293535/output/'
tuner.fit({"training": "s3://sagemaker-us-east-1-113018293535/"})

...............................................................................................................................................................................................................................................................................................................................................................................................................................!


<h3>Describe the tuning results</h3>


In [8]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

exp = HyperparameterTuningJobAnalytics(
  hyperparameter_tuning_job_name='pytorch-training-220225-1914')

jobs = exp.dataframe()

jobs.sort_values('FinalObjectiveValue', ascending=0)

Unnamed: 0,batch_size,learning_rate,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,"""128""",0.015997,pytorch-training-220225-1914-002-2d955779,Completed,581.0,2022-02-25 19:17:11+00:00,2022-02-25 19:38:53+00:00,1302.0
1,"""128""",0.043973,pytorch-training-220225-1914-001-abe5a763,Completed,580.0,2022-02-25 19:16:02+00:00,2022-02-25 19:44:47+00:00,1725.0


## Imp: If kernel dies, how to continue from a completed training job

In [9]:
#BetterTrainingJobName='pytorch-training-210623-2156-001-fdd5e081'

In [10]:
#my_estimator = sagemaker.estimator.Estimator.attach(BetterTrainingJobName)


In [11]:
#my_estimator.hyperparameters()

In [12]:
#best_estimator=my_estimator

<h3>Prepare to perform Training on Best Estimator</h3>

In [13]:
best_estimator=tuner.best_estimator()


2022-03-03 04:32:03 Starting - Preparing the instances for training
2022-03-03 04:32:03 Downloading - Downloading input data
2022-03-03 04:32:03 Training - Training image download completed. Training in progress.
2022-03-03 04:32:03 Uploading - Uploading generated training model
2022-03-03 04:32:03 Completed - Training job completed


In [14]:
best_estimator.hyperparameters()

{'_tuning_objective_metric': '"Test Loss"',
 'batch_size': '"32"',
 'learning_rate': '0.03567339455131772',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"pytorch_dog_hpo-2022-03-03-04-01-48-223"',
 'sagemaker_program': '"hpo.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-113018293535/pytorch_dog_hpo-2022-03-03-04-01-48-223/source/sourcedir.tar.gz"'}

In [15]:
hyperparameters = {"batch_size": int(best_estimator.hyperparameters()['batch_size'].replace('"', '')), \
                   "learning_rate": best_estimator.hyperparameters()['learning_rate']}
hyperparameters

{'batch_size': 32, 'learning_rate': '0.03567339455131772'}

In [16]:
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.poor_weight_initialization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

In [17]:
hook_config = DebuggerHookConfig(
    hook_parameters={
        "train.save_interval": "1",
        "eval.save_interval": "1"
    }
)

profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500, framework_profile_params=FrameworkProfile(num_steps=1)
)

<h2>Creating an Estimator</h2>

In [18]:
# #adjust this cell to accomplish multi-instance training
# estimator = PyTorch(
#     entry_point='hpo.py',
#     base_job_name='dog-pytorch',
#     role=role,
#     instance_count=1,
#     instance_type='ml.m5.xlarge',
#     framework_version='1.4.0',
#     py_version='py3',
#     hyperparameters=hyperparameters,
#     ## Debugger and Profiler parameters
#     rules = rules,
#     debugger_hook_config=hook_config,
#     profiler_config=profiler_config,
# )

In [19]:
# estimator.fit({"training": "s3://sagemaker-us-east-1-113018293535/"}, wait=False)

<h2>Creating an Estimator - Multi-Instance Training,</h2>

In [20]:
###in this cell, create and fit an estimator using multi-instance training
estimator = PyTorch(
    entry_point='hpo.py',
    base_job_name='dog-pytorch',
    role=role,
    instance_count=5,
    instance_type='ml.m5.xlarge',
    framework_version='1.4.0',
    py_version='py3',
    hyperparameters=hyperparameters,
    ## Debugger and Profiler parameters
    rules = rules,
    debugger_hook_config=hook_config,
    profiler_config=profiler_config,
)

In [21]:
estimator.fit({"training": "s3://sagemaker-us-east-1-113018293535/"}, wait=True)

2022-03-03 04:37:18 Starting - Starting the training job...
2022-03-03 04:37:35 Starting - Preparing the instances for trainingVanishingGradient: InProgress
Overfit: InProgress
Overtraining: InProgress
PoorWeightInitialization: InProgress
ProfilerReport: InProgress
.........
2022-03-03 04:39:21 Downloading - Downloading input data.....................
2022-03-03 04:42:59 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[32mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[32mbash: no job control in this shell[0m
[34m2022-03-03 04:42:53,534 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-03-03 04:42:53,537 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-03-03 04:42:53,548 sagemaker_pytorch_container.trainin

CredentialRetrievalError: Error when retrieving credentials from iam-role: Credential refresh failed, response did not contain: access_key, secret_key, token, expiry_time

<h2>Deployment</h2>

In [23]:
model_location=estimator.model_data
model_location

's3://sagemaker-us-east-1-113018293535/dog-pytorch-2022-03-03-04-37-16-413/output/model.tar.gz'

In [24]:
import sagemaker
import boto3
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, ProfilerRule, rule_configs
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor


In [25]:
jpeg_serializer = sagemaker.serializers.IdentitySerializer("image/jpeg")
json_deserializer = sagemaker.deserializers.JSONDeserializer()


class ImagePredictor(Predictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super(ImagePredictor, self).__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=jpeg_serializer,
            deserializer=json_deserializer,
        )

In [26]:
pytorch_model = PyTorchModel(model_data=model_location, role=role, entry_point='infernce2.py',py_version='py3',
                             framework_version='1.4',
                             predictor_cls=ImagePredictor)

In [28]:
predictor = pytorch_model.deploy(initial_instance_count=1, instance_type='ml.m5.large',endpoint_name='dog-image-endpoint')


-----!

In [None]:
import requests
#request_dict={ "url": "https://cdn1-www.cattime.com/assets/uploads/2011/12/file_2744_british-shorthair-460x290-460x290.jpg" }
request_dict={ "url": "https://s3.amazonaws.com/cdn-origin-etr.akc.org/wp-content/uploads/2017/11/20113314/Carolina-Dog-standing-outdoors.jpg" }

img_bytes = requests.get(request_dict['url']).content
type(img_bytes)

In [None]:
from PIL import Image
import io
Image.open(io.BytesIO(img_bytes))

In [None]:
response=predictor.predict(img_bytes, initial_args={"ContentType": "image/jpeg"})

In [None]:
import json
response2=predictor.predict(json.dumps(request_dict), initial_args={"ContentType": "application/json"})

In [None]:
type(response2[0][0])

In [None]:
response2[0]

In [None]:
import torch
import numpy as np
np.argmax(response, 1)

In [None]:
predictor