# TODO: Title
**TODO**: Give a helpful introduction to what this notebook is for. Remember that comments, explanations and good documentation make your project informative and professional.

**Note:** This notebook has a bunch of code and markdown cells with TODOs that you have to complete. These are meant to be helpful guidelines for you to finish your project while meeting the requirements in the project rubrics. Feel free to change the order of the TODO's and/or use more than one cell to complete all the tasks.

In [None]:
# TODO: Install any packages that you might need
!pip install smdebug -q -U
!pip install torchvision -q
!pip install timm -q
!pip install bokeh -q

In [None]:
pip install --upgrade boto3

In [None]:
pip install --upgrade sagemaker==2.219.0

In [None]:
# Bokeh was needed for ProfileReport, but new version was raising error, older version is working fine. 
!pip install --upgrade bokeh==2.4.3

In [None]:
# TODO: Import any packages that you might need
import sagemaker
from sagemaker import Session
from sagemaker.tuner import (IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner, HyperparameterTuningJobAnalytics)
from sagemaker.pytorch import PyTorch
from sagemaker.debugger import (Rule, ProfilerRule, rule_configs, DebuggerHookConfig, ProfilerConfig, FrameworkProfile)
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker.serializers import IdentitySerializer, JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.estimator import Estimator
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts
import boto3
import shutil
import requests
import os
import io
from io import BytesIO
import json
import random
from tqdm import tqdm
import pandas as pd
import IPython
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import torchvision.transforms as transforms

In [None]:
# Notebook-wide switches that decide whether expensive SageMaker jobs are re-run.
#   run_hyperparameter_job: 1 -> launch a new tuning job, 0 -> fetch an already trained one
#   train_a_model:          1 -> train a new model,       0 -> fetch an already trained one
notebook_config = dict(
    run_hyperparameter_job=0,
    train_a_model=0,
)

In [None]:
# A boto3 session is used instead of the plain sagemaker.Session() default because it is
# more customizable (AWS keys, profile, region, ...).
session = boto3.Session()
sagemaker_session = Session(boto_session=session)

# Values derived from the two sessions above.
region = session.region_name
bucket = sagemaker_session.default_bucket()

# Execution role used by the SageMaker jobs, and a low-level SageMaker API client.
role = sagemaker.get_execution_role()
sagemaker_client = boto3.client('sagemaker')

In [None]:
# Show which bucket and role the rest of the notebook will use.
for info_line in (f"Default Bucket: {bucket}", f"RoleArn: {role}"):
    print(info_line)

In [None]:
# S3 locations of the three dataset splits; all of them live under one project prefix.
_S3_DATA_ROOT = "s3://sagemaker-us-east-1-462848125024/aws_capstone_project"
train = f"{_S3_DATA_ROOT}/train_data/"
valid = f"{_S3_DATA_ROOT}/valid_data/"
test = f"{_S3_DATA_ROOT}/test_data/"

In [None]:
# In this cell we are defining some functions that will be used inside the notebook

def train_hyperparameter_tuning_job(train, test, valid):
    """
    Launch a hyperparameter tuning job from scratch and wait for it to finish.

    Hyperparameter ranges, the objective metric and the PyTorch / tuner settings are configured
    inside this function; they are not read from a separate config. Besides its arguments, the
    function reads the notebook-level `role` variable.

    Inputs:
        train: S3 location of the training data (passed as the "training" channel)
        test: S3 location of the test data (passed as the "test" channel)
        valid: S3 location of the validation data (passed as the "valid" channel)

    Outputs:
        best_estimator: estimator for the best training job found by the tuner
        tuner: the fitted HyperparameterTuner object
    """

    hyperparameter_ranges = {
        "batch-size": CategoricalParameter([8, 128]),
        "epochs": IntegerParameter(4, 16)
    }

    # The objective name must match the metric definition name exactly, so it is defined once.
    objective_metric_name = "average test loss"
    objective_type = "Minimize"
    metric_definitions = [
        # The metric value is captured from log lines of the form "Test Loss: <number>".
        {"Name": objective_metric_name, "Regex": r"Test Loss: ([+-]?[0-9\.]+)"}
    ]

    estimator = PyTorch(
        entry_point="hpo_revised.py",
        base_job_name='pyt-hpo-rev',
        role=role,
        py_version="py38",
        framework_version="1.12",
        instance_count=1,
        instance_type="ml.g4dn.xlarge"
    )

    tuner = HyperparameterTuner(
        estimator=estimator,
        early_stopping_type="Auto",
        metric_definitions=metric_definitions,
        objective_metric_name=objective_metric_name,
        objective_type=objective_type,
        max_jobs=4,
        max_parallel_jobs=2,
        hyperparameter_ranges=hyperparameter_ranges,
        base_tuning_job_name='pyt-hpo-rev'
    )

    # wait=True blocks until the tuning job has finished, so the best estimator is available below.
    tuner.fit(
        {
            "training": train,
            "valid": valid,
            "test": test,
        },
        wait=True
    )

    best_estimator = tuner.best_estimator()

    return best_estimator, tuner

def train_hyp_tuning_job_summarizer(tuner):
    """
    Print a short summary of the tuner's latest tuning job: its name, the training job with the
    lowest final objective value, and that job's hyperparameters.

    Uses the notebook-level `sagemaker_client` to describe the best training job.

    Inputs:
        tuner: tuner object returned by the training function

    Outputs:
        tuning_job_name: name of the tuner's latest tuning job
    """
    tuning_job_name = tuner.latest_tuning_job.name
    print(f"Latest hyperparameter job name : {tuning_job_name}")

    # Rank all training jobs of the tuning job; the objective is minimized, so lowest comes first.
    results = HyperparameterTuningJobAnalytics(tuning_job_name).dataframe()
    ranked = results.sort_values('FinalObjectiveValue', ascending=True)
    best_training_job_name = ranked.iloc[0]['TrainingJobName']
    print(f"Best training job: {best_training_job_name}")

    # The hyperparameters are part of the training job description.
    description = sagemaker_client.describe_training_job(TrainingJobName=best_training_job_name)
    best_hyperparameters = description.get('HyperParameters', {})
    print(f"Best hyperparameters: {best_hyperparameters}")

    return tuning_job_name

In [None]:
def fetch_hyperparameter_tuning_job(sagemaker_client):
    """
    Fetch the most recently created 'pyt-hpo-rev' hyperparameter tuning job, print its best
    training job and hyperparameters, and build an estimator object from that training job.
    This way you do not need to re-train the tuning job from scratch and can continue with the
    other cells.

    Besides its argument, this function reads the notebook-level `role` variable.

    Inputs:
        sagemaker_client: boto3 SageMaker client used to list tuning jobs and describe training jobs
    Outputs:
        best_estimator: Estimator built from the best training job's image, resources and hyperparameters
        tuning_job_name: name of the latest hyperparameter tuning job that was found

    Raises:
        Exception: if no matching tuning job exists, or the latest one has no training jobs yet
    """

    # Get the list of hyperparameter tuning jobs, newest first
    tuning_jobs = sagemaker_client.list_hyper_parameter_tuning_jobs(
        NameContains='pyt-hpo-rev',
        SortBy='CreationTime',
        SortOrder='Descending',
        MaxResults=10
    )
    print(tuning_jobs)
    if not tuning_jobs['HyperParameterTuningJobSummaries']:
        raise Exception("No hyperparameter tuning jobs found")

    latest_tuning_job = tuning_jobs['HyperParameterTuningJobSummaries'][0]
    tuning_job_name = latest_tuning_job['HyperParameterTuningJobName']

    print(f"Latest tuning job: {tuning_job_name}")

    # Initialize HyperparameterTuningJobAnalytics to get the best training job
    tuner_analytics = HyperparameterTuningJobAnalytics(tuning_job_name)
    training_job_results = tuner_analytics.dataframe()
    if training_job_results.empty:
        # Without this guard the sort below fails with an obscure KeyError on the missing column.
        raise Exception(f"Tuning job {tuning_job_name} has no training jobs to choose from")

    # The objective is minimized, so the best training job has the lowest final objective value
    best_training_job = training_job_results.sort_values('FinalObjectiveValue', ascending=True).iloc[0]

    best_training_job_name = best_training_job['TrainingJobName']
    print(f"Best training job: {best_training_job_name}")

    # Describe the best training job once; hyperparameters, model artifact, image and
    # resource configuration are all read from this single response.
    best_training_job_description = sagemaker_client.describe_training_job(
        TrainingJobName=best_training_job_name
    )

    best_hyperparameters = best_training_job_description.get('HyperParameters', {})
    print(f"Best hyperparameters: {best_hyperparameters}")

    model_artifact = best_training_job_description['ModelArtifacts']['S3ModelArtifacts']
    resource_config = best_training_job_description['ResourceConfig']

    # Create the estimator object using the best training job details
    best_estimator = Estimator(
        model_data=model_artifact,
        role=role,
        image_uri=best_training_job_description['AlgorithmSpecification']['TrainingImage'],
        instance_count=resource_config['InstanceCount'],
        instance_type=resource_config['InstanceType'],
        hyperparameters=best_hyperparameters
    )

    print(f"Best estimator created with model artifact: {model_artifact}")

    return best_estimator, tuning_job_name

In [None]:
def train_a_model_from_scratch(train, test, valid, role, best_hyperparameters, rules, profiler_config, debugger_config):
    """
    Train a prediction model from scratch with the best hyperparameters found earlier.

    The job is started with debugger rules and profiler settings attached so that a profiler
    report can be produced at later stages. The PyTorch settings (entry point, versions,
    instance type and count) are set inside this function, not in the config dictionary.

    Inputs:
        train: S3 location of the training data (passed as the "training" channel)
        test: S3 location of the test data (passed as the "test" channel)
        valid: S3 location of the validation data (passed as the "valid" channel)
        role: role used by the SageMaker training job
        best_hyperparameters: dictionary with the best hyperparameters
        rules: debugger / profiler rules to attach to the job
        profiler_config: profiler configuration
        debugger_config: debugger hook configuration

    Outputs:
        estimator: the fitted PyTorch estimator
        training_job_name: name of the training job started by this function
    """
    data_channels = {
        "training": train,
        "valid": valid,
        "test": test,
    }

    estimator = PyTorch(
        entry_point="train_revised.py",
        base_job_name='pytorch-train-rev',
        role=role,
        framework_version="1.12",
        py_version="py38",
        instance_type="ml.g4dn.xlarge",
        instance_count=2,
        hyperparameters=best_hyperparameters,
        rules=rules,
        profiler_config=profiler_config,
        debugger_hook_config=debugger_config,
    )

    # Block until the job finishes so that the job name and outputs exist afterwards.
    estimator.fit(data_channels, wait=True)

    training_job_name = estimator.latest_training_job.name
    print(f"Training jobname: {training_job_name}")

    return estimator, training_job_name

In [None]:
def fetch_a_trained_model():
    """
    Fetch the most recently created *completed* training job whose name contains
    `base_job_name` and attach a PyTorch estimator to it. The `base_job_name` value (and any
    other listing parameters) is set inside this function.

    Uses the notebook-level `sagemaker_client` to list training jobs.

    Inputs:
        None

    Outputs:
        estimator: estimator object attached to the latest completed training job
        latest_training_job_name: name of that training job

    Raises:
        Exception: if no completed training job matches `base_job_name`
    """

    base_job_name = 'pytorch-train-rev'

    # Newest first, completed jobs only
    training_jobs = sagemaker_client.list_training_jobs(
        NameContains=base_job_name,
        StatusEquals='Completed',
        SortBy='CreationTime',
        SortOrder='Descending',
        MaxResults=10
    )

    job_summaries = training_jobs['TrainingJobSummaries']
    if not job_summaries:
        # Explicit error (same style as fetch_hyperparameter_tuning_job) instead of a bare IndexError
        raise Exception(f"No completed training jobs found with name containing '{base_job_name}'")

    latest_training_job_name = job_summaries[0]['TrainingJobName']
    print(f"Training jobname: {latest_training_job_name}")

    estimator = PyTorch.attach(latest_training_job_name)

    return estimator, latest_training_job_name

## Model Training
**TODO:** This is the part where you can train a model. The type or architecture of the model you use is not important. 

**Note:** You will need to use the `train.py` script to train your model.

## Hyperparameter Tuning
**TODO:** Here you can perform hyperparameter tuning to increase the performance of your model. You are encouraged to 
- tune as many hyperparameters as you can to get the best performance from your model
- explain why you chose to tune those particular hyperparameters and the ranges.

In [None]:
# Either launch a fresh tuning job or reuse the most recent one, depending on the config flag.
if notebook_config['run_hyperparameter_job'] == 1:
    best_estimator, tuner = train_hyperparameter_tuning_job(train, test, valid)
    tuning_job_name = train_hyp_tuning_job_summarizer(tuner)
elif notebook_config['run_hyperparameter_job'] == 0:
    best_estimator, tuning_job_name = fetch_hyperparameter_tuning_job(sagemaker_client)
else:
    # Fail fast: just printing would leave best_estimator / tuning_job_name undefined and the
    # following cells would fail later with a confusing NameError.
    raise ValueError(
        "Not a valid selection for run_hyperparameter_job parameter: "
        f"{notebook_config['run_hyperparameter_job']!r} (expected 0 or 1)"
    )

In [None]:
# Load the results of every training job in the selected tuning job.
exp = HyperparameterTuningJobAnalytics(hyperparameter_tuning_job_name=tuning_job_name)
jobs = exp.dataframe()

# Displayed as the cell output: training jobs ordered from lowest to highest final objective value.
jobs.sort_values('FinalObjectiveValue', ascending=True)

In [None]:
# Read the tuned values once and keep only the two hyperparameters that were tuned.
_tuned_hyperparameters = best_estimator.hyperparameters()

best_hyperparameters = {
    # The batch size is stripped of its double quotes before being converted to int.
    "batch-size": int(_tuned_hyperparameters["batch-size"].replace('"', "")),
    "epochs": _tuned_hyperparameters["epochs"],
}

print(best_hyperparameters)

### Model Profiling and Debugging
**TODO:** Use model debugging and profiling to better monitor and debug your model training job.

In [None]:
# Built-in debugger rules to evaluate during training, followed by the profiler report rule.
_debugger_rule_factories = (
    rule_configs.vanishing_gradient,
    rule_configs.overfit,
    rule_configs.overtraining,
    rule_configs.poor_weight_initialization,
)
rules = [Rule.sagemaker(make_rule_config()) for make_rule_config in _debugger_rule_factories]
rules.append(ProfilerRule.sagemaker(rule_configs.ProfilerReport()))

# System metrics are sampled every 500 ms; framework profiling is configured with num_steps=10.
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(num_steps=10),
)

# Debugger hook save intervals for the train and eval modes.
debugger_config = DebuggerHookConfig(
    hook_parameters={
        "train.save_interval": "100",
        "eval.save_interval": "10",
    }
)

### Training and Evaluation / Multi Instance
**TODO:** We'll use multi-instance training to have a faster training process:

In [None]:
# Either train a new model or reuse the latest completed training job, depending on the config flag.
if notebook_config['train_a_model'] == 1:
    estimator, training_job_name = train_a_model_from_scratch(train, test, valid,
                                    role, best_hyperparameters, rules, profiler_config, debugger_config)
elif notebook_config['train_a_model'] == 0:
    estimator, training_job_name = fetch_a_trained_model()
else:
    # Fail fast: just printing would leave estimator / training_job_name undefined and the
    # following cells would fail later with a confusing NameError.
    raise ValueError(
        "Not a valid selection for train_a_model parameter: "
        f"{notebook_config['train_a_model']!r} (expected 0 or 1)"
    )

In [None]:
# Create the smdebug TrainingJob helper for the selected training job in the session's region.
tj = TrainingJob(training_job_name, region)
# Presumably blocks until the job's system profiling data can be read — confirm in the smdebug docs.
tj.wait_for_sys_profiling_data_to_be_available()

We can now plot the debugging output of our training process:

In [None]:
# Get the reader for the job's system metrics and refresh its list of event files.
system_metrics_reader = tj.get_systems_metrics_reader()
system_metrics_reader.refresh_event_file_list()

# Chart only the "total" events of the CPU and GPU dimensions; no framework metrics are plotted.
_timeline_options = {
    "framework_metrics_reader": None,
    "select_dimensions": ["CPU", "GPU"],
    "select_events": ["total"],
}
view_timeline_charts = TimelineCharts(system_metrics_reader, **_timeline_options)

For memory utilization, the behavior is expected: training runs on the GPU while the evaluation phase runs on the CPU, which increases data transfers between CPU and GPU. Because of budget limitations, we prefer to accept these transfers rather than keep both training and evaluation on the GPU at all times.

For CPU and GPU utilization, we can see oscillations in both. The CPU utilization oscillations have higher peaks, which may originate from the computational load of the model evaluation process. It also shows that most of the time is spent in training rather than evaluation; in other words, training takes up a larger portion of the total run time than evaluation.

Since this is a multi-instance training job with two instances, all of the parameters mentioned above should be considered for both instances (nodes).

Now it's time to take a look at profiler.

In [None]:
# Rule outputs are stored under "<output_path><training job name>/rule-output".
_profiled_job_name = estimator.latest_training_job.job_name
rule_output_path = f"{estimator.output_path}{_profiled_job_name}/rule-output"
print(f"You will find the profiler report in {rule_output_path}")

In [None]:
! aws s3 ls {rule_output_path} --recursive
! aws s3 cp {rule_output_path} ./ --recursive

In [None]:
# Collect the names of all rules whose configuration name mentions "Profiler" and use the first one.
_profiler_rule_names = []
for _rule_summary in estimator.latest_training_job.rule_job_summary():
    if "Profiler" in _rule_summary["RuleConfigurationName"]:
        _profiler_rule_names.append(_rule_summary["RuleConfigurationName"])

profiler_report_name = _profiler_rule_names[0]

In [None]:
# Display the profiler report inline. The path is relative to the working directory and points
# at "<profiler rule name>/profiler-output/profiler-report.html".
IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")

In [None]:
# Request side: an identity serializer with content type image/jpeg, plus a JSON serializer.
jpeg_serializer = IdentitySerializer(content_type="image/jpeg")
json_serializer = JSONSerializer()

# Response side: responses are deserialized from JSON.
json_deserializer = JSONDeserializer()

In [None]:
class ImagePredictor(Predictor):
    """Predictor preconfigured with the notebook's JPEG serializer and JSON deserializer."""

    def __init__(self, endpoint_name, sagemaker_session):
        # jpeg_serializer / json_deserializer are the notebook-level objects defined earlier.
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=jpeg_serializer,
            deserializer=json_deserializer,
        )

In [None]:
def get_model_location(training_job_name):
    """Return the S3 location of the model artifacts produced by the given training job."""
    # A fresh SageMaker client is created on every call.
    description = boto3.client('sagemaker').describe_training_job(TrainingJobName=training_job_name)
    return description['ModelArtifacts']['S3ModelArtifacts']

In [None]:
# Look up the model artifact location from the training job instead of hard-coding it.
model_location = get_model_location(training_job_name)

for _label, _value in (("Model location", model_location), ("Role", role)):
    print(f"{_label}: {_value}")

# Also show the artifact location as reported by the estimator itself.
print(estimator.model_data)

In [None]:
# Environment variables passed to the model container.
_inference_env = {
    'MODEL_PATH': 'model.pth',
    'NUM_CLASSES': '5',
    'IMAGE_SIZE': '224',
}

# Model object for deployment. py_version and framework_version should match the versions
# used in training, and entry_point must be the correct path to the inference script.
pytorch_model = PyTorchModel(
    entry_point='inference.py',
    model_data=model_location,
    role=role,
    framework_version='1.12',
    py_version='py38',
    predictor_cls=ImagePredictor,
    env=_inference_env,
)

In [None]:
# Deploy the model to a single-instance endpoint (ml.m4.xlarge rather than the ml.m5.large tried before).
_deploy_options = dict(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    timeout=6000,
)
predictor = pytorch_model.deploy(**_deploy_options)

In [None]:
def generate_presigned_url(bucket_name, object_key, expiration=3600):
    """
    Create a pre-signed GET URL for an S3 object.

    Inputs:
        bucket_name: name of the S3 bucket
        object_key: key of the object inside the bucket
        expiration: value passed as ExpiresIn (default 3600)

    Outputs:
        the pre-signed URL, or None if generating it raised an exception (the error is printed)
    """
    s3_client = boto3.client('s3')
    object_params = {'Bucket': bucket_name, 'Key': object_key}

    try:
        return s3_client.generate_presigned_url(
            'get_object',
            Params=object_params,
            ExpiresIn=expiration,
        )
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None

# Sample validation image used to test the endpoint.
bucket_name = 'sagemaker-us-east-1-462848125024'
object_key = 'aws_capstone_project/valid_data/3/00037.jpg'

presigned_url = generate_presigned_url(bucket_name=bucket_name, object_key=object_key)
print(f"Pre-Signed URL: {presigned_url}")

In [None]:
# Prepare the image URL for inference
#request_dict = {
#    "url": "https://sagemaker-us-east-1-462848125024.s3.amazonaws.com/aws_capstone_project/valid_data/3/00037.jpg"
#}
# Load the image bytes
#img_bytes = requests.get(request_dict['url']).content
#print(img_bytes)

In [None]:
# Load the image bytes using the pre-signed URL.
# A timeout keeps the cell from hanging forever, and raise_for_status() turns an expired or
# denied URL into a clear HTTP error instead of a confusing PIL error on an XML error body.
image_response = requests.get(presigned_url, timeout=30)
image_response.raise_for_status()
img_bytes = image_response.content

img = Image.open(BytesIO(img_bytes)).convert('RGB')
img.show()

In [None]:
# JSON-style request payload that refers to the image by its pre-signed URL.
request_dict = dict(url=presigned_url)

In [None]:
# Send the raw JPEG bytes to the endpoint and print whatever the predictor returns.
_invoke_args = {"ContentType": "image/jpeg"}
response = predictor.predict(img_bytes, initial_args=_invoke_args)

print("Prediction response:", response)


In [None]:
# Clean up (delete) the endpoint when done to avoid unnecessary costs.
# After this call `predictor` can no longer be used for predictions; re-run the deploy cell first.
predictor.delete_endpoint()