In [None]:
import os
import sys
import logging
import torch

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.pytorch import PyTorch
from botocore.exceptions import ClientError

In [None]:
# Create the SageMaker session first; bucket, role and region are all taken
# from it so that every client below talks to the same account and region.
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

# Use the session's own region rather than a hard-coded one, so the region
# printed below (and used by `sm`) always matches the session's region.
region = session.boto_region_name
sm = session.boto_session.client(service_name="sagemaker", region_name=region)

# S3 locations of the train/test CSV files under the default bucket.
train_input_path = f"s3://{bucket}/imdb/data/small/train.csv"
test_input_path = f"s3://{bucket}/imdb/data/small/test.csv"

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")

## Training

In [None]:
# Metrics to extract from the training logs. Each regex expects a log entry of
# the form "'<name>': <number>" and captures the number in its first group.
_METRIC_NAMES = [
    'loss',
    'learning_rate',
    'eval_loss',
    'eval_accuracy',
    'eval_f1',
    'eval_precision',
    'eval_recall',
    'eval_runtime',
    'eval_samples_per_second',
    'epoch',
]

# One shared number pattern (raw string, literal dot):
#   digits, optional fractional part, optional exponent -> 1, 0.693, 5e-05, 4.5e-05
# The outer group is group 1, so the whole number (including any exponent) is
# captured rather than only its mantissa.
_NUMBER_REGEX = r"([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"

metric_definitions = [
    {'Name': name, 'Regex': f"'{name}': {_NUMBER_REGEX},?"}
    for name in _METRIC_NAMES
]

In [None]:
# Hyperparameters handed to the training entry point (train.py in source/).
hyperparameters = dict(
    epochs=1,
    train_batch_size=32,
    model_name="distilbert-base-uncased",
)

# NOTE(review): instance_type="local" runs the job in SageMaker local mode.
# The metrics section further down queries the job through
# TrainingJobAnalytics -- confirm that works for local runs, or switch to an
# ml.* instance type.
estimator = PyTorch(
    source_dir="source",
    entry_point="train.py",
    framework_version="1.7.1",
    py_version="py3",
    role=role,
    instance_type="local",
    instance_count=1,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
)

# Channel name -> S3 URI of the data for that channel.
data_channels = {'train': train_input_path, 'test': test_input_path}
estimator.fit(data_channels)

## Accessing Training Metrics

The training job doesn't emit metrics immediately. For example, it first needs to provision a training instance, download the training image, and download the data. Additionally, in this demo the first evaluation logs come after 500 steps (the default in the Hugging Face Trainer: https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments).

Hence, **run the section below 15 to 20 minutes after launching the training; otherwise, metrics may not be available yet and the call may return an error.**

Note that you can also copy this code and run it from a different place (as long as it is connected to the cloud and authorized to use the API) by specifying the exact training job name in the `TrainingJobAnalytics` API call.

In [None]:
from sagemaker import TrainingJobAnalytics

# Look up the job launched by `estimator` and load its captured metrics
# into a pandas DataFrame.
job_name = estimator.latest_training_job.name
job_analytics = TrainingJobAnalytics(training_job_name=job_name)
df = job_analytics.dataframe()

# Preview the first ten rows.
df.head(10)

We can also plot some of the collected metrics:

*Note: the plots below were generated at the end of the training job, with metrics available for the whole training duration.*

In [None]:
!pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Wide default figure size for the plots that follow.
plt.rcParams.update({'figure.figsize': [15, 5]})

In [None]:
# Colour of each plotted metric. A name -> colour mapping (instead of a
# positional list) keeps colours stable whatever order the metrics appear in
# `df`, and still works when one of the metrics has not been emitted yet.
eval_palette = {'eval_accuracy': 'blue', 'eval_precision': 'purple'}
loss_palette = {'loss': 'orange', 'eval_loss': 'red'}

evals = df[df.metric_name.isin(list(eval_palette))]
losses = df[df.metric_name.isin(list(loss_palette))]

# Explicit figure/axes instead of relying on pyplot's implicit "current axes".
fig, ax = plt.subplots()

# Left y-axis: evaluation metrics.
sns.lineplot(
    x='timestamp',
    y='value',
    data=evals,
    hue='metric_name',
    palette=eval_palette,
    ax=ax)
ax.set_xlabel('timestamp')
ax.set_ylabel('evaluation metric value')
ax.legend(loc='upper left')

# Right y-axis (shared x-axis): training and evaluation loss.
ax2 = ax.twinx()
sns.lineplot(
    x='timestamp',
    y='value',
    data=losses,
    hue='metric_name',
    palette=loss_palette,
    ax=ax2)
ax2.set_ylabel('loss')
# Opposite corner from the first legend so the two do not overlap.
ax2.legend(loc='upper right')

ax.set_title('Training job metrics')
plt.show()