## Set up the environment

In [5]:
import time
import os
import sagemaker
import boto3
from sagemaker import get_execution_role
#from sagemaker.tensorflow import TensorFlow
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.analytics import ExperimentAnalytics

# Build the AWS handles shared by the rest of the notebook: one boto3 session,
# a SageMaker client created from it, and a SageMaker SDK session wrapping both.
boto3_session = boto3.Session()
sm_client = boto3_session.client(service_name='sagemaker')
sm_session = sagemaker.Session(
    boto_session=boto3_session,
    sagemaker_client=sm_client,
)

# IAM role handed to the estimator below.
role = get_execution_role()

In [2]:
import sys
!{sys.executable} -m pip install sagemaker-experiments

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

Collecting sagemaker-experiments
  Downloading sagemaker_experiments-0.1.22-py3-none-any.whl (36 kB)
Installing collected packages: sagemaker-experiments
Successfully installed sagemaker-experiments-0.1.22
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.


In [6]:
# S3 bucket name and key prefix for this notebook.
# NOTE(review): neither value is referenced by the cells visible here — presumably
# used for outputs further down; confirm.
bucket, prefix = 'ml-misc-sagemaker', 'sagemaker/script-mode'

In [8]:
# S3 location of the zipped summarisation dataset sample that is passed to the
# training job as its input channel.
input_data = (
    's3://ml-misc-modelstore/summarisation/data/'
    'abc_summarisation_dataset-v2-140720-sample.zip'
)

## Run on SageMaker cloud

SageMaker can extract training metrics directly from the training job's logs, using the regular expressions defined below, and publish them as CloudWatch metrics.

In [18]:
# Metric definitions: each entry names a metric and gives the regex whose single
# capture group extracts its value from a training log line.
#
# The patterns are raw strings so that regex escapes such as \d and \. reach the
# regex engine unchanged; writing '\d' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
#
# NOTE(review): these patterns expect progress lines of the form
# "... - 12s 45us/step - loss: ... - acc: ... - val_loss: ... - val_acc: ...".
# The entry point written by this notebook only prints a test message, so no
# metric will match until the training script logs in this format — confirm.
metric_definition = [
    {'Name': 'train:loss',
     'Regex': r'.*loss: ([0-9\.]+) - acc: [0-9\.]+.*'},
    {'Name': 'train:accuracy',
     'Regex': r'.*loss: [0-9\.]+ - acc: ([0-9\.]+).*'},
    {'Name': 'validation:accuracy',
     'Regex': r'.*step - loss: [0-9\.]+ - acc: [0-9\.]+ - val_loss: [0-9\.]+ - val_acc: ([0-9\.]+).*'},
    {'Name': 'validation:loss',
     'Regex': r'.*step - loss: [0-9\.]+ - acc: [0-9\.]+ - val_loss: ([0-9\.]+) - val_acc: [0-9\.]+.*'},
    {'Name': 'sec/steps',
     'Regex': r'.* - \d+s (\d+)[mu]s/step - loss: [0-9\.]+ - acc: [0-9\.]+ - val_loss: [0-9\.]+ - val_acc: [0-9\.]+'},
]

# Resource tags attached to the training job.
tags = [
    {'Key': 'product', 'Value': 'mlai'},
    {'Key': 'environment', 'Value': 'development'},
    {'Key': 'owner', 'Value': 'ML/AI team'},
]

# Hyperparameters passed through the estimator to the entry-point script.
hyperparameters = {'epochs': 10, 'batch-size': 256}

In [19]:
# Managed spot training settings for the estimator.
train_use_spot_instances = True

# Maximum training run time, in seconds.
train_max_run = 3600

# Maximum total time (waiting for spot capacity plus running), in seconds.
# It is derived from train_max_run instead of repeating the literal so the two
# cannot drift apart: SageMaker rejects a max wait shorter than the max run time.
# It is only set when spot instances are in use.
train_max_wait = train_max_run if train_use_spot_instances else None

In [20]:
%%writefile examples/summary_entry_point.py

# Placeholder training entry point: at the moment it only prints a marker line.
# NOTE(review): presumably a smoke test for the SageMaker job set-up; os and
# argparse are imported but not used yet — confirm they are needed once the real
# training code is added.
import os
import argparse
print("this is a test")

Overwriting examples/summary_entry_point.py


In [26]:
# Absolute path of the local 'examples' folder (resolved against the notebook's
# current working directory); it is passed to the estimator as its source_dir.
source_dir = os.path.join(
    os.getcwd(),
    'examples',
)

In [27]:
# Estimator describing the training job: which script to run, on what hardware,
# and with which hyperparameters, tags, spot settings and metric regexes.
# NOTE(review): spot training is enabled but no checkpoint location is set, so an
# interrupted job presumably restarts from scratch — confirm this is acceptable.
estimator = PyTorch(
    # SageMaker training job names may contain only alphanumerics and hyphens;
    # the previous underscore-separated base name would be rejected when the job
    # is created.
    base_job_name='TEST-hf-summarization-finetuning',
    entry_point='summary_entry_point.py',
    source_dir=source_dir,
    role=role,
    framework_version='0.4',
    py_version='py3',
    hyperparameters=hyperparameters,
    train_instance_count=1,
    # CPU instance for general testing.
    # GPU alternatives: 'ml.p3.8xlarge', or 'ml.p2.xlarge' for GPU testing.
    train_instance_type='ml.c5.xlarge',
    tags=tags,
    train_use_spot_instances=train_use_spot_instances,
    train_max_run=train_max_run,
    train_max_wait=train_max_wait,
    metric_definitions=metric_definition,
)

In [None]:
# Launch the training job with a single input channel named 'data' that points
# at the S3 dataset defined earlier. wait=True is passed explicitly so the cell
# stays attached to the job (per the SageMaker SDK's fit() contract).
remote_inputs = {'data': input_data}
estimator.fit(inputs=remote_inputs, wait=True)