## Set up the environment

In [18]:
import time
import os
import sagemaker
import boto3
from sagemaker import get_execution_role
#from sagemaker.tensorflow import TensorFlow
#from sagemaker.pytorch.estimator import PyTorch
from sagemaker.pytorch import PyTorch
from sagemaker.analytics import ExperimentAnalytics

# Create one boto3 session and reuse it for both the low-level SageMaker client
# and the SageMaker SDK session, so both are built from the same boto3 session.
boto3_session = boto3.Session()
sm_client = boto3_session.client(service_name='sagemaker')
sm_session = sagemaker.Session(
    sagemaker_client=sm_client,
    boto_session=boto3_session,
)

# IAM role the training job will run under.
role = get_execution_role()

In [19]:
import sys
!{sys.executable} -m pip install sagemaker-experiments

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [20]:
# S3 bucket and key prefix under which this notebook stores its outputs.
bucket, prefix = 'ml-misc-modelstore', 'summarisation/output'
# Name of the model variant produced by this run.
# Alternative (presumably paired with the '-sample' dataset): 'distilbart-xsum-abc-v2-sample'
model = 'distilbart-xsum-abc-v2'

In [21]:
# S3 prefix of the summarisation dataset handed to the training job.
# The '-sample' variant of the same dataset lives at:
#   's3://ml-misc-modelstore/summarisation/data/abc_summarisation_dataset-v2-140720-sample/'
input_data = (
    's3://ml-misc-modelstore/summarisation/data/'
    'abc_summarisation_dataset-v2-140720/'
)

In [22]:
# S3 location for this model's outputs, laid out as s3://<bucket>/<prefix>/<model>.
output_path = 's3://' + '/'.join((bucket, prefix, model))
output_path

's3://ml-misc-modelstore/summarisation/output/distilbart-xsum-abc-v2-sample'

## Run on SageMaker cloud

SageMaker can extract training metrics directly from the job's log output, using the regular expressions supplied as metric definitions, and publish them to CloudWatch metrics.

In [23]:
# Prefix used for the generated training job names.
job_name = 'hf-sum-finetuning'

# One metric definition per ROUGE score. Each regex captures the number that
# follows "<metric name>': " in a log line and requires a comma after it.
metric_definition = [
    {'Name': metric, 'Regex': f".*{metric}': ([0-9\\.]+),.*"}
    for metric in ('test_avg_rouge1', 'test_avg_rouge2', 'test_avg_rougeL')
]

# Resource tags attached to the training job, as a list of Key/Value pairs.
tags = [
    {'Key': key, 'Value': value}
    for key, value in (
        ('product', 'mlai'),
        ('environment', 'development'),
        ('owner', 'ML/AI team'),
    )
]
# Hyperparameters passed to the training job.
hyperparameters = dict(
    # --- settings taken from the .sh file ---
    # (fp16, do_train and do_predict are deliberately left unset here)
    learning_rate=3e-5,
    gpus=1,
    n_val=1000,  # use 32 for quick testing only
    val_check_interval=0.1,

    # --- settings taken from the GitHub instructions ---
    data_dir='/opt/ml/input/data/data/',
    train_batch_size=16,  # previously 8
    eval_batch_size=16,  # previously 8
    output_dir='/opt/ml/model/',
    num_train_epochs=1,
    model_name_or_path='sshleifer/distilbart-xsum-12-6',
    max_target_length=78,
    val_max_target_length=78,
    test_max_target_length=130,
)

In [24]:
#spot instance params
train_use_spot_instances = True
train_max_run=3600*24
train_max_wait = 3600*24 if train_use_spot_instances else None

In [25]:
%%writefile examples/summary_entry_point.py 

# NOT used for now: the estimator runs seq2seq/finetune.sh directly as its entry point.
import os
import argparse
import torch
import subprocess
# Start-up diagnostics: confirm the script is running and show its working directory.
print("this is a test")
print(f"current dir: {os.getcwd()}")
# Disabled earlier attempts to unpack a zipped sample dataset via shell commands.
#print(os.popen("unzip /opt/ml/input/data/zipped/abc_summarisation_dataset-v2-140720-sample.zip -d /opt/ml/input/data/").read())
#os.system("unzip /opt/ml/input/data/zipped/abc_summarisation_dataset-v2-140720-sample.zip -d /opt/ml/input/data/")
#os.system("askldjlkasjdiwqler")

# The triple-quoted string below is a bare expression and is never executed.
# It preserves a disabled approach: install `unzip` with conda, then extract the
# sample archive through subprocess while echoing each output line.
'''
p = subprocess.Popen(["conda", "install", "-y", "-c conda-forge", "unzip"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for i in p.stdout:
   print(i)

p = subprocess.Popen(["unzip", "/opt/ml/input/data/zipped/abc_summarisation_dataset-v2-140720-sample.zip", "-d", "/opt/ml/input/data/"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for i in p.stdout:
   print(i)
'''

import pandas as pd

def readlinestodf(filepath):
    """Load a UTF-8 text file into a single-column DataFrame, one row per line.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A pandas DataFrame with one column named 'text'. Each row holds one
        line of the file exactly as read, including its trailing newline.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        rows = list(handle)
    frame = pd.DataFrame(rows, columns=['text'])
    return frame


# Print the (rows, columns) shape of each training file, in this order:
# target, source, title. `data` ends up holding the last frame read.
for split_file in (
    '/opt/ml/input/data/data/train.target',
    '/opt/ml/input/data/data/train.source',
    '/opt/ml/input/data/data/train.title',
):
    data = readlinestodf(split_file)
    print(data.shape)

# Optional debugging helpers, left disabled:
#print(os.popen("ls /opt/ml/ -lR").read())
#print(os.popen("pip list").read())

# Report the PyTorch build and the GPUs visible to it, then dump nvidia-smi output.
torch_build = torch.__version__
print(f"torch version {torch_build}")
gpu_total = torch.cuda.device_count()
print(f"GPU count: {gpu_total}")
smi_report = os.popen("nvidia-smi").read()
print(smi_report)

Overwriting examples/summary_entry_point.py


In [26]:
# Absolute path of the local `examples` folder, resolved against the current
# working directory; this is the directory handed to the estimator as source_dir.
_working_dir = os.getcwd()
source_dir = os.path.join(_working_dir, 'examples')

In [27]:
# Configure the PyTorch estimator for the fine-tuning job. All values come from
# the variables defined in the earlier cells.
estimator = PyTorch(
    # Code to run: the shell script inside the local source directory.
    # ('summary_entry_point.py' is the alternative, diagnostic entry point.)
    entry_point='seq2seq/finetune.sh',
    source_dir=source_dir,
    hyperparameters=hyperparameters,

    # Framework container.
    framework_version='1.5.0',
    py_version='py3',

    # Compute. 'ml.p3.2xlarge' is for actual training; alternatives used before:
    # 'ml.p2.xlarge' (gpu testing), 'ml.m5.large' (general testing),
    # 'local_gpu' (local mode).
    train_instance_type='ml.p3.2xlarge',
    train_instance_count=1,

    # Spot-instance settings.
    train_use_spot_instances=train_use_spot_instances,
    train_max_run=train_max_run,
    train_max_wait=train_max_wait,

    # Job naming, permissions and S3 locations (code and outputs share one path).
    base_job_name=job_name,
    role=role,
    code_location=output_path,
    output_path=output_path,

    # Bookkeeping: resource tags and the log-derived metric definitions.
    tags=tags,
    metric_definitions=metric_definition,
)

In [28]:
# Pass the S3 dataset to the job as a single input channel named 'data',
# then start training and block until the job finishes.
remote_inputs = dict(data=input_data)
estimator.fit(inputs=remote_inputs, wait=True)

INFO:sagemaker:Creating training-job with name: hf-sum-finetuning-2020-08-05-23-18-54-530


2020-08-05 23:18:56 Starting - Starting the training job...
2020-08-05 23:18:58 Starting - Launching requested ML instances......
2020-08-05 23:20:04 Starting - Preparing the instances for training.........
2020-08-05 23:21:51 Downloading - Downloading input data
2020-08-05 23:21:51 Training - Downloading the training image......
2020-08-05 23:22:55 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-08-05 23:22:56,941 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-08-05 23:22:56,966 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-08-05 23:22:56,970 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-08-05 23:22:57,346 sagemaker-containers INFO     Module default_user_module_name does not p