In [1]:
%%capture
!pip install -U sagemaker

In [2]:
%%capture
!pip install "sagemaker>=2.48.0" "transformers==4.6.1" "datasets[s3]==1.6.2" --upgrade
#!apt install git-lfs

In [8]:
import sagemaker
from sagemaker import get_execution_role
import sagemaker.huggingface

In [9]:
# SageMaker session object used by the cells below (region lookup, etc.).
sess = sagemaker.Session()
# Execution role of this notebook; passed to the estimator as `role`.
role = get_execution_role()

In [10]:
# Report the role ARN and the region of the current session, one per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)
# Default-bucket print kept commented out:
# print(f"sagemaker bucket: {sess.default_bucket()}")

sagemaker role arn: arn:aws:iam::197614225699:role/bi-sagemaker-access
sagemaker session region: us-east-1


In [11]:
# Restore the variable `bucket` from IPython's %store database.
# It must have been saved earlier with `%store bucket`; the cells below
# build their S3 URIs from it.
%store -r bucket

In [12]:
# S3 locations of the training and validation data under the restored bucket.
training_input_path, val_input_path = (
    f's3://{bucket}/processing_output/train_data',
    f's3://{bucket}/processing_output/validation_data',
)

In [13]:
# Bare expression: displays the validation-data S3 URI as the cell output.
val_input_path

's3://az-ade-197614225699/processing_output/validation_data'

### Set up the Hugging Face training job

In [14]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters handed to the training job (consumed by the entry-point script).
hyperparameters = dict(
    epochs=10,
    train_batch_size=16,
    model_name='distilbert-base-uncased',
    do_eval=True,
    load_best_model_at_end=True,
)

# Distribution settings: enable the smdistributed data-parallel option.
distribution = dict(
    smdistributed=dict(
        dataparallel=dict(enabled=True),
    ),
)


In [15]:
# # instance configurations (unused alternative; the estimator cell sets these values inline)
# instance_type='ml.p3dn.24xlarge'
# instance_count=1
# volume_size=1000

In [16]:
# Estimator describing the training job: which script to run, on what
# hardware, with which framework versions, and where the output goes.
huggingface_estimator = HuggingFace(
    # --- training code and its arguments ---
    source_dir='./scripts',
    entry_point='train.py',
    hyperparameters=hyperparameters,
    # --- framework versions ---
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',
    # --- compute ---
    instance_type='ml.p3.16xlarge',
    instance_count=1,
    volume_size=1000,
    distribution=distribution,
    # --- permissions, job naming and output location ---
    role=role,
    base_job_name="az-ade-training",
    output_path=f's3://{bucket}/training_output/',
    disable_profiler=True,
)

In [None]:
import time

# Measure elapsed time with a monotonic clock. time.time() follows the system
# clock, which can be adjusted while a long-running job is in progress and
# would then distort (or even make negative) the computed duration.
start = time.perf_counter()


# Start the training job with the uploaded datasets as the 'train' and 'val'
# input channels. With the default wait behaviour the call streams the job
# logs and returns once the job has ended.
huggingface_estimator.fit({'train': training_input_path, 'val': val_input_path})

end = time.perf_counter()

# Duration of the fit() call, in seconds.
print(end - start)

2021-11-18 18:46:36 Starting - Starting the training job...
2021-11-18 18:46:48 Starting - Launching requested ML instances.........
2021-11-18 18:48:36 Starting - Preparing the instances for training.........
2021-11-18 18:50:01 Downloading - Downloading input data...
2021-11-18 18:50:28 Training - Downloading the training image.................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-11-18 18:53:16,843 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-11-18 18:53:16,921 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-11-18 18:53:19,947 sagemaker_pytorch_container.training INFO     Invoking SMDataParallel[0m
[34m2021-11-18 18:53:19,948 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-11-18 18:53:20,433 sagemaker-training-toolkit INFO   


2021-11-18 18:53:16 Training - Training image download completed. Training in progress.[34m[1,7]<stdout>:2021-11-18 18:53:26,248 - __main__ - INFO -  loaded train_dataset length is: 14627[0m
[34m[1,7]<stdout>:2021-11-18 18:53:26,249 - __main__ - INFO -  loaded val_dataset length is: 3134[0m
[34m[1,1]<stdout>:2021-11-18 18:53:26,248 - __main__ - INFO -  loaded train_dataset length is: 14627[0m
[34m[1,1]<stdout>:2021-11-18 18:53:26,249 - __main__ - INFO -  loaded val_dataset length is: 3134[0m
[34m[1,2]<stdout>:2021-11-18 18:53:26,248 - __main__ - INFO -  loaded train_dataset length is: 14627[0m
[34m[1,2]<stdout>:2021-11-18 18:53:26,249 - __main__ - INFO -  loaded val_dataset length is: 3134[0m
[34m[1,3]<stdout>:2021-11-18 18:53:26,248 - __main__ - INFO -  loaded train_dataset length is: 14627[0m
[34m[1,3]<stdout>:2021-11-18 18:53:26,249 - __main__ - INFO -  loaded val_dataset length is: 3134[0m
[34m[1,4]<stdout>:2021-11-18 18:53:26,248 - __main__ - INFO -  loaded train

[34m[1,0]<stdout>:{'eval_loss': 0.28335246443748474, 'eval_accuracy': 0.8857689853222719, 'eval_f1': 0.7055921052631579, 'eval_precision': 0.7447916666666666, 'eval_recall': 0.6703125, 'eval_runtime': 2.5723, 'eval_samples_per_second': 1218.387, 'epoch': 1.0}[0m
[34m[1,0]<stdout>:{'eval_loss': 0.1827087551355362, 'eval_accuracy': 0.923739629865986, 'eval_f1': 0.798990748528175, 'eval_precision': 0.8652094717668488, 'eval_recall': 0.7421875, 'eval_runtime': 2.5453, 'eval_samples_per_second': 1231.274, 'epoch': 2.0}[0m
[34m[1,0]<stdout>:{'eval_loss': 0.16968166828155518, 'eval_accuracy': 0.9320357370772177, 'eval_f1': 0.8418708240534521, 'eval_precision': 0.801980198019802, 'eval_recall': 0.8859375, 'eval_runtime': 2.5507, 'eval_samples_per_second': 1228.661, 'epoch': 3.0}[0m
[34m[1,0]<stdout>:{'eval_loss': 0.2614574730396271, 'eval_accuracy': 0.9042756860242501, 'eval_f1': 0.8007968127490039, 'eval_precision': 0.6963048498845266, 'eval_recall': 0.9421875, 'eval_runtime': 2.5181, 

In [19]:
# Print the S3 URI of the model artifact (model.tar.gz) produced by the training job.
print(huggingface_estimator.model_data)

s3://az-ade-197614225699/training_output/az-ade-training-2021-11-18-18-46-36-028/output/model.tar.gz


### Save training job name for next session

In [20]:
# Name of the latest training job launched from this estimator.
training_job_name = huggingface_estimator.latest_training_job.name
# Bare expression: displays the job name as the cell output.
training_job_name

'az-ade-training-2021-11-18-18-46-36-028'

In [21]:
# Persist `training_job_name` in IPython's %store database so that a later
# session can restore it with `%store -r training_job_name`.
%store training_job_name

Stored 'training_job_name' (str)


In [2]:
#### Upload the fine-tuned model to huggingface.co

In [22]:
import sagemaker.huggingface
import botocore
from datasets.filesystems import S3FileSystem

In [23]:
import sagemaker
# Print the version of the SageMaker SDK loaded in this kernel.
print(sagemaker.__version__)

2.63.2


In [24]:
import sagemaker

# Create the SageMaker session once; it is reused for the bucket and region
# lookups below and by the model-download cell.
sess = sagemaker.Session()

# Execution role of this notebook.
role = sagemaker.get_execution_role()

print(f"sagemaker role arn: {role}")
# SageMaker session bucket -> used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::197614225699:role/bi-sagemaker-access
sagemaker bucket: sagemaker-us-east-1-197614225699
sagemaker session region: us-east-1


In [25]:
import os
import tarfile
from sagemaker.s3 import S3Downloader

# Local directory that receives the downloaded archive and its extracted contents.
local_path = 'my_destilbert_model_ADEs'

os.makedirs(local_path, exist_ok=True)

# Download the trained model archive from S3.
S3Downloader.download(
    s3_uri=huggingface_estimator.model_data,  # S3 URI where the trained model is located
    local_path=local_path,  # local directory where model.tar.gz is saved
    sagemaker_session=sess,  # SageMaker session used for the S3 access
)

# Unpack the archive into local_path. The context manager guarantees the tar
# file handle is closed even if extraction raises; the archive is only
# removed after a successful extraction.
archive_path = f"{local_path}/model.tar.gz"
with tarfile.open(archive_path, "r:gz") as tar:
    tar.extractall(path=local_path)
os.remove(archive_path)

In [26]:
import json
import numpy as np
import pandas as pd
from datetime import datetime, date, timedelta
import os
import math
import warnings 
from datetime import datetime, date, timedelta

In [28]:
# # read eval and test results 
# with open(f"{local_path}/eval_results.json") as f:
#     eval_results_raw = json.load(f)
#     print(eval_results_raw)
#     df_results = pd.json_normalize(eval_results_raw)