# Development Environment and Permissions 

In [1]:
!pip install -q "sagemaker>=2.48.0" "transformers==4.12.3" "datasets[s3]==1.18.3" --upgrade

distutils: /opt/conda/include/python3.6m/UNKNOWN
sysconfig: /opt/conda/include/python3.6m[0m
user = False
home = None
root = None
prefix = None[0m


In [2]:
!pip install -q transformers torch
!pip install -q sentencepiece

distutils: /opt/conda/include/python3.6m/UNKNOWN
sysconfig: /opt/conda/include/python3.6m[0m
user = False
home = None
root = None
prefix = None[0m
distutils: /opt/conda/include/python3.6m/UNKNOWN
sysconfig: /opt/conda/include/python3.6m[0m
user = False
home = None
root = None
prefix = None[0m


In [3]:
import sagemaker.huggingface

## Permissions

_If you are going to use SageMaker in a local environment, you need access to an IAM role with the required permissions for SageMaker. You can find out more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html)._

In [4]:
import sagemaker

# Bootstrap session: only needed to look up the default bucket below.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs. Leave it as None to fall
# back to the session's default bucket; SageMaker creates that bucket
# automatically if it does not exist yet.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role that is later handed to the estimator.
role = sagemaker.get_execution_role()

# Re-create the session so that it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# The role ARN and bucket name are deliberately not printed; uncomment to inspect.
# print(f"sagemaker role arn: {role}")
# print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker session region: us-east-1


## Create an Estimator and start a training job

In [5]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters handed to the training job via the estimator.
hyperparameters = {
    "epochs": 1,
    "model_name": "neuroscience_to_dev_bio",
}

In [6]:
# Configure the Hugging Face training job: which script to run, on what
# hardware, and with which framework versions.
# NOTE(review): the run recorded in this notebook failed with
# "OSError: [Errno 28] No space left on device" while the Trainer was saving a
# checkpoint. Whether that is governed by volume_size or by the script's
# checkpoint settings cannot be told from here -- confirm before re-running.
huggingface_estimator = HuggingFace(
    # training code
    entry_point="neuroscience_to_dev_bio.py",
    source_dir="./scripts",
    hyperparameters=hyperparameters,
    # permissions
    role=role,
    # hardware
    instance_type="ml.g4dn.16xlarge",
    instance_count=1,
    volume_size=900,  # presumably GB, per the SageMaker SDK docs -- confirm
    # framework versions of the training container
    transformers_version="4.12",
    pytorch_version="1.9",
    py_version="py38",
)

In [7]:
# Start the training job. Note that fit() is called without arguments here,
# so no input data channels are passed from this notebook.
huggingface_estimator.fit()

2022-06-05 17:24:35 Starting - Starting the training job...
2022-06-05 17:24:59 Starting - Preparing the instances for trainingProfilerReport-1654449875: InProgress
.........
2022-06-05 17:26:19 Downloading - Downloading input data
2022-06-05 17:26:19 Training - Downloading the training image..........................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-06-05 17:30:41,365 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-06-05 17:30:41,384 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-06-05 17:30:41,390 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-06-05 17:30:41,847 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/python3.8 -m pip install -r requirements.txt[0m
[34mColle

UnexpectedStatusException: Error for Training job huggingface-pytorch-training-2022-06-05-17-24-35-310: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 134
ErrorMessage "OSError:
 [Errno 28] No space left on device  During handling of the above exception, another exception occurred: Traceback (most recent call last):   File "neuroscience_to_dev_bio.py", line 327, in <module> trainer.train() File "/opt/conda/lib/python3.8/site-packages/transformers/trainer.py", line 1391, in train self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) File "/opt/conda/lib/python3.8/site-packages/transformers/trainer.py", line 1495, in _maybe_log_save_evaluate self._save_checkpoint(model, trial, metrics=metrics) File "/opt/conda/lib/python3.8/site-packages/transformers/trainer.py", line 1587, in _save_checkpoint torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))   File "/opt/conda/lib/python3.8/site-packages/torch/serialization.py", line 380, in save return File "/opt/conda/lib/python3.8/site-packages/torch/serialization.py", line 259, in __exit__ self.file_like.write_end

## Deploying the endpoint

To deploy our endpoint, we call `deploy()` on our HuggingFace estimator object, passing in our desired number of instances and instance type.

In [None]:
# Deploy the trained model behind an endpoint and keep the returned predictor.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

Then, we use the returned predictor object to call the endpoint.

In [None]:
# Minimal request payload sent to the endpoint.
sentiment_input = {"inputs": "Testing"}

# Call the endpoint through the predictor and print whatever it returns.
print("Predicting...")
print(predictor.predict(sentiment_input))

Finally, we delete the endpoint again.

In [None]:
# Remove the endpoint that the deploy() call created.
predictor.delete_endpoint()

# Extras

### Estimator Parameters

In [None]:
# Each print emits "<label>: ", the value on the next line, then a blank line
# (the empty final argument plus print's own newline).

# Container image the training job ran in.
print("container image used for training job: ", huggingface_estimator.image_uri, "", sep="\n")

# S3 URI of the trained model artifact.
print("s3 uri where the trained model is located: ", huggingface_estimator.model_data, "", sep="\n")

# Name of the most recent training job started from this estimator.
print("latest training job name for this estimator: ", huggingface_estimator.latest_training_job.name, "", sep="\n")



In [None]:
# Fetch the logs of the estimator's latest training job through its session.
huggingface_estimator.sagemaker_session.logs_for_job(
    huggingface_estimator.latest_training_job.name
)

### Attach an old training job to an estimator

In SageMaker, you can attach an old training job to an estimator to continue training, get results, etc.

In [None]:
from sagemaker.estimator import Estimator

# Name of the existing training job to attach to the estimator.
# Left empty here; fill it in before running the attach cell.
old_training_job_name = ""

In [None]:
# Build an estimator object from the existing training job whose name is held
# in old_training_job_name.
huggingface_estimator_loaded = Estimator.attach(old_training_job_name)

# Last expression of the cell: displays the attached estimator's model_data,
# i.e. the S3 location of the model output produced by that training job.
huggingface_estimator_loaded.model_data