In [1]:
# %%capture
# import IPython
# !conda install -c conda-forge ipywidgets -y
# IPython.Application.instance().kernel.do_shutdown(True)

In [2]:
# %pip install datasets[s3]

In [62]:
from sagemaker.estimator import Estimator

from datasets import Dataset
from transformers import AutoTokenizer
from datasets.filesystems import S3FileSystem
from sagemaker.huggingface import HuggingFace
from sagemaker.huggingface.model import HuggingFaceModel
from sagemaker import TrainingJobAnalytics
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

In [73]:
import os
import sagemaker
import pandas as pd
import awswrangler as wr
import numpy as np
import botocore
import string
import random
import re
import gc
import torch
from datetime import datetime
import awswrangler as wr

import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

sess = sagemaker.Session()
sagemaker_session_bucket = 'sagemaker-godeltech'
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

#put SageMaker role here if you're running this notebook locally
role = 


sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker bucket: sagemaker-godeltech
sagemaker session region: eu-west-1


In [74]:
 ##HERE WILL BE THE VARIABLES
SEED = 1234
TODAY = datetime.today().strftime("%Y%m%d")
TRAIN_PATH = f"s3://{sagemaker_session_bucket}/data/train/train.csv"
VAL_PATH = f"s3://{sagemaker_session_bucket}/data/validate/validate.csv"
TEST_PATH = f"s3://{sagemaker_session_bucket}/data/test/test.csv"
S3_PREFIX = "transformers"
CHECKPOINT_URI = f's3://{sagemaker_session_bucket}/{S3_PREFIX}/checkpoints'
OUTPUT_PATH = f's3://{sagemaker_session_bucket}/{S3_PREFIX}/outputs_{TODAY}'

MODEL = "distilbert-base-uncased"
INSTANCE = "ml.g5.xlarge" # G5 is a NVIDIA A10G

In [75]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()
gc.collect()
torch.cuda.empty_cache()

In [76]:
train = wr.s3.read_csv([TRAIN_PATH])
val = wr.s3.read_csv([VAL_PATH])
test = wr.s3.read_csv([TEST_PATH])

In [77]:
train.shape, val.shape, test.shape

((1443900, 2), (360975, 2), (194641, 12))

In [50]:
train['toxicity'] = train['toxicity'].astype('int')
val['toxicity'] = val['toxicity'].astype('int')
test_text = test[['comment_text']]

In [51]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL, cache_dir = '../tmp/AutoTokenizer');

# create tokenization function
def tokenize(batch):
    return tokenizer(batch["comment_text"], padding="max_length", truncation=True)

# tokenize train and test datasets
train_dataset = Dataset.from_pandas(train).map(tokenize, batched=True)
val_dataset = Dataset.from_pandas(val).map(tokenize, batched=True)
test_dataset = Dataset.from_pandas(test_sample).map(tokenize, batched=True)

# set dataset format for PyTorch
train_dataset =  train_dataset.rename_column("toxicity", "labels")
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset = val_dataset.rename_column("toxicity", "labels")
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

  0%|          | 0/1444 [00:00<?, ?ba/s]

  0%|          | 0/361 [00:00<?, ?ba/s]

  0%|          | 0/195 [00:00<?, ?ba/s]

In [16]:
s3 = S3FileSystem()

# save train_dataset to S3
training_input_path = f's3://{sess.default_bucket()}/{S3_PREFIX}/train'
# train_dataset.save_to_disk(training_input_path,fs=s3)

# save val_dataset to S3
val_input_path = f's3://{sess.default_bucket()}/{S3_PREFIX}/validate'
# val_dataset.save_to_disk(val_input_path,fs=s3)

In [17]:
hyperparameters={
    "epochs": 1,                            # number of training epochs
    "train_batch_size": 32,                 # training batch size
    "model_name": MODEL,                    # name of pretrained model
    'do_train': True,
    'do_eval': True,
    'output_dir':'/opt/ml/checkpoints'
}

# # configuration for running training on smdistributed Data Parallel
# distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}

# define metrics definitions
metric_definitions=[
    {'Name': 'loss', 'Regex': "'loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'learning_rate', 'Regex': "'learning_rate': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_loss', 'Regex': "'eval_loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_f1', 'Regex': "'eval_f1': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_roc_auc', 'Regex': "'eval_roc_auc': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'epoch', 'Regex': "'epoch': ([0-9]+(.|e\-)[0-9]+),?"}
                ]

In [18]:
huggingface_estimator = HuggingFace(
        entry_point="transformers_trainer.py",      # fine-tuning script to use in training job
        source_dir="./",                            # directory where fine-tuning script is stored
        instance_type=INSTANCE,                     # instance type (parallelism supports only by P3-family of GPUs)
        instance_count=1,                           # number of instances
        role=role,                                  # IAM role used in training job to acccess AWS resources (S3)
        checkpoint_s3_uri=CHECKPOINT_URI,
        output_path=OUTPUT_PATH,
        use_spot_instances=True,
        save_steps = 5000,
        max_wait=6*60*60,                             # max_wait should be equal to or greater than max_run in seconds
        max_run=6*60*60,
        transformers_version = '4.12.3',            # the transformers version used in the training job
        pytorch_version      = '1.9.1',             # the pytorch_version version used in the training job
        py_version           = 'py38',              # the python version used in the training job
        metric_definitions=metric_definitions,
        hyperparameters=hyperparameters             # hyperparameters to use in training job
)

In [None]:
huggingface_estimator.fit({"train": training_input_path, "test": val_input_path}) 

In [16]:
# job which is going to be attached to the estimator
old_training_job_name='huggingface-pytorch-training-2022-09-20-07-31-55-400'

In [17]:
# attach old training job
huggingface_estimator_loaded = Estimator.attach(old_training_job_name)

 # container image used for training job
print(f"container image used for training job: \n{huggingface_estimator_loaded.image_uri}\n")

# s3 uri where the trained model is located
print(f"s3 uri where the trained model is located: \n{huggingface_estimator_loaded.model_data}\n")

# latest training job name for this estimator
print(f"latest training job name for this estimator: \n{huggingface_estimator_loaded.latest_training_job.name}\n")


2022-09-20 10:48:26 Starting - Preparing the instances for training
2022-09-20 10:48:26 Downloading - Downloading input data
2022-09-20 10:48:26 Training - Training image download completed. Training in progress.
2022-09-20 10:48:26 Uploading - Uploading generated training model
2022-09-20 10:48:26 Completed - Training job completed
container image used for training job: 
763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04

s3 uri where the trained model is located: 
s3://sagemaker-godeltech/transformers/outputs_20220920/huggingface-pytorch-training-2022-09-20-07-31-55-400/output/model.tar.gz

latest training job name for this estimator: 
huggingface-pytorch-training-2022-09-20-07-31-55-400



In [24]:
# Captured metrics can be accessed as a Pandas dataframe
df = TrainingJobAnalytics(training_job_name=huggingface_estimator_loaded.latest_training_job.name).dataframe()

In [26]:
pivot_df = df.pivot(index=['timestamp'], columns=['metric_name'])['value'].reset_index()
pivot_df.sort_values(by='loss').head(5)

metric_name,timestamp,epoch,eval_f1,eval_loss,eval_roc_auc,learning_rate,loss
28,5580.0,0.9,,,,5.17906,0.1109
32,6360.0,0.94,,,,2.938013,0.1115
31,6180.0,0.93,,,,3.498274,0.1116
30,5940.0,0.92,,,,4.058536,0.1129
35,6960.0,0.98,,,,1.257227,0.1134


In [104]:
?huggingface_model.transformer

[0;31mSignature:[0m
[0mhuggingface_model[0m[0;34m.[0m[0mtransformer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0minstance_count[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minstance_type[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrategy[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0massemble_with[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_path[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_kms_key[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maccept[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0menv[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_concurrent_transforms[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_payload[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtags[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    