In [1]:
import sagemaker.huggingface
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

from data_modules import TrainDataModule
from model import ModelForTokenClassification

In [None]:
import sagemaker

# Optional override: name of the S3 bucket SageMaker should use for uploaded
# data, models and logs. Leave it as None to use the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist yet).
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Rebuild the session pinned to the chosen bucket and fetch the role that is
# later handed to the training estimator.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
role = sagemaker.get_execution_role()

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [2]:
# Hub checkpoint that supplies the tokenizer, the config and the encoder weights.
model_name = "microsoft/deberta-v3-base"

# The three loads are independent of each other; all read the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
backbone = AutoModel.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

# Wrap the pretrained encoder in the project's token-classification model.
model = ModelForTokenClassification(backbone, config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT e

In [None]:
# Build the project's data module from the three CSV inputs and the tokenizer
# loaded earlier in the notebook.
data_module = TrainDataModule(
    tokenizer,
    notes_file="data/patient_notes.csv",
    feature_file="data/features.csv",
    annotation_file="data/train.csv",
)

# setup() hands back two datasets, unpacked here as the train and test splits.
train_dataset, test_dataset = data_module.setup()

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

# Filesystem handle used by save_to_disk to write directly to S3.
s3 = S3FileSystem()

# Key prefix inside the session bucket under which both splits are stored.
# NOTE(review): the prefix says "emotion" although the data comes from the
# patient-notes CSVs -- looks like a leftover name; confirm before renaming.
s3_prefix = 'samples/datasets/emotion'

# Target S3 URIs for the two splits (also reused later as training inputs).
train_data_dir = f's3://{sess.default_bucket()}/{s3_prefix}/train'
test_data_dir = f's3://{sess.default_bucket()}/{s3_prefix}/test'

# Upload the train split first, then the test split.
train_dataset.save_to_disk(train_data_dir, fs=s3)
test_dataset.save_to_disk(test_data_dir, fs=s3)

In [None]:
from sagemaker.huggingface import HuggingFace

# Settings handed to the training job as hyperparameters.
hyperparameters = dict(
    # batch sizes and gradient accumulation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    # optimisation schedule
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    num_train_epochs=5,
    # batching, evaluation and checkpointing behaviour
    group_by_length=True,
    evaluation_strategy="epoch",
    save_strategy="no",
)

In [None]:
# Estimator describing the remote training job: it runs train.py from
# ./scripts on a single ml.p3.2xlarge instance with the `hyperparameters` dict.
huggingface_estimator = HuggingFace(
    entry_point="train.py",
    source_dir="./scripts",
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    role=role,
    # Reuse the session built earlier so the job uses the same bucket that the
    # datasets were uploaded to, instead of an implicitly created session.
    sagemaker_session=sess,
    # The three versions must name a combination that is published as a
    # Hugging Face training container; "4.19.4" + "1.9.1" is not one, so the
    # image lookup fails. 4.17.0 / 1.10.2 / py38 is a published combination.
    transformers_version="4.17.0",
    pytorch_version="1.10.2",
    py_version="py38",
    hyperparameters=hyperparameters,
)

In [None]:
# Start the training job; each key names an input channel and maps it to the
# S3 location of the corresponding uploaded split.
huggingface_estimator.fit(
    inputs={"train": train_data_dir, "test": test_data_dir},
)