In [None]:
# Runs train_ganomaly.py directly through the shell (outside SageMaker) on
# mnist with 1 epoch, 1 train step and 1 eval step, debug mode enabled and
# log level "debug" -- presumably a quick local sanity check of the script.
# NOTE(review): the directory name "trainig" in the three *_dir arguments
# looks like a typo of "training"; confirm before renaming, the paths are
# used at runtime.
!(python3 train_ganomaly.py \
  \
  --epochs 1 \
  --batch_size 64 \
  --learning_rate 0.0002 \
  \
  --dataset_name mnist \
  --cache_path /tmp \
  --abnormal_class 2 \
  --image_size 32 \
  --image_channels 0 \
  --buffer_size 1000 \
  --shuffle y \
  --prefetch y \
  --random_brightness n \
  --random_crop n \
  --random_flip n \
  --repeat_dataset 0 \
  \
  --latent_size 100 \
  --n_extra_layers 0 \
  --n_filters 64 \
  --w_adv 1 \
  --w_rec 50 \
  --w_enc 1 \
  \
  --train_steps 1 \
  --eval_steps 1 \
  --log_level debug \
  --debug y \
  \
  --data_dir ./trainig/data \
  --model_dir ./trainig/model \
  --output_data_dir ./trainig/output \
 )

In [None]:
from sagemaker.tensorflow import TensorFlow
import sagemaker
import subprocess
import os

In [None]:
# S3 bucket holding both the training artifacts and the datasets.
bucket = 'sagemaker-thesis-anomaly-detection'

# Key prefix below "training/"; the resulting URI is used as code_location
# and output_path of the estimators in this notebook.
artifacts_path = 'ganomaly/testing'
artifacts_path_s3 = f's3://{bucket}/training/{artifacts_path}'

# Key prefix below "datasets/"; the dataset name is a placeholder value.
dataset_name = 'currently_unused'
dataset_path_s3 = f's3://{bucket}/datasets/{dataset_name}'

In [None]:
# Baseline hyperparameters; the training cells copy this dict and override
# individual entries before handing it to the estimator.
hyperparameters = dict(
    # --- training ---
    epochs=1,
    batch_size=64,
    learning_rate=0.0002,

    # --- tf.data pipeline ---
    dataset_name='mnist',
    cache_path='/tmp/tfdata',
    abnormal_class=2,  # only valid for mnist, fashion_mnist, cifar10, cifar100 and stl10
    image_size=32,
    image_channels=0,  # only valid for MVTec AD
    buffer_size=1000,
    shuffle=True,
    prefetch=True,
    random_flip=False,
    random_crop=False,
    random_brightness=False,
    repeat_dataset=0,

    # --- model ---
    latent_size=100,
    n_filters=64,
    n_extra_layers=0,
    w_adv=1,
    w_rec=50,
    w_enc=1,

    # --- debugging ---
    # train_steps and eval_steps are intentionally not set in the baseline.
    log_level='info',
    debug=False,

    # --- input/output directories ---
    # data_dir, model_dir and output_data_dir are not passed as
    # hyperparameters; they are set through env vars.
)

In [None]:
# Local run: baseline hyperparameters limited to one epoch with a single
# training step and a single evaluation step.
local_hyperparameters = hyperparameters.copy()
local_hyperparameters['epochs']      = 1
local_hyperparameters['train_steps'] = 1
local_hyperparameters['eval_steps']  = 1

# Use the GPU local mode only when `nvidia-smi` runs and exits with status 0.
# subprocess.call raises OSError (FileNotFoundError) when the executable does
# not exist -- the usual situation on a CPU-only host -- so that case has to
# fall back to 'local' instead of aborting the cell.
try:
    gpu_available = subprocess.call('nvidia-smi') == 0
except OSError:
    gpu_available = False

# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
estimator = TensorFlow(
    entry_point = 'train_ganomaly.py',
    source_dir = os.getcwd(),
    role = sagemaker.get_execution_role(),
    framework_version = '2.3.0',
    py_version = 'py37',
    hyperparameters = local_hyperparameters,
    train_instance_count = 1,
    train_instance_type = 'local_gpu' if gpu_available else 'local',
    #train_max_run = 5 * 24 * 60 * 60 # 5 days
    #code_location = "file://" + artifacts_path_local,
    #output_path = "file://" + artifacts_path_local
    code_location = artifacts_path_s3,
    output_path = artifacts_path_s3,
    # Raw strings so that `\(` stays a regex escape; in a normal string
    # literal it is an invalid escape sequence and triggers a warning.
    metric_definitions=[
        {'Name': 'auc(roc)', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): (.*?),'},
        {'Name': 'ptp_loss', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): .*?, ptp_loss: (.*?),'},
        {'Name': 'min_loss', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): .*?, ptp_loss: .*?, min_loss: (.*)'}
    ]
)

# Channel mapping for the S3 dataset; currently NOT passed to fit().
inputs = {
    'data_dir' : dataset_path_s3
}

estimator.fit(
    #inputs
)

In [None]:
# Cloud run: baseline hyperparameters with the overrides below
# (100 epochs on the 'bottle' dataset at 128px with augmentation enabled).
cloud_hyperparameters = hyperparameters.copy()
cloud_hyperparameters['epochs']         = 100
cloud_hyperparameters['dataset_name']   = 'bottle'
cloud_hyperparameters['image_size']     = 128
cloud_hyperparameters['image_channels'] = 1
cloud_hyperparameters['random_flip']    = True
cloud_hyperparameters['random_crop']    = True
cloud_hyperparameters['repeat_dataset'] = 10
cloud_hyperparameters['latent_size']    = 300
cloud_hyperparameters['n_filters']      = 64

# https://aws.amazon.com/de/sagemaker/pricing/instance-types/
# ml.p3.2xlarge (cpus=8, gpus=1xV100, ram=61, gram=16) 3.823 USD per hour
# ml.p2.xlarge (cpus=4, gpus=1xK80, ram=61, gram=12) 1.326 USD per hour
# ml.g4dn.xlarge (cpus=4, gpus=1xT4, ram=16, gram=16) 0.658 USD per hour
# ml.g4dn.2xlarge (cpus=8, gpus=1xT4, ram=32, gram=16) 0.94 USD per hour
# ml.g4dn.4xlarge (cpus=16, gpus=1xT4, ram=64, gram=16) 1.505 USD per hour
# ml.g4dn.12xlarge (cpus=48, gpus=4xT4, ram=192, gram=64) 4.89 USD per hour
instance_type = "ml.g4dn.xlarge"

# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
estimator = TensorFlow(
    entry_point = 'train_ganomaly.py',
    source_dir = os.getcwd(),
    role = sagemaker.get_execution_role(),
    framework_version = '2.3.0',
    py_version = 'py37',
    hyperparameters = cloud_hyperparameters,
    train_instance_count = 1,
    train_instance_type = instance_type,
    #train_max_run = 5 * 24 * 60 * 60 # 5 days on "ml.g4dn.xlarge" -> 79.2 USD
    #code_location = "file://" + artifacts_path_local,
    #output_path = "file://" + artifacts_path_local
    code_location = artifacts_path_s3,
    output_path = artifacts_path_s3,
    # Raw strings so that `\(` stays a regex escape; in a normal string
    # literal it is an invalid escape sequence and triggers a warning.
    metric_definitions=[
        {'Name': 'auc(roc)', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): (.*?),'},
        {'Name': 'ptp_loss', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): .*?, ptp_loss: (.*?),'},
        {'Name': 'min_loss', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): .*?, ptp_loss: .*?, min_loss: (.*)'}
    ]
)

# Channel mapping for the S3 dataset; currently NOT passed to fit().
inputs = {
    'data_dir' : dataset_path_s3
}

# wait=False: fit() is called without waiting for the training job.
estimator.fit(
    #inputs,
    wait=False
)

In [None]:
# Training job names with the image/latent settings recorded for each job.
# Row layout: (job_name, image_size, image_channels, latent_size); the
# trailing comment notes the best epoch and its AUC(ROC).
trainings = [
    dict(zip(('job_name', 'image_size', 'image_channels', 'latent_size'), run))
    for run in (
        ('tensorflow-training-2020-10-05-11-41-25-079', 128, 1, 300),  # Best Epoch 97: AUC(ROC): 0.93294
        ('tensorflow-training-2020-10-05-12-10-30-685', 128, 3, 300),  # Best Epoch 92: AUC(ROC): 0.88611
        ('tensorflow-training-2020-10-05-13-42-52-527', 128, 3, 300),  # Best Epoch 95: AUC(ROC): 0.88651
        ('tensorflow-training-2020-10-05-16-00-31-786', 128, 1, 300),  # Best Epoch 96: AUC(ROC): 0.91865
    )
]