## Settings

In [None]:
# S3 bucket that holds the uploaded training code and the job outputs.
bucket = 'sagemaker-thesis-anomaly-detection'

# Key prefix (below "training/") shared by all jobs started from this notebook.
artifacts_path = 'ganomaly/master'
artifacts_path_s3 = 's3://' + bucket + '/training/' + artifacts_path

## Training

In [None]:
# Smoke-test train.py directly on this machine: 1 epoch, train_steps/eval_steps
# of 1, debug logging, MNIST with class 2 as the abnormal class.
# The flags mirror the keys of the `hyperparameters` dict defined further down
# (which spells the patience flag 'early_stopping' instead of
# 'early_stopping_patience' and uses booleans instead of y/n).
# NOTE(review): "trainig" in the three directory flags looks like a typo for
# "training" -- confirm against the directory layout before changing it.
!(python3 train.py \
  \
  --epochs 1 \
  --batch_size 64 \
  --learning_rate 0.0002 \
  --early_stopping_patience 100 \
  --reduce_lr_patience 0 \
  \
  --dataset_name mnist \
  --cache_path /tmp \
  --abnormal_class 2 \
  --image_size 32 \
  --image_channels 0 \
  --buffer_size 1000 \
  --shuffle y \
  --prefetch y \
  --random_brightness n \
  --random_crop n \
  --random_flip n \
  --repeat_dataset 0 \
  \
  --model_name ganomaly \
  --latent_size 100 \
  --intermediate_size 0 \
  --n_filters 64 \
  --n_extra_layers 0 \
  --w_adv 1 \
  --w_rec 50 \
  --w_enc 1 \
  \
  --train_steps 1 \
  --eval_steps 1 \
  --log_level debug \
  --debug y \
  \
  --data_dir ./trainig/data \
  --model_dir ./trainig/model \
  --output_data_dir ./trainig/output \
 )

In [None]:
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
import sagemaker
import subprocess
import os

In [None]:
# Default hyperparameters shared by the local and the cloud estimator below.
# The dict is assembled section by section; insertion order is preserved.
hyperparameters = {}

# --- training loop ---------------------------------------------------------
# NOTE(review): the direct train.py invocation earlier in this notebook passes
# --early_stopping_patience, while the key here is 'early_stopping' -- confirm
# the training script accepts this spelling.
hyperparameters.update({
    'epochs': 1,
    'batch_size': 64,
    'learning_rate': 0.0002,
    'early_stopping': 100,
    'reduce_lr_patience': 0,
})

# --- tf.data pipeline ------------------------------------------------------
hyperparameters.update({
    'dataset_name': 'mnist',
    'cache_path': '/tmp/tfdata',
    # abnormal_class: only valid for mnist, fashion_mnist, cifar10, cifar100 and stl10
    'abnormal_class': 2,
    'image_size': 32,
    # image_channels: only valid for MVTec AD
    'image_channels': 0,
    'buffer_size': 1000,
    'shuffle': True,
    'prefetch': True,
    'random_flip': False,
    'random_crop': False,
    'random_brightness': False,
    'repeat_dataset': 0,
})

# --- model -----------------------------------------------------------------
hyperparameters.update({
    # model_name: one of cae, cnae, cvae, ganomaly
    'model_name': 'ganomaly',
    'latent_size': 100,
    # intermediate_size: only valid for cvae
    'intermediate_size': 0,
    'n_filters': 64,
    'n_extra_layers': 0,
    # w_adv / w_rec / w_enc: only valid for GANomaly
    'w_adv': 1,
    'w_rec': 50,
    'w_enc': 1,
})

# --- debugging -------------------------------------------------------------
hyperparameters.update({
    'train_steps': 0,
    'eval_steps': 0,
    'log_level': 'info',
    'debug': False,
})

# --- input/output directories ----------------------------------------------
# Intentionally not part of the dict: data_dir, model_dir and output_data_dir
# are set through environment variables.

In [None]:
# Category names of the MVTec AD dataset that can be used as 'dataset_name'.
mvtec_ad_ds_name = [
    "bottle",
    "cable",
    "capsule",
    "carpet",
    "grid",
    "hazelnut",
    "leather",
    "metal_nut",
    "pill",
    "screw",
    "tile",
    "toothbrush",
    "transistor",
    "wood",
    "zipper",
]

In [None]:
# Regexes used to extract metrics from the training log output.
# All patterns are raw strings: sequences such as "\(" and "\." are not valid
# Python string escapes, so a plain literal triggers an invalid-escape warning
# even though the resulting pattern text is the same.
# NOTE(review): the generic 'loss' pattern is not anchored, so it also matches
# inside "ptp_loss: ..." / "min_loss: ..." -- confirm that is acceptable.
metric_definitions = [
    # ADModelEvaluator
    {'Name': 'best auc(roc)', 'Regex': r'Best Epoch [0-9]+: AUC\(ROC\): (.*?),'},
    {'Name': 'auc(roc)', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): (.*?),'},
    {'Name': 'ptp_loss', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): .*?, ptp_loss: (.*?),'},
    {'Name': 'min_loss', 'Regex': r'Curr Epoch [0-9]+: AUC\(ROC\): .*?, ptp_loss: .*?, min_loss: (.*)'},
    # Generic
    {'Name': 'loss', 'Regex': r'loss: ([0-9]+[\.,][0-9]+e?-?[0-9]*)'},
    {'Name': 'mean_squared_error', 'Regex': r'mean_squared_error: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'mean_absolute_error', 'Regex': r'mean_absolute_error: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'binary_crossentropy', 'Regex': r'binary_crossentropy: ([0-9]+[\.,][0-9]+)'},
    # Only CVAE
    {'Name': 'loss_total', 'Regex': r'loss_total: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'loss_kl', 'Regex': r'loss_kl: ([0-9]+[\.,][0-9]+)'},
    # Only GANomaly
    {'Name': 'loss_dis', 'Regex': r'loss_dis: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'loss_dis_fake', 'Regex': r'loss_dis_fake: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'loss_dis_real', 'Regex': r'loss_dis_real: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'loss_gen', 'Regex': r'loss_gen: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'loss_gen_adv', 'Regex': r'loss_gen_adv: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'loss_gen_enc', 'Regex': r'loss_gen_enc: ([0-9]+[\.,][0-9]+)'},
    {'Name': 'loss_gen_rec', 'Regex': r'loss_gen_rec: ([0-9]+[\.,][0-9]+)'},
]

In [None]:
# Local-mode smoke test: same defaults as `hyperparameters`, but limited to a
# single epoch with train_steps/eval_steps of 1.
local_hyperparameters = hyperparameters.copy()
local_hyperparameters['epochs']      = 1
local_hyperparameters['train_steps'] = 1
local_hyperparameters['eval_steps']  = 1


def _local_instance_type():
    """Return 'local_gpu' if `nvidia-smi` runs successfully, otherwise 'local'."""
    try:
        has_gpu = subprocess.call('nvidia-smi') == 0
    except OSError:
        # The binary does not exist on hosts without an NVIDIA driver;
        # subprocess.call then raises instead of returning a non-zero code.
        has_gpu = False
    return 'local_gpu' if has_gpu else 'local'


# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
# NOTE(review): entry_point is 'train_ganomaly.py' here, while the cloud
# estimator and the direct invocation in this notebook use 'train.py' --
# confirm which script the local run is supposed to exercise.
estimator = TensorFlow(
    base_job_name='TRAIN-GANomaly-{}-{}-{}-{}'.format(
        local_hyperparameters['dataset_name'],
        local_hyperparameters['image_size'],
        local_hyperparameters['image_channels'],
        local_hyperparameters['latent_size']),
    entry_point = 'train_ganomaly.py',
    source_dir = os.getcwd(),
    role = sagemaker.get_execution_role(),
    framework_version = '2.3.0',
    py_version = 'py37',
    hyperparameters = local_hyperparameters,
    # SDK v2 parameter names, matching the cloud estimator in this notebook
    # (the v1 spellings were train_instance_count / train_instance_type).
    instance_count = 1,
    instance_type = _local_instance_type(),
    #max_run = 5 * 24 * 60 * 60 # 5 days (v1 name: train_max_run)
    #code_location = "file://" + artifacts_path_local,
    #output_path = "file://" + artifacts_path_local
    code_location = artifacts_path_s3,
    output_path = artifacts_path_s3,
    metric_definitions=metric_definitions
)

In [None]:
# Start the estimator built in the preceding cell, using fit()'s default arguments.
estimator.fit()

In [None]:
# Hyperparameters for the cloud training job: start from the shared defaults
# and override the values that differ for an MVTec AD run.
cloud_hyperparameters = hyperparameters.copy()
cloud_hyperparameters.update({
    'epochs': 1000,
    'early_stopping': 200,
    'dataset_name': mvtec_ad_ds_name[0],
    'image_size': 128,
    'image_channels': 3,
    'random_flip': True,
    'random_crop': True,
    'repeat_dataset': 10,
    'model_name': 'cae',  # alternatives: 'cnae', 'cvae', 'ganomaly'
    'latent_size': 900,
    'intermediate_size': 0,
    'n_filters': 32,
})

# Instance price list, see https://aws.amazon.com/de/sagemaker/pricing/instance-types/
#   ml.p3.2xlarge    (cpus=8,  gpus=1xV100, ram=61,  gram=16)  3.823 USD per hour
#   ml.p2.xlarge     (cpus=4,  gpus=1xK80,  ram=61,  gram=12)  1.326 USD per hour
#   ml.g4dn.xlarge   (cpus=4,  gpus=1xT4,   ram=16,  gram=16)  0.658 USD per hour
#   ml.g4dn.2xlarge  (cpus=8,  gpus=1xT4,   ram=32,  gram=16)  0.94  USD per hour
#   ml.g4dn.4xlarge  (cpus=16, gpus=1xT4,   ram=64,  gram=16)  1.505 USD per hour
#   ml.g4dn.12xlarge (cpus=48, gpus=4xT4,   ram=192, gram=64)  4.89  USD per hour
instance_type = "ml.g4dn.xlarge"

# Hyperparameters that are encoded, in this order, into the job name.
job_name_fields = (
    'model_name', 'dataset_name', 'image_size',
    'image_channels', 'latent_size', 'n_filters',
)

# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
estimator = TensorFlow(
    entry_point = 'train.py',
    source_dir = os.getcwd(),
    role = sagemaker.get_execution_role(),
    framework_version = '2.3.0',
    py_version = 'py37',
    base_job_name = 'TRAIN-AD-{}-{}-{}-{}-{}-{}'.format(
        *(cloud_hyperparameters[field] for field in job_name_fields)
    ),
    hyperparameters = cloud_hyperparameters,
    instance_count = 1,
    instance_type = instance_type,
    # Disabled options kept for reference:
    #train_max_run = 5 * 24 * 60 * 60 # 5 days on "ml.g4dn.xlarge" -> 79.2 USD
    #code_location = "file://" + artifacts_path_local,
    #output_path = "file://" + artifacts_path_local
    code_location = artifacts_path_s3,
    output_path = artifacts_path_s3,
    metric_definitions = metric_definitions
)

In [None]:
# Submit the job for the estimator built in the preceding cell; wait=False is
# passed so the call is not meant to block until the job has finished.
estimator.fit(
    wait=False
)

In [None]:
# Search space for the tuning job: 3 latent sizes x 3 filter counts.
hyperparameter_ranges = {}
hyperparameter_ranges['latent_size'] = CategoricalParameter([300, 900, 1500])
hyperparameter_ranges['n_filters'] = CategoricalParameter([32, 48, 64])

# Disabled alternatives, kept for reference:
#   'batch_size':        CategoricalParameter([32, 64, 128, 256])   or IntegerParameter(16, 128)
#   'learning_rate':     CategoricalParameter([0.0001, 0.0002, 0.0004, 0.0008])   or ContinuousParameter(0.0002, 0.001)
#   'image_size':        CategoricalParameter([32, 64, 128])
#   'latent_size':       CategoricalParameter([300, 600, 900, 1200, 1500])   or CategoricalParameter([300, 600, 900])
#   'intermediate_size': CategoricalParameter([300, 600, 900])
#   'model_name':        CategoricalParameter(['cae', 'cnae', 'cvae', 'ganomaly'])

# Objective: the metric named 'best auc(roc)' in metric_definitions;
# objective_type may be 'Maximize' or 'Minimize'.
objective_metric_name, objective_type = 'best auc(roc)', 'Maximize'
tuner_metric_definitions = metric_definitions

In [None]:
# Name prefix of the tuning job, e.g. "TUNE-<model_name>-<dataset_name>".
tuning_job_name = 'TUNE-{}-{}'.format(
    cloud_hyperparameters['model_name'],
    cloud_hyperparameters['dataset_name']
)

# Tuning job around the current `estimator`; at most 9 training jobs,
# executed one at a time.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=tuner_metric_definitions,
    objective_type=objective_type,
    max_jobs=9,
    max_parallel_jobs=1,
    base_tuning_job_name=tuning_job_name
)

In [None]:
# Submit the tuning job configured in the preceding cell; wait=False is passed
# so the call is not meant to block until tuning has finished.
tuner.fit(
    wait=False
)

## Prediction

In [None]:
!(pip install --upgrade pip)
!(pip install --upgrade tensorflow==2.3.0)

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from packaging import version
# Fail fast when the installed TensorFlow is older than 2.3.
assert version.parse('2.3') <= version.parse(tf.version.VERSION), "Tensorflow 2.3 or greater required"

from datasets.mvtec_ad import get_labeled_dataset
from models.cae import CAE
from models.cnae import CNAE
from models.cvae import CVAE
from models.ganomaly import Generator
from utils.plot import imshow

In [None]:
def _training_run(job_name, model_name, dataset_name, latent_size, intermediate_size=None):
    """Build one entry of `trainings`.

    All recorded runs use a (128, 128, 3) input and 32 filters, so only the
    values that differ between runs are parameters. `intermediate_size` is
    added to the model arguments only when it is given.
    """
    model_args = {'input_shape': (128, 128, 3), 'latent_size': latent_size}
    if intermediate_size is not None:
        model_args['intermediate_size'] = intermediate_size
    model_args['n_filters'] = 32
    return {
        'job_name': job_name,
        'model_name': model_name,
        'dataset_name': dataset_name,
        'model_args': model_args,
    }


# Finished training/tuning jobs whose artifacts can be evaluated below.
# The comment above each entry is the best-epoch line recorded for that job.
trainings = [
    # Best Epoch 505: AUC(ROC): 0.97262, ptp_loss: 0.00555, min_loss: 0.00093
    _training_run('TRAIN-AD-cae-bottle-128-3-900-32-2020-10-27-15-04-51-584', 'cae', 'bottle', 900),
    # Best Epoch 192: AUC(ROC): 0.96786, ptp_loss: 0.00714, min_loss: 0.00158
    _training_run('TRAIN-AD-cnae-bottle-128-3-900-32-2020-10-28-13-58-31-220', 'cnae', 'bottle', 900),
    # Best Epoch 296: AUC(ROC): 0.97460, ptp_loss: 0.00789, min_loss: 0.00094
    _training_run('TRAIN-AD-cvae-bottle-128-3-300-32-2020-10-28-14-00-26-567', 'cvae', 'bottle', 300,
                  intermediate_size=900),
    # Best Epoch 199: AUC(ROC): 0.97262, ptp_loss: 0.16334, min_loss: 0.01880
    _training_run('TRAIN-AD-ganomaly-bottle-128-3-900-32-2020-10-27-15-05-05-426', 'ganomaly', 'bottle', 900),
    # Best Epoch 281: AUC(ROC): 0.99087, ptp_loss: 0.14609, min_loss: 0.00967
    _training_run('TUNE-ganomaly-bottle-201030-1353-001-481bda14', 'ganomaly', 'bottle', 1500),
    ##############################################################################
    # Best Epoch 1853: AUC(ROC): 0.97619, ptp_loss: 0.20830, min_loss: 0.01685
    _training_run('TRAIN-AD-ganomaly-bottle-128-3-900-32-2020-11-17-11-22-23-951', 'ganomaly', 'bottle', 900),
    # Best Epoch 2275: AUC(ROC): 0.85832, ptp_loss: 0.16517, min_loss: 0.03705
    _training_run('TRAIN-AD-ganomaly-cable-128-3-900-32-2020-11-17-11-23-07-556', 'ganomaly', 'cable', 900),
    # Best Epoch 293: AUC(ROC): 0.85162, ptp_loss: 0.11853, min_loss: 0.01167
    _training_run('TRAIN-AD-ganomaly-capsule-128-3-900-32-2020-11-17-15-08-38-457', 'ganomaly', 'capsule', 900),
    # Best Epoch 65: AUC(ROC): 0.23254, ptp_loss: 0.11752, min_loss: 0.03545, Note: labels inverted
    _training_run('TRAIN-AD-ganomaly-carpet-128-3-900-32-2020-11-17-15-09-27-957', 'ganomaly', 'carpet', 900),
    # Best Epoch 16: AUC(ROC): 0.00501, ptp_loss: 0.17825, min_loss: 0.16006, Note: labels inverted
    _training_run('TRAIN-AD-ganomaly-grid-128-3-900-32-2020-11-17-16-08-31-516', 'ganomaly', 'grid', 900),
    # Best Epoch 03: AUC(ROC): 0.20250, ptp_loss: 0.00666, min_loss: 0.21312, Note: labels inverted
    _training_run('TRAIN-AD-ganomaly-hazelnut-128-3-900-32-2020-11-17-16-08-38-855', 'ganomaly', 'hazelnut', 900),
]

In [None]:
# Training run to evaluate (index into `trainings`).
training = trainings[-2]

# Class used to rebuild the network for each 'model_name'; for GANomaly the
# Generator class is instantiated.
model_switcher = {'cae': CAE, 'cnae': CNAE, 'cvae': CVAE, 'ganomaly': Generator}

model_class = model_switcher[training['model_name']]
model_args = training['model_args']
model = model_class(**model_args)

# Build with an unspecified batch dimension in front of the image shape.
batch_input_shape = (None,) + tuple(model_args['input_shape'])
model.build(batch_input_shape)
model.summary()

In [None]:
# Download and unpack the model artifact(s) of the selected training job.
# job_name is interpolated into the S3 source and the local target path below;
# the first assignment is a manual toggle that the second one overrides.
job_name = '' # empty means all
job_name = training['job_name']
# Copy only files matching "*/model.tar.gz" from the job's S3 prefix to /tmp/ganomaly/<job_name>.
!(aws s3 cp {artifacts_path_s3}/{job_name} /tmp/ganomaly/{job_name} --recursive --exclude "*" --include "*/model.tar.gz")
# Extract every model.tar.gz found below /tmp/ganomaly into its own directory, then delete the archive.
# NOTE(review): IPython expands {expr} in "!" lines; ${f} / ${f%/*} are meant as shell
# parameter expansions here -- presumably they reach the shell untouched, confirm on the target IPython version.
!(for f in $(find /tmp/ganomaly -iname model.tar.gz); do echo "Extracting ${f}"; tar xf ${f} -C ${f%/*}; rm ${f}; done)

In [None]:
# GANomaly runs are loaded from ".../output/generator"; every other model
# type uses the bare ".../output/" prefix.
weights_name = 'generator' if training['model_name'] == 'ganomaly' else ''
weights_path = '/tmp/ganomaly/{}/output/{}'.format(training['job_name'], weights_name)
model.load_weights(weights_path)

In [None]:
def resize_image(image, label):
    """Resize `image` to the model's input height/width; `label` passes through unchanged."""
    target_size = training['model_args']['input_shape'][:2]
    return (tf.image.resize(image, target_size), label)


# Labelled test split of the selected category, resized to the model input
# size and cached in a file keyed by category, image size and channel count.
input_shape = training['model_args']['input_shape']
cache_file = '/tmp/tfdata_test_ds_{}_{}_{}.cache'.format(
    training['dataset_name'], input_shape[0], input_shape[-1]
)

test_ds = get_labeled_dataset(
    category=training['dataset_name'],
    split='test',
    image_channels=input_shape[-1],
    binary_labels=True
)
test_ds = test_ds.map(resize_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
test_ds = test_ds.cache(cache_file)

In [None]:
# Run the model over the whole test set in batches of 64.
test_batches = test_ds.batch(64)
predictions = model.predict(x=test_batches)
print(predictions.shape)

In [None]:
# Min-max normalise the predictions in place: subtract the minimum, then
# divide by the peak-to-peak range (max - min).
min_val = predictions.min()
ptp_val = predictions.max() - min_val
print("ptp_val:", ptp_val, "min_val:", min_val)

predictions -= min_val
predictions /= ptp_val

In [None]:
# Split the predictions by ground-truth label: label 1 goes to `a` (anomaly),
# everything else to `n` (normal).
anomaly_scores, normal_scores = [], []
for (image, label), score in zip(test_ds, predictions):
    target = anomaly_scores if label == 1 else normal_scores
    target.append(score)

a = np.array(anomaly_scores)
n = np.array(normal_scores)
print('anomaly: {}; normal: {};'.format(a.shape, n.shape))

In [None]:
# Histogram of the normalised predictions, one series for anomalous and one
# for normal samples, over the range [0, 1].
n_bins = 100
bin_width = 1.0 / n_bins
bins = np.arange(0.0, 1.0 + bin_width, bin_width)

# One x tick for every second bin edge.
tick_step = 1.0 / 50
xticks = np.arange(0.0, 1.0 + tick_step, tick_step)

plt.figure(figsize=(8,6))
plt.hist(a, bins=bins, alpha=0.5, label="anomaly")
plt.hist(n, bins=bins, alpha=0.5, label="normal")
plt.xlabel("Error", size=14)
plt.xticks(xticks, rotation = 90, size = 7)#, ha="right")
plt.ylabel("Count", size=14)
plt.title("Anomaly Detection Histogram", size=14)
plt.legend(loc='upper right')
# Name the file after the evaluated run instead of a fixed
# "ganomaly_bottle_..." so that other runs are neither mislabelled nor
# overwritten; a ganomaly/bottle run still yields the previous file name.
plt.savefig("{}_{}_histogram.png".format(training['model_name'], training['dataset_name']))

In [None]:
# Show every test image with its ground-truth label and its prediction.
for (image, label), score in zip(test_ds, predictions):
    verdict = "good" if label == 0 else "broken"
    caption = "label: {}, pred: {}".format(verdict, score)
    imshow(image, caption, greyscale=True)

## Latent space visualization (t-SNE)

In [None]:
from sklearn.manifold import TSNE

def plot_tsne_clusters(x, y=None, perplexity=30.0):
    """Project `x` to two dimensions with t-SNE and show it as a scatter plot.

    Args:
        x: array-like of samples, one row per sample.
        y: optional per-sample values used to colour the points. A colour bar
            is drawn only when `y` is given.
        perplexity: t-SNE perplexity; 30.0 is scikit-learn's default. It is
            lowered automatically for small inputs because scikit-learn
            requires it to be smaller than the number of samples.
    """
    x = np.asarray(x)
    print('min:', np.min(x), 'max:', np.max(x), 'mean:', np.mean(x), 'stddev:', np.std(x))

    # Keep perplexity strictly below n_samples (at least 1) so that small
    # inputs, e.g. 20 random latent vectors, can still be embedded.
    effective_perplexity = float(min(perplexity, max(len(x) - 1, 1)))
    tsne_space = TSNE(
        n_components=2, random_state=0, perplexity=effective_perplexity
    ).fit_transform(x)

    plt.figure(figsize=(12, 10))
    plt.scatter(tsne_space[:, 0], tsne_space[:, 1], c=y)
    if y is not None:
        # Without colour values there is nothing for the colour bar to map.
        plt.colorbar()
    plt.show()

# Only the CVAE is handled here: run the test set through net_enc, feed the
# result to net_var, and plot the resulting z coloured by the test labels.
if isinstance(model, CVAE):
    encoded = model.net_enc.predict(test_ds.batch(64))
    z_mean, z_log_var, z = model.net_var.predict(encoded)
    test_labels = [label for _, label in test_ds]
    plot_tsne_clusters(z, test_labels)

## Generate images from random latent vectors

In [None]:
# Number of latent vectors to sample and decode.
num_examples_to_generate = 20

# Only the CVAE is handled here: sample latent vectors from a standard
# normal distribution and decode them with net_dec.
if isinstance(model, CVAE):
    v = tf.random.normal(
        shape=[num_examples_to_generate, training['model_args']['latent_size']],
        mean=0.0, stddev=1.0
    )

    plot_tsne_clusters(v)

    # Clip the decoder output to [0, 1] before displaying it.
    predicted = tf.clip_by_value(model.net_dec.predict(v), 0.0, 1.0)
    print(predicted.shape)

    # 5 columns and at least 5 rows (the previous fixed layout); further rows
    # are added when more than 25 images are requested, because plt.subplot
    # raises for an index outside the grid.
    n_cols = 5
    n_rows = max(5, -(-num_examples_to_generate // n_cols))  # ceiling division

    plt.figure(figsize=(20,20))
    for i in range(num_examples_to_generate):
        plt.subplot(n_rows, n_cols, i+1)
        plt.imshow(predicted[i])
    plt.show()