In [1]:
import os
import logging
from pathlib import Path
from multiprocessing import cpu_count
import tensorflow as tf
from utils.dist_utils import is_sm_dist
from models import resnet, darknet, hrnet
from engine.schedulers import WarmupScheduler
from engine.optimizers import MomentumOptimizer
from datasets import create_dataset, parse
from engine.trainer import Trainer
# Select the data-parallel backend. Both libraries are imported under the same
# alias so the rest of the notebook can call `dist.rank()`, `dist.size()`, etc.
# without caring which one is active.
# NOTE(review): `is_sm_dist` is a project helper; presumably it detects a
# SageMaker distributed-training environment — confirm in utils.dist_utils.
if is_sm_dist():
    import smdistributed.dataparallel.tensorflow as dist
else:
    import horovod.tensorflow as dist
# Initialise the backend before any rank/size query is made in later cells.
dist.init()

In [2]:
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import nvidia.dali.tfrecord as tfrec
import numpy as np
import nvidia.dali.plugin.tf as dali_tf

In [3]:
# Precision / compiler switches read by the TensorFlow setup cells.
fp16 = True  # mixed precision: gates the loss-scaling optimizer wrapper
xla = True   # forwarded to tf.config.optimizer.set_jit
tf32 = True  # TensorFloat-32 toggle; the call that would consume it is commented out

In [4]:
# Turn XLA JIT compilation on or off according to the `xla` flag.
tf.config.optimizer.set_jit(xla)

# Disabled tuning knobs, kept for reference only:
#tf.config.threading.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
#tf.config.threading.inter_op_parallelism_threads = max(2, cpu_count()//dist.local_size()-2)
#tf.config.optimizer.set_experimental_options({"auto_mixed_precision": fp16})
#tf.config.experimental.enable_tensor_float_32_execution(tf32)

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Let TensorFlow grow GPU memory on demand on every physical device.
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
    # Expose only the GPU whose index matches this process's local rank.
    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')

In [5]:
# Switch Keras to mixed precision only when the `fp16` flag asks for it.
# The optimizer is wrapped in a LossScaleOptimizer under the same flag, so
# gating the policy here keeps the two in sync: with fp16 disabled the model
# stays in float32 instead of running float16 without loss scaling.
if fp16:
    policy = tf.keras.mixed_precision.Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(policy)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0


In [6]:
# --- Data and output locations -------------------------------------------
train_data_dir = '/home/ubuntu/data/imagenet/tfrecord/train/'
train_idx_dir = '/home/ubuntu/data/imagenet/tfrecord/trainidx/'  # index files for the DALI reader
validation_data_dir = '/home/ubuntu/data/imagenet/tfrecord/validation/'
model_dir = '/home/ubuntu/models'

# --- Dataset facts -------------------------------------------------------
train_dataset_size = 1281167
num_classes = 1000

# --- Optimisation --------------------------------------------------------
batch_size = 512  # per-worker; multiplied by dist.size() when deriving steps
num_epochs = 125
schedule = 'cosine'  # NOTE(review): not read by any cell visible in this notebook
learning_rate = 0.01
momentum = 0.9

# --- Regularisation ------------------------------------------------------
label_smoothing = 0.1
l2_weight_decay = 1e-5
mixup_alpha = 0.2

# --- Derived step counts -------------------------------------------------
# One epoch is the number of global batches (all workers combined) that fit
# into the training set; a trailing partial batch is dropped by the floor
# division.
steps_per_epoch = train_dataset_size // (batch_size * dist.size())
iterations = num_epochs * steps_per_epoch

In [7]:
#list(Path(train_idx_dir).glob('*'))

In [8]:
# Per-channel mean / standard deviation (R, G, B) used to normalise the
# decoded images in `crop_mirror_normalize` below.
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_R_STD = 58.393
_G_STD = 57.12
_B_STD = 57.375

# One DALI pipeline per process, bound to the GPU matching the local rank.
pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=dist.local_rank())

with pipe:
    inputs = fn.readers.tfrecord(
        # Both lists are sorted so record files and index files pair up by
        # position (assumes the index files follow the record files' naming
        # order — TODO confirm).
        path=sorted([i.as_posix() for i in Path(train_data_dir).glob('train*')]),
        index_path=sorted([i.as_posix() for i in Path(train_idx_dir).glob('*.idx')]),
        # Give each worker its own shard of the data. Without this every
        # rank reads the same records, while `steps_per_epoch` already
        # assumes the dataset is split across `dist.size()` workers.
        shard_id=dist.rank(),
        num_shards=dist.size(),
        features={
            "image/encoded" : tfrec.FixedLenFeature((), tfrec.string, ""),
            "image/class/label": tfrec.FixedLenFeature([1], tfrec.int64,  -1)
        })
    jpegs = inputs["image/encoded"]
    # "mixed" decoding: input is read on the CPU, decoded output lives on the GPU.
    images = fn.decoders.image(jpegs, device="mixed", output_type=types.RGB)
    resized = fn.resize(images, device="gpu", resize_shorter=256.)
    output = fn.crop_mirror_normalize(
        resized,
        dtype=types.FLOAT,
        crop=(224, 224),
        mean=[_R_MEAN, _G_MEAN, _B_MEAN],
        std=[_R_STD, _G_STD, _B_STD],
        output_layout='HWC')
    # Use the configured class count rather than a second hard-coded 1000.
    # NOTE(review): assumes stored labels lie in [0, num_classes); confirm
    # against how the TFRecords were written (some converters are 1-based).
    labels = fn.one_hot(inputs["image/class/label"].gpu(), dtype=types.FLOAT, num_classes=num_classes)
    pipe.set_outputs(output, labels)

# Static output signature handed to DALIDataset: (images, one-hot labels).
shapes = (
    (batch_size, 224, 224, 3),
    (batch_size, num_classes))
dtypes = (
    tf.float32,
    tf.float32)

def dataset_fn():
    """Wrap the module-level DALI pipeline in a ``DALIDataset``.

    The dataset is created under the TensorFlow device scope of the GPU whose
    index equals this process's local rank, and the same index is passed to
    DALI as ``device_id``.

    Returns:
        The ``dali_tf.DALIDataset`` built from ``pipe`` with the module-level
        ``batch_size``, ``shapes`` and ``dtypes``.
    """
    local_gpu = dist.local_rank()
    with tf.device(f"/gpu:{local_gpu}"):
        return dali_tf.DALIDataset(
            pipeline=pipe,
            batch_size=batch_size,
            output_shapes=shapes,
            output_dtypes=dtypes,
            device_id=local_gpu,
        )

In [9]:
# Build the DALI-backed training dataset.
# NOTE(review): the training loop further down feeds `train_data` (from
# `create_dataset`), not this object — confirm which input path is intended.
train_data_dali = dataset_fn()

In [10]:
# Backbone network. A deeper alternative tried earlier:
#   resnet.ResNet152V1_d(weights=None, weight_decay=l2_weight_decay, classes=num_classes)
model = resnet.ResNet50V1_b(
    weights=None,
    weight_decay=l2_weight_decay,
    classes=num_classes,
)

# Learning-rate schedule: a cosine decay whose first (and only) period spans
# all `iterations` training steps, wrapped in a 500-step warmup that starts at
# one tenth of the base learning rate.
scheduler = WarmupScheduler(
    scheduler=tf.keras.experimental.CosineDecayRestarts(
        initial_learning_rate=learning_rate,
        first_decay_steps=iterations,
        t_mul=1,
        m_mul=1,
    ),
    initial_learning_rate=learning_rate / 10,
    warmup_steps=500,
)

# Nesterov momentum SGD; under fp16 it is wrapped with a fixed loss scale of 128.
# (Graph-rewrite alternative, unused:
#   tf.train.experimental.enable_mixed_precision_graph_rewrite(opt, loss_scale=128.))
opt = MomentumOptimizer(learning_rate=scheduler, momentum=momentum, nesterov=True)
if fp16:
    opt = tf.keras.mixed_precision.LossScaleOptimizer(opt, dynamic=False, initial_scale=128)

# Cross-entropy on logits against one-hot targets, with label smoothing,
# averaged over the batch.
loss_func = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True,
    label_smoothing=label_smoothing,
    reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
)

In [11]:
# Bind `logger` on every rank. Later cells pass it to the Trainer on all
# workers, so defining it only inside the rank-0 branch raises NameError on
# the other ranks. Only rank 0 attaches the file handler; elsewhere the
# logging module's default WARNING threshold drops INFO/DEBUG records.
logger = logging.getLogger('logger')

if dist.rank() == 0:
    path_logs = os.path.join(os.getcwd(), model_dir, 'log.csv')
    # The directory must exist before basicConfig opens the log file.
    os.makedirs(model_dir, exist_ok=True)
    logging.basicConfig(filename=path_logs,
                            filemode='a',
                            format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.DEBUG)
    logging.info("Training Logs")
    # logger.info('Training options: %s', FLAGS)

# Barrier: the allreduce only returns once every worker has reached it, so no
# rank runs ahead while rank 0 is still setting up logging.
_ = dist.allreduce(tf.constant(0))

In [12]:
# Build the training and validation inputs with the project's `create_dataset`
# helper, both using the 'resnet' preprocessing option and the same batch size.
# NOTE(review): what `train=True/False` changes (shuffling, augmentation,
# sharding) is defined in `datasets` and is not visible here.
train_data = create_dataset(train_data_dir, batch_size, preprocessing='resnet', train=True)
validation_data = create_dataset(validation_data_dir, batch_size, preprocessing='resnet', train=False)

In [13]:
# Manual iterator over the training data, used only to peek at a batch; the
# training loop passes `train_data` itself to the trainer.
train_iterator = iter(train_data)

In [14]:
# Pull one batch for inspection. This rebinds `images` and `labels`, names the
# DALI pipeline cell also assigned while building its graph.
images, labels = next(train_iterator)

In [15]:
# Point the trainer at the configured `model_dir` (the directory the logging
# cell creates and writes to) instead of a separate literal '~/models/':
# Python and TensorFlow file APIs do not expand '~', so the literal would be
# treated as a directory named '~' relative to the working directory.
# os.path.join(..., '') keeps the trailing separator of the original argument.
trainer = Trainer(model, opt, loss_func, scheduler, logging=logger, fp16=fp16,
                  mixup_alpha=mixup_alpha, model_dir=os.path.join(model_dir, ''))

In [16]:
# Run `num_epochs` training epochs over `train_data`. `epoch` is not used in
# the loop body, and no validation pass is run between epochs.
for epoch in range(num_epochs):
    trainer.train_epoch(train_data)

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
step: 0, step time: 0.9362, train_loss: 7.4534, top_1_accuracy: 0.0000, learning_rate: 0.0010
step: 50, step time: 0.5165, train_loss: 7.4317, top_1_accuracy: 0.0020, learning_rate: 0.0019
step: 100, step time: 0.2431, train_loss: 7.4339, top_1_accuracy: 0.0000, learning_rate: 0.0028
step: 150, step time: 0.2432, train_loss: 7.4172, top_1_accuracy: 0.0039, learning_rate: 0.0037
step: 200, step time: 0.2428, train_loss: 7.3969, top_1_accuracy: 0.0059, learning_rate: 0.0046
step: 250, step time: 0.2427, train_loss: 7.3805, t

KeyboardInterrupt: 

In [31]:
# Inspect the model's `losses` collection; the captured output is a list of
# scalar float32 tensors (presumably the per-layer weight-decay terms — confirm).
trainer.model.losses

[<tf.Tensor: shape=(), dtype=float32, numpy=0.012518508>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.012889975>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.025611505>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0012802852>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0012910651>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.005134337>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0050612004>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.001291969>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0012933591>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0051411>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0012700263>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0012805163>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0050955014>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0025513356>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0025572693>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.010240554>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.010225231>,
 <tf.Te

In [None]:
# Run one validation pass over `validation_data`, tagged 'epoch_1'.
# This cell has no execution count, i.e. it was not run in the saved session.
trainer.validation_epoch(validation_data, output_name='epoch_1')

In [1]:
from pathlib import Path

In [3]:
# Scratch check: `.stem` of a path with no file suffix is its final component,
# here 'train'.
Path('/opt/ml/input/data/train').stem

'train'