In [1]:
from pathlib import Path
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm
from model.dali_pipe import dali_generator
from model.resnet import Resnet50
from model.lars import LARS
import horovod.tensorflow as hvd
import tensorflow_addons as tfa

In [2]:
# Start Horovod before anything else: hvd.local_rank() is needed further down to
# choose which GPU this process is allowed to see.
hvd.init()

# Training shards and their index files; both directories are filtered with the
# same '*1024' pattern.
data_dir = Path('/workspace/shared_workspace/data/imagenet/')
index_dir = Path('/workspace/shared_workspace/data/imagenet_index/')
# Path.glob() yields entries in arbitrary, filesystem-dependent order. The two
# lists are handed to dali_generator side by side, so sort both to keep them
# deterministic across runs/processes and aligned with each other.
# NOTE(review): assumes every index file sorts into the same position as its data
# shard (e.g. it shares the shard's file name) — confirm against the index dir.
train_files = sorted(i.as_posix() for i in data_dir.glob('*1024'))
train_index = sorted(i.as_posix() for i in index_dir.glob('*1024'))

batch_size = 128
image_count = 1282048
steps_per_epoch = image_count//batch_size
# Base rate of 0.01 per 256 images, scaled linearly to the batch size used here.
learning_rate = 0.01*batch_size/256
# Passed as the second argument of the learning-rate schedule built in a later cell.
scaled_rate = 3.7

# Make float16 the default Keras float type and raise the Keras fuzz factor to 1e-4
# alongside it (presumably because the default epsilon is too small to be useful
# in half precision — TODO confirm).
tf.keras.backend.set_floatx('float16')
tf.keras.backend.set_epsilon(1e-4)
# Turn on XLA JIT compilation.
tf.config.optimizer.set_jit(True)

# Let TensorFlow grow GPU memory on demand instead of reserving it up front, then
# expose only the GPU whose index equals this process's Horovod local rank.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
# mpirun -np 8 -H localhost:8 --bind-to none --allow-run-as-root python transfer_learning.py

In [4]:
# NOTE(review): WarmupExponentialDecay is neither imported nor defined in the cells
# visible here (there is no In [3] cell) — confirm where it comes from; on a fresh
# kernel this line raises NameError unless it is defined elsewhere.
# Learning-rate schedule. The meaning of the positional arguments depends on that
# class's signature — presumably (start rate, peak rate, warmup steps, decay steps,
# decay rate); TODO confirm.
scheduler = WarmupExponentialDecay(tf.cast(learning_rate, tf.float16), scaled_rate, steps_per_epoch, steps_per_epoch*10, 0.001)
# Training data source built from the shard and index file lists; the training loop
# pulls (images, labels) batches from it with next().
train_tf = dali_generator(train_files, train_index, batch_size)
model = Resnet50()
# LARS optimizer driven by the schedule above, with its use_nesterov and clip
# options switched off.
optimizer = LARS(scheduler, use_nesterov=False, clip=False)
# Sparse categorical cross-entropy with default settings. from_logits is left at its
# default, so this presumably expects the model to output probabilities rather than
# logits — confirm against Resnet50.
loss_func = tf.keras.losses.SparseCategoricalCrossentropy()

In [5]:
#@tf.function
def train_step(images, labels):
    """Run one optimisation step on a single batch and return its loss.

    The forward pass runs the module-level ``model`` in training mode, the loss
    comes from ``loss_func``, and the update is applied by ``optimizer``.

    Horovod is initialised at the top of the notebook and each process is pinned
    to its own GPU, so the gradients are averaged across all workers before they
    are applied. After the very first step the model and optimizer variables are
    broadcast from rank 0 so that every worker continues from the same state.
    With a single process both operations leave the values unchanged.

    Args:
        images: batch of input images, passed straight to ``model``.
        labels: matching labels, passed straight to ``loss_func``.

    Returns:
        The loss computed on this worker's batch (not averaged across workers).
    """
    with tf.GradientTape() as tape:
        pred = model(images, training=True)
        loss = loss_func(labels, pred)
    # Wrap the tape so that gradient() all-reduces (averages) gradients over workers;
    # without this every worker would train an independent copy of the model.
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # One-time synchronisation. It is done after the first apply_gradients call so
    # that lazily created model/optimizer variables already exist.
    # NOTE: the Python-side flag relies on eager execution (the tf.function
    # decorator above is commented out); revisit this if the decorator is enabled.
    if not getattr(train_step, '_initial_sync_done', False):
        hvd.broadcast_variables(model.variables, root_rank=0)
        # `variables` is a method on some optimizer classes and a property on
        # others; accept both, and skip quietly if the optimizer has none.
        opt_vars = getattr(optimizer, 'variables', None)
        if callable(opt_vars):
            opt_vars = opt_vars()
        if opt_vars:
            hvd.broadcast_variables(opt_vars, root_rank=0)
        train_step._initial_sync_done = True
    return loss

In [1]:
# Train for 30 epochs of `steps_per_epoch` batches each. The progress bar shows the
# mean of the most recent (up to 100) step losses of the current epoch.
for epoch in range(30):
    # Per-step losses; restarted at the beginning of every epoch.
    loss = []
    progressbar = tqdm(range(steps_per_epoch))
    for step in progressbar:
        images, labels = next(train_tf)
        step_loss = train_step(images, labels)
        # .numpy() pulls the loss value back to the host for display.
        loss.append(step_loss.numpy())
        recent_losses = np.asarray(loss[-100:])
        progressbar.set_description("train_loss: {0:.4f}".format(recent_losses.mean()))

'for epoch in range(30):\n    loss = []\n    progressbar = tqdm(range(steps_per_epoch))\n    for batch in progressbar:\n        images, labels = next(train_tf)\n        loss.append(train_step(images, labels).numpy())\n        progressbar.set_description("train_loss: {0:.4f}".format(np.array(loss[-100:]).mean()))'