# Subclass model API

In [1]:
import tensorflow as tf
import datetime
from common import create_dataset, SubclassModel
from custom_training import train_and_test

%load_ext tensorboard

In [2]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. Uses the stable `tf.config.list_*_devices` APIs rather than the
# deprecated `tf.config.experimental` aliases; `set_memory_growth` itself is
# still only exposed under `tf.config.experimental`.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized;
        # report the problem and carry on with the default allocation policy.
        print(e)

1 Physical GPUs, 1 Logical GPUs


2021-08-30 09:50:04.132592: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2021-08-30 09:50:04.145223: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2021-08-30 09:50:04.146384: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2021-08-30 09:50:04.148240: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the app

## データ取得

In [3]:
# Build the train / validation / test datasets with the helper imported from the local `common` module.
# NOTE(review): 512 and 0.2 look like a batch size and a validation-split fraction — confirm against create_dataset.
train_ds, valid_ds, test_ds = create_dataset(512, 0.2)

2021-08-30 09:50:06.312202: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 37632000 exceeds 10% of free system memory.
2021-08-30 09:50:06.405305: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 37632000 exceeds 10% of free system memory.


## 学習

In [4]:
# Instantiate the model class imported from the local `common` module.
model = SubclassModel()

In [5]:
# Attach an optimizer, loss and metric to the model. The custom training loop
# further down receives its own loss/optimizer objects; this configuration is
# what `model.evaluate` reports at the end of the notebook.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [6]:
# Keras TensorBoard callback logging to a timestamped directory under logs/fit.
# NOTE(review): no cell visible in this notebook passes `tensorboard_callback`
# to a training call — confirm whether it is still needed.
log_dir = "logs/fit/" + f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

2021-08-30 09:50:06.703700: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2021-08-30 09:50:06.703751: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2021-08-30 09:50:06.703799: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1614] Profiler found 1 GPUs
2021-08-30 09:50:06.704349: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-08-30 09:50:06.719168: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1666] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_NOT_INITIALIZED
2021-08-30 09:50:06.719421: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.


In [7]:
# Optimizer and loss objects that are handed to train_and_test below.
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

In [8]:
# Define our metrics
train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('train_accuracy')
valid_loss = tf.keras.metrics.Mean('valid_loss', dtype=tf.float32)
valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('valid_accuracy')

In [9]:
# Create one timestamped run directory under logs/gradient_tape, with separate
# `train` and `valid` sub-directories, and open a summary writer on each.
current_time = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
train_log_dir = f"logs/gradient_tape/{current_time}/train"
valid_log_dir = f"logs/gradient_tape/{current_time}/valid"
valid_summary_writer = tf.summary.create_file_writer(valid_log_dir)
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

In [10]:
# Run the training loop imported from the local `custom_training` module.
# Arguments are positional, so their order must stay exactly as below.
train_and_test(
    model,
    loss_object,
    optimizer,
    train_ds,
    valid_ds,
    20,  # NOTE(review): presumably the number of epochs — confirm against train_and_test
    train_loss,
    valid_loss,
    train_accuracy,
    valid_accuracy,
    train_summary_writer,
    valid_summary_writer,
)

2021-08-30 09:50:06.974695: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 37632000 exceeds 10% of free system memory.
2021-08-30 09:50:07.013549: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1, Loss: 0.7906481027603149, Accuracy: 0.7910833358764648, Test Loss: 0.353305459022522, Test Accuracy: 0.906333327293396


2021-08-30 09:50:12.398473: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 37632000 exceeds 10% of free system memory.


Epoch 2, Loss: 0.5594486594200134, Accuracy: 0.8508750200271606, Test Loss: 0.3159688413143158, Test Accuracy: 0.9141250252723694


2021-08-30 09:50:15.218924: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 37632000 exceeds 10% of free system memory.


Epoch 3, Loss: 0.4629693627357483, Accuracy: 0.8756735920906067, Test Loss: 0.2910187244415283, Test Accuracy: 0.9210000038146973
Epoch 4, Loss: 0.40600597858428955, Accuracy: 0.8903541564941406, Test Loss: 0.2726672887802124, Test Accuracy: 0.9257500171661377
Epoch 5, Loss: 0.3667554259300232, Accuracy: 0.9006583094596863, Test Loss: 0.2583067715167999, Test Accuracy: 0.9295833110809326
Epoch 6, Loss: 0.33719712495803833, Accuracy: 0.9084201455116272, Test Loss: 0.24706047773361206, Test Accuracy: 0.9325694441795349
Epoch 7, Loss: 0.31381210684776306, Accuracy: 0.9145714044570923, Test Loss: 0.236512690782547, Test Accuracy: 0.9351547360420227
Epoch 8, Loss: 0.2943815290927887, Accuracy: 0.919643223285675, Test Loss: 0.2275639772415161, Test Accuracy: 0.9373645782470703
Epoch 9, Loss: 0.27798768877983093, Accuracy: 0.9240139126777649, Test Loss: 0.2199205607175827, Test Accuracy: 0.9392129778862
Epoch 10, Loss: 0.26383641362190247, Accuracy: 0.9278125166893005, Test Loss: 0.2129170000

In [11]:
# Score the trained model on the test dataset using the loss and metrics set in model.compile().
model.evaluate(test_ds)



[0.10797687619924545, 0.9660000205039978]

In [12]:
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 4037), started 0:29:03 ago. (Use '!kill 4037' to kill it.)