## Environment setup

In [1]:
import glob, os, time
from datetime import datetime
import importlib

import numpy as np

import tensorflow as tf

## Network definitions and adaptations

In [2]:
from architecture_definitions import *

# --- Top-level switches for this training run -------------------------------
oneOverPt = False
lut_nn = True
output_type = 0
last_input_is_bias = True

# --- Output head: neuron count of the last layer and the matching loss ------
if output_type != 1:
    output_cnt = 1
    layer3_neurons, loss_fn = 1, 'mae'
else:
    layer3_neurons, loss_fn = 3, custom_loss3

# --- Adaptations applied when the last input is not used as a bias ----------
# NOTE(review): these names are rebound in the notebook namespace only; whether
# the helpers imported from architecture_definitions see the new values depends
# on how those helpers read their configuration - confirm before relying on it.
if not last_input_is_bias:
    networkInputSize, layer2_lutRangesCnt, layer2_input_offset = nLayers, 1, None

# Suffix appended to the log/checkpoint directory names of this run.
dir_postfix = get_lut_nn_dir_postfix()

# Print the LUT NN definitions so they are recorded in the cell output.
print_LUT_NN()

LUT NN definitions:
layer1_lut_size 1024
layer2_lut_size 256
layer3_lut_size 32
layer2_lutRangesCnt 16
layer2_input_offset 8.0
------------------------


### Training data set preparation

In [3]:
import io_functions as io
importlib.reload(io)  # re-import so that edits to io_functions.py are picked up without a kernel restart

batchSize = 4096
nEpochs = 1  # forwarded to io.get_LUT_NN_dataset below

# Directory with the training TFRecord files. Only ONE location is active;
# previously used locations are kept as comments for reference. The former
# code globbed the 18_12_2020 directory first and then immediately overwrote
# the result, so that first lookup was dead code and has been turned into a comment.
#trainDataDir = "/scratch_ssd/akalinow/ProgrammingProjects/MachineLearning/OMTF/data/18_12_2020/"
#trainDataDir = "/home/kbunkow/cms_data/OMTF_data_2020/18_12_2020/"  # file pattern: 'OMTFHits_pats0x0003_oldSample_files_*_chunk_0.tfrecord.gzip'
#trainDataDir = "/home/kbunkow/cms_data/SingleMu/TFRecord"
#trainDataDir = "/eos/user/a/akalinow/Data/SingleMu/TFRecord/"
trainDataDir = "/scratch_cmsse/alibordi/data/training/"

# os.path.join keeps the pattern valid whether or not trainDataDir ends with a
# path separator (plain string concatenation silently matched nothing for a
# directory written without the trailing slash).
trainFileNames = glob.glob(os.path.join(trainDataDir, '*tfrecord.gzip'))

# Fail early with a clear message instead of building a dataset from no files.
if not trainFileNames:
    raise FileNotFoundError("No '*tfrecord.gzip' files found in " + trainDataDir)

dataset = io.get_LUT_NN_dataset(batchSize, nEpochs, trainFileNames, 
                                nRefLayers=nRefLayers,
                                layer1_lut_size=layer1_lut_size,
                                layer2_lut_size=layer2_lut_size,
                                layer2_lutRangesCnt=layer2_lutRangesCnt,
                                last_input_is_bias=last_input_is_bias,
                                rangeFactor=rangeFactor,
                                isTrain=True)

Reading data from files:
/scratch_cmsse/alibordi/data/training/SingleMu_OneOverPt_April4_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_iPtX_April4_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_iPtX_Feb15_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_OneOverPt_Feb15_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_OneOverPt_Feb22_chunk_0_filtered.tfrecord.gzip


2023-04-19 11:02:56.064247: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 11:02:56.070954: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 11:02:56.071158: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 11:02:56.071840: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

### Model definition

In [6]:
import model_functions as models
importlib.reload(models)  # re-import so that edits to model_functions.py take effect without a kernel restart

# Build the LUT network with the bias handling and loss selected in the
# configuration cell, then print its layer summary.
model = models.get_LUT_NN(
    last_input_is_bias=last_input_is_bias,
    loss_fn=loss_fn,
)
model.summary()

constructing LutInterLayer  layer1 lut_size 1024 num_inputs 19 num_outputs 16 input_offset 0 self.input_offset 0 last_input_is_bias True
write_lut_hist False hist_writer None
constructing LutInterLayer  layer2 lut_size 256 num_inputs 16 num_outputs 8 input_offset 8.0 self.input_offset 8.0 last_input_is_bias False
write_lut_hist False hist_writer None
constructing LutInterLayer  layer3 lut_size 32 num_inputs 8 num_outputs 1 input_offset None self.input_offset 15.5 last_input_is_bias False
write_lut_hist False hist_writer None
layer1 
LutInterLayer.build: luts_float: layer1.luts_float:0 shape (18, 1024, 16)
layer2 
LutInterLayer.build: luts_float: layer2.luts_float:0 shape (16, 256, 8)
layer3 
LutInterLayer.build: luts_float: layer3.luts_float:0 shape (8, 32, 1)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 layer1 (LutInterLayer)      (None, 16)                313344    
           

### The training loop

In [None]:
%%time
current_time = datetime.now().strftime("%Y_%b_%d_%H_%M_%S")
print("Training start. Current Time =", current_time)

nEpochs = 50

log_dir = "logs/fit/" + current_time + dir_postfix
job_dir = "training/" + current_time + dir_postfix

checkpoint_path = job_dir + "/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1,
                                                 save_freq = 5085)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=(10, 20))
early_stop_callback = tf.keras.callbacks.EarlyStopping(patience=5, verbose=1)
callbacks =  [tensorboard_callback, cp_callback, early_stop_callback]

model.save_weights(checkpoint_path.format(epoch=0))
   
model.fit(dataset.skip(10), 
          epochs=nEpochs, shuffle=True,
          callbacks=[tensorboard_callback, cp_callback],
          validation_data = dataset.take(10)
            )
model.save(job_dir, save_format='tf')

current_time = datetime.now().strftime("%Y_%b_%d_%H_%M_%S")
print("Training end. Current Time =", current_time)

Training start. Current Time = 2023_Apr_19_11_06_06


2023-04-19 11:06:06.256497: I tensorflow/core/profiler/lib/profiler_session.cc:99] Profiler session initializing.
2023-04-19 11:06:06.256521: I tensorflow/core/profiler/lib/profiler_session.cc:114] Profiler session started.
2023-04-19 11:06:06.256548: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1665] Profiler found 1 GPUs
2023-04-19 11:06:06.397810: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session tear down.
2023-04-19 11:06:06.399521: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1799] CUPTI activity buffer flushed


Epoch 1/50
     14/Unknown - 2s 24ms/step - loss: 26.3638

2023-04-19 11:06:07.920759: I tensorflow/core/profiler/lib/profiler_session.cc:99] Profiler session initializing.
2023-04-19 11:06:07.920787: I tensorflow/core/profiler/lib/profiler_session.cc:114] Profiler session started.


     18/Unknown - 2s 26ms/step - loss: 25.7740

2023-04-19 11:06:08.281547: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2023-04-19 11:06:08.290126: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1799] CUPTI activity buffer flushed
2023-04-19 11:06:08.597746: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:521]  GpuTracer has collected 1638 callback api events and 1617 activity events. 
2023-04-19 11:06:08.826227: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session tear down.
2023-04-19 11:06:09.457334: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: logs/fit/2023_Apr_19_11_06_06_lut_16_8_1/plugins/profile/2023_04_19_11_06_08

2023-04-19 11:06:10.157641: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to logs/fit/2023_Apr_19_11_06_06_lut_16_8_1/plugins/profile/2023_04_19_11_06_08/fba800be2533.trace.json.gz


     28/Unknown - 4s 103ms/step - loss: 24.7804

2023-04-19 11:06:10.407468: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: logs/fit/2023_Apr_19_11_06_06_lut_16_8_1/plugins/profile/2023_04_19_11_06_08

2023-04-19 11:06:10.413556: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for memory_profile.json.gz to logs/fit/2023_Apr_19_11_06_06_lut_16_8_1/plugins/profile/2023_04_19_11_06_08/fba800be2533.memory_profile.json.gz
2023-04-19 11:06:10.419421: I tensorflow/core/profiler/rpc/client/capture_profile.cc:251] Creating directory: logs/fit/2023_Apr_19_11_06_06_lut_16_8_1/plugins/profile/2023_04_19_11_06_08
Dumped tool data for xplane.pb to logs/fit/2023_Apr_19_11_06_06_lut_16_8_1/plugins/profile/2023_04_19_11_06_08/fba800be2533.xplane.pb
Dumped tool data for overview_page.pb to logs/fit/2023_Apr_19_11_06_06_lut_16_8_1/plugins/profile/2023_04_19_11_06_08/fba800be2533.overview_page.pb
Dumped tool data for input_pipeline.pb to logs/fit/2023_Apr_19_11_06_06_lut_16_8_1/plugins/

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 7: saving model to training/2023_Apr_19_11_06_06_lut_16_8_1/cp-0007.ckpt
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
157/770 [=====>........................] - ETA: 12s - loss: 28.7118
Epoch 14: saving model to training/2023_Apr_19_11_06_06_lut_16_8_1/cp-0014.ckpt
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 20: saving model to training/2023_Apr_19_11_06_06_lut_16_8_1/cp-0020.ckpt
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 27: saving model to training/2023_Apr_19_11_06_06_lut_16_8_1/cp-0027.ckpt
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
 13/770 [..............................] - ETA: 16s - loss: 5.3231
Epoch 34: saving model to training/2023_Apr_19_11_06_06_lut_16_8_1/cp-0034.ckpt
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 