## Environment setup

In [1]:
import glob, os, time
from datetime import datetime
from functools import partial
import importlib

import numpy as np

import tensorflow as tf

## Network definitions and adaptations

In [2]:
# Star import: brings the classic-NN helper functions called below into the notebook namespace.
from architecture_definitions import *
 
# Suffix appended to this run's log and checkpoint directory names (used by the training cell).
dir_postfix = get_classic_nn_dir_postfix() 
    
# Print the configured dense-layer sizes so they are recorded in the notebook output.
print_Classic_NN()  

Classic NN definitions:
dense_layer1_size: 128
dense_layer2_size: 64
dense_layer3_size: 48
dense_layer4_size: 1
------------------------


### Training data set preparation

In [3]:
import io_functions as io
importlib.reload(io)  # pick up edits to io_functions.py without restarting the kernel

batchSize = 4096
nEpochs = 1

# Directory holding the gzipped TFRecord training files.
trainDataDir = "/scratch_cmsse/alibordi/data/training/"

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the list of input files (and the order they are handed to the reader)
# reproducible between runs and machines.
trainFileNames = sorted(glob.glob(trainDataDir+'*tfrecord.gzip'))

# Fail here with a clear message instead of building an empty dataset that
# only surfaces as an obscure error (or a no-op training) later on.
if not trainFileNames:
    raise FileNotFoundError("No '*tfrecord.gzip' files found in " + trainDataDir)

dataset = io.get_Classic_NN_dataset(batchSize, nEpochs, trainFileNames, isTrain=True)

print("dataset", dataset)

Reading data from files:
/scratch_cmsse/alibordi/data/training/SingleMu_OneOverPt_April4_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_iPtX_April4_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_iPtX_Feb22_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_OneOverPt_Feb22_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_iPtX_Feb15_chunk_0_filtered.tfrecord.gzip
/scratch_cmsse/alibordi/data/training/SingleMu_OneOverPt_Feb15_chunk_0_filtered.tfrecord.gzip


2023-04-19 09:19:41.203375: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 09:19:41.226840: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 09:19:41.227034: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 09:19:41.227699: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

dataset <ParallelMapDataset element_spec=(TensorSpec(shape=(4096, 37), dtype=tf.float16, name=None), (TensorSpec(shape=(4096,), dtype=tf.float16, name=None),), TensorSpec(shape=(4096,), dtype=tf.float16, name=None))>


### Model definition

In [8]:
import model_functions as models
importlib.reload(models)

import io_functions as io
importlib.reload(io)

from keras import mixed_precision

# Keras layers adopt the dtype policy that is active when they are constructed.
# Set the policy here, before the model is built, so that a fresh
# "Restart & Run All" creates the network under mixed_float16 regardless of
# where the policy is (re)set elsewhere in the notebook.
mixed_precision.set_global_policy('mixed_float16')

# Input width is derived from the feature mask: 2 * (number of set mask
# entries) + 1. For the recorded run this evaluates to 37, consistent with the
# 4864 parameters of the first 128-unit dense layer in the summary below.
networkInputSize = 2 * np.sum(io.getFeaturesMask()) + 1
loss_fn = 'mae'

model = models.get_Classic_NN(networkInputSize=networkInputSize, loss_fn=loss_fn)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 pt_layer_1 (Dense)          (None, 128)               4864      
                                                                 
 pt_layer_2 (Dense)          (None, 64)                8256      
                                                                 
 pt_layer_3 (Dense)          (None, 48)                3120      
                                                                 
 dense_1 (Dense)             (None, 1)                 49        
                                                                 
Total params: 16,289
Trainable params: 16,289
Non-trainable params: 0
_________________________________________________________________


### The training loop

In [5]:
from keras import mixed_precision

# Switch Keras to mixed precision globally and report the resulting dtypes.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

print(f'Compute dtype: {policy.compute_dtype}')
print(f'Variable dtype: {policy.variable_dtype}')

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 2070 SUPER, compute capability 7.5
Compute dtype: float16
Variable dtype: float32


2023-04-19 09:21:26.233300: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [10]:
%%time
current_time = datetime.now().strftime("%Y_%b_%d_%H_%M_%S")
print("Training start. Current Time =", current_time)

nEpochs = 50

log_dir = "logs/fit/" + current_time + dir_postfix
job_dir = "training/" + current_time + dir_postfix

checkpoint_path = job_dir + "/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1,
                                                 save_freq = 5085)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=(10, 20))
early_stop_callback = tf.keras.callbacks.EarlyStopping(patience=5, verbose=1)
callbacks =  [tensorboard_callback, cp_callback, early_stop_callback]

model.save_weights(checkpoint_path.format(epoch=0))
   
model.fit(dataset.skip(10), 
          epochs=nEpochs, shuffle=True,
          callbacks=callbacks,
          validation_data = dataset.take(10)
            )
model.save(job_dir, save_format='tf')

current_time = datetime.now().strftime("%Y_%b_%d_%H_%M_%S")
print("Training end. Current Time =", current_time)

Training start. Current Time = 2023_Apr_19_09_31_47
Epoch 1/50


2023-04-19 09:31:47.792785: I tensorflow/core/profiler/lib/profiler_session.cc:99] Profiler session initializing.
2023-04-19 09:31:47.792808: I tensorflow/core/profiler/lib/profiler_session.cc:114] Profiler session started.
2023-04-19 09:31:47.879673: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session tear down.
2023-04-19 09:31:47.881408: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1799] CUPTI activity buffer flushed


     14/Unknown - 1s 24ms/step - loss: 8.5457

2023-04-19 09:31:48.373033: I tensorflow/core/profiler/lib/profiler_session.cc:99] Profiler session initializing.
2023-04-19 09:31:48.373056: I tensorflow/core/profiler/lib/profiler_session.cc:114] Profiler session started.


     18/Unknown - 1s 25ms/step - loss: 8.4266

2023-04-19 09:31:48.703752: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2023-04-19 09:31:48.711037: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1799] CUPTI activity buffer flushed
2023-04-19 09:31:49.055093: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:521]  GpuTracer has collected 2122 callback api events and 2101 activity events. 
2023-04-19 09:31:49.361172: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session tear down.
2023-04-19 09:31:50.036954: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: logs/fit/2023_Apr_19_09_31_47_classic_128_64_48_1/plugins/profile/2023_04_19_09_31_49

2023-04-19 09:31:50.786620: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to logs/fit/2023_Apr_19_09_31_47_classic_128_64_48_1/plugins/profile/2023_04_19_09_31_49/fba800be2533.trace.json.gz


     30/Unknown - 3s 105ms/step - loss: 8.3565

2023-04-19 09:31:51.081916: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: logs/fit/2023_Apr_19_09_31_47_classic_128_64_48_1/plugins/profile/2023_04_19_09_31_49

2023-04-19 09:31:51.087658: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for memory_profile.json.gz to logs/fit/2023_Apr_19_09_31_47_classic_128_64_48_1/plugins/profile/2023_04_19_09_31_49/fba800be2533.memory_profile.json.gz
2023-04-19 09:31:51.094186: I tensorflow/core/profiler/rpc/client/capture_profile.cc:251] Creating directory: logs/fit/2023_Apr_19_09_31_47_classic_128_64_48_1/plugins/profile/2023_04_19_09_31_49
Dumped tool data for xplane.pb to logs/fit/2023_Apr_19_09_31_47_classic_128_64_48_1/plugins/profile/2023_04_19_09_31_49/fba800be2533.xplane.pb
Dumped tool data for overview_page.pb to logs/fit/2023_Apr_19_09_31_47_classic_128_64_48_1/plugins/profile/2023_04_19_09_31_49/fba800be2533.overview_page.pb
Dumped tool data for input_pipeline.pb to logs

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
167/983 [====>.........................] - ETA: 17s - loss: 33.9803
Epoch 6: saving model to training/2023_Apr_19_09_31_47_classic_128_64_48_1/cp-0006.ckpt
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 11: saving model to training/2023_Apr_19_09_31_47_classic_128_64_48_1/cp-0011.ckpt
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 16: saving model to training/2023_Apr_19_09_31_47_classic_128_64_48_1/cp-0016.ckpt
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 21: saving model to training/2023_Apr_19_09_31_47_classic_128_64_48_1/cp-0021.ckpt
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 24: early stopping
INFO:tensorflow:Assets written to: training/2023_Apr_19_09_31_47_classic_128_64_48_1/assets
Training end. Current Time = 2023_Apr_19_09_40_15
CPU times: user 28min 57s, sys: 6min 58s, total: 35min 56s
Wall time: 8min 27s
