In [2]:
from py_env_train import *

# ---------------------------------------------------------------------------
# Data specification.
# These values are passed to Func_Train.data_unique_name_generator and
# Func_Train.prepare_train further down in this cell.
# ---------------------------------------------------------------------------
model_data = ["HRES"]
reference_data = ["HSAF"]
variable = "pr"
task_name = "spatiotemporal"
mm = "MM"  # or DM
mask_type = "no_na"
laginensemble = False
date_start = "2020-07-01T13"
date_end = "2023-03-26T23"

# ---------------------------------------------------------------------------
# Network and training configuration.
# ---------------------------------------------------------------------------
# Input grid handed to Func_Train.UNET.
xpixels = 128
ypixels = 256
n_channels = 7
Filters = 32

# Loss, epoch budget and validation split.
loss = "mse"
epochs = 64
val_split = 0.2

# Early stopping / learning-rate schedule (ReduceLROnPlateau).
patience = 8
lr_patience = 2
min_LR = 0.0001

# NOTE(review): LR, BS and lr_factor are used later in this cell but are NOT
# assigned here; they are presumably provided by `py_env_train` or an earlier
# cell -- TODO confirm. The previously used values are kept for reference:
# LR = 0.01
# BS = 2
# lr_factor = 0.25

######################################################################################################################################################

# Build the identifiers that name the dataset file and this training run.
filename = Func_Train.data_unique_name_generator(
    model_data, reference_data, task_name, mm, date_start, date_end,
    variable, mask_type, laginensemble,
)
# Drop the last four characters of the file name (presumably a 4-character
# extension such as ".npz" -- TODO confirm against data_unique_name_generator).
data_unique_name = filename[:-4]

# LR, lr_factor and BS are not assigned in this cell; they must already be
# defined (e.g. via `py_env_train` or an earlier cell) or this raises NameError.
training_unique_name = Func_Train.generate_training_unique_name(
    loss, Filters, LR, min_LR, lr_factor, lr_patience, BS, patience,
    val_split, epochs,
)

print(data_unique_name, training_unique_name)

# Create the training data (if doesn't exist)
Func_Train.prepare_train(
    PPROJECT_DIR, TRAIN_FILES, ATMOS_DATA, filename, model_data,
    reference_data, task_name, mm, date_start, date_end, variable, mask_type,
    laginensemble, val_split, training_unique_name,
)

# Load the training data.
print("Loading training data...")
# The archive is opened as a context manager so its file handle is released as
# soon as the arrays have been read. Indexing materialises each array in
# memory, so the arrays stay valid after the archive is closed; `train_files`
# itself must not be indexed again after this block.
with np.load(TRAIN_FILES+"/"+filename) as train_files:
    train_x = train_files["train_x"]
    train_y = train_files["train_y"]
    train_m = train_files["train_m"]
    val_x = train_files["val_x"]
    val_y = train_files["val_y"]
    val_m = train_files["val_m"]

import os

# Build the network from the configured grid size, channel count and filters.
model = Func_Train.UNET(xpixels, ypixels, n_channels, Filters)

# Define optimizer and compile the model.
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, name='Adam')
# NOTE(review): Keras does not apply `sample_weight` to entries of `metrics`
# (only to the loss and to `weighted_metrics`), so the logged 'mse' is computed
# over all pixels, unlike the weighted loss. If a weighted metric is intended,
# consider `weighted_metrics=['mse']` -- TODO confirm the intent.
model.compile(optimizer=optimizer, loss=loss, metrics=['mse'])

# Output locations. Create the directories up front so that a missing folder
# cannot make checkpointing or the final CSV export fail after training.
model_dir = PPROJECT_DIR+'/AI MODELS/00-UNET/'
model_path = model_dir+training_unique_name+'.h5'
os.makedirs(model_dir, exist_ok=True)
os.makedirs(DUMP_RESULTS, exist_ok=True)

# Callbacks: early stopping, TensorBoard logging, best-model checkpointing and
# learning-rate reduction on plateau, all monitoring the validation loss.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss')
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=model_dir+training_unique_name)
checkpointer = tf.keras.callbacks.ModelCheckpoint(model_path, verbose=2, save_best_only=True, monitor='val_loss')
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=lr_factor, patience=lr_patience, min_lr=min_LR)

# One flat list (the original passed a list nested inside another list); the
# order matches the order the nested list would have been flattened to.
callbacks = [early_stopping, tensorboard, checkpointer, reduce_lr]

print("Training the model...")
# Train on (train_x, train_y) weighted per sample by train_m; validation uses
# (val_x, val_y) weighted by val_m.
results = model.fit(
    train_x, train_y,
    validation_data=(val_x, val_y, val_m),
    batch_size=BS, epochs=epochs, verbose=1,
    callbacks=callbacks, sample_weight=train_m,
)

# Save the per-epoch training history.
print("Saving and plotting the results...")
RESULTS_DF = pd.DataFrame(results.history)
RESULTS_DF.to_csv(DUMP_RESULTS+"/"+training_unique_name+".csv")

train_data_hourly.pr.['HRES'].['HSAF'].MM.6..spatiotemporal.128.256.2020-07-01T13.2023-03-26T23.no_na mse.32.0.01.0.0001.0.25.2.2.8.0.1.64
Loading training data...


2023-06-24 15:37:50.688776: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2023-06-24 15:37:50.874391: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:61:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.74GiB deviceMemoryBandwidth: 836.37GiB/s
2023-06-24 15:37:50.874862: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 1 with properties: 
pciBusID: 0000:62:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.74GiB deviceMemoryBandwidth: 836.37GiB/s
2023-06-24 15:37:50.875277: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 2 with properties: 
pciBusID: 0000:89:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.74GiB deviceMemoryBandwidth: 836.37GiB/s
2

Training the model...


2023-06-24 15:38:08.302138: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2023-06-24 15:38:08.314483: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2600000000 Hz


Epoch 1/64


2023-06-24 15:38:10.200626: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2023-06-24 15:38:12.377582: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8201
2023-06-24 15:38:16.870000: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-06-24 15:38:19.633890: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11


    1/10781 [..............................] - ETA: 40:12:11 - loss: 1.3392 - mse: 533413.3750

2023-06-24 15:38:22.182656: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2023-06-24 15:38:22.182695: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.


    3/10781 [..............................] - ETA: 1:46:28 - loss: 5.0166 - mse: 298184.5312

2023-06-24 15:38:22.885858: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2023-06-24 15:38:22.886246: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed
2023-06-24 15:38:22.974730: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 521 callback api events and 518 activity events. 
2023-06-24 15:38:22.983767: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2023-06-24 15:38:23.009159: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: /p/project/deepacf/kiste/patakchiyousefi1/AI MODELS/00-UNET/mse.32.0.01.0.0001.0.25.2.2.8.0.1.64/train/plugins/profile/2023_06_24_15_38_22
2023-06-24 15:38:23.016573: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to /p/project/deepacf/kiste/patakchiyousefi1/AI MODELS/00-UNET/mse.32.0.01.0.0001.0.25.2.2.8.0.1.64/train/pl


Epoch 00001: val_loss improved from inf to 0.12002, saving model to /p/project/deepacf/kiste/patakchiyousefi1/AI MODELS/00-UNET/mse.32.0.01.0.0001.0.25.2.2.8.0.1.64.h5
Epoch 2/64

Epoch 00002: val_loss improved from 0.12002 to 0.11628, saving model to /p/project/deepacf/kiste/patakchiyousefi1/AI MODELS/00-UNET/mse.32.0.01.0.0001.0.25.2.2.8.0.1.64.h5
Epoch 3/64

Epoch 00003: val_loss improved from 0.11628 to 0.11518, saving model to /p/project/deepacf/kiste/patakchiyousefi1/AI MODELS/00-UNET/mse.32.0.01.0.0001.0.25.2.2.8.0.1.64.h5
Epoch 4/64

Epoch 00004: val_loss did not improve from 0.11518
Epoch 5/64

Epoch 00005: val_loss did not improve from 0.11518
Epoch 6/64

Epoch 00006: val_loss did not improve from 0.11518
Epoch 7/64

Epoch 00007: val_loss did not improve from 0.11518
Epoch 8/64

Epoch 00008: val_loss did not improve from 0.11518
Epoch 9/64

Epoch 00009: val_loss did not improve from 0.11518
Epoch 10/64

Epoch 00010: val_loss improved from 0.11518 to 0.11494, saving model to 