In [32]:
from py_env_train import *

# Configuration shared by all experiment cells in this notebook.

# --- Training setup ---
loss = "mse"        # loss identifier handed to model.compile
LR = 0.1            # initial learning rate for the Adam optimizer
BS = 2              # batch size
epochs = 64         # maximum number of epochs
Filters = 16        # filter count handed to the U-Net builder
patience = 8        # EarlyStopping patience (epochs without val_loss improvement)
val_split = 0.5

# --- ReduceLROnPlateau setup ---
lr_patience = 2
lr_factor = 0.5
# Lower bound for the learning rate. This used to be None, which is not a
# numeric floor for ReduceLROnPlateau(min_lr=...); 0.0 is the Keras default
# and means "no lower bound". Use e.g. 0.0001 to enforce a real floor.
min_LR = 0.0

# --- Synthetic data dimensions ---
n_samples = 10
n_channels = 7
xpixels = 128
ypixels = 256
# Short aliases used when building the arrays; derived from xpixels/ypixels
# so the data shape can never drift away from the model input shape.
x = xpixels
y = ypixels

Experiment 1) Baseline: `train_x` and `train_y` are exactly the same as `val_x` and `val_y`, and all sample weights are 1.

In [33]:
import random

# Seed every RNG involved so the experiment is reproducible. The inputs are
# drawn with np.random (which random.seed alone does not affect) and the
# network weights are initialised by TensorFlow.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Inputs: two constant channels (all 1s and all 2s) followed by
# n_channels - 2 channels of uniform noise, stacked on the last axis.
random_data = np.random.rand(n_samples, x, y, n_channels - 2)
channel1_data = np.ones((n_samples, x, y))
channel2_data = np.ones((n_samples, x, y)) * 2
train_x = np.concatenate((channel1_data[..., np.newaxis],
                          channel2_data[..., np.newaxis],
                          random_data), axis=-1)

# Target: constant 10 everywhere.
train_y = np.full((n_samples, x, y, 1), 10, dtype=np.float32)

# Validation data is an independent copy of the training data.
val_x = train_x.copy()
val_y = train_y.copy()

# Per-pixel weights: everything counts in both training and validation.
train_m = np.ones((n_samples, x, y, 1), dtype=np.float32)
val_m = np.ones((n_samples, x, y, 1), dtype=np.float32)

training_unique_name = "experiment_sample_weight"
model = Func_Train.UNET(xpixels, ypixels, n_channels, Filters)

# Define optimizer and compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, name='Adam')
model.compile(optimizer=optimizer, loss=loss, weighted_metrics=[])

# Checkpointing, early stopping, TensorBoard logging and LR scheduling.
# NOTE(review): every experiment in this notebook writes to the same
# checkpoint file / log directory; consider a unique name per experiment.
model_path = PSCRATCH_DIR + '/HPT/' + training_unique_name + '.h5'
checkpointer = tf.keras.callbacks.ModelCheckpoint(model_path, verbose=2, save_best_only=True, monitor='val_loss')
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=lr_factor, patience=lr_patience,
    # min_lr must be numeric; fall back to the Keras default if unset.
    min_lr=0.0 if min_LR is None else min_LR)
# One flat list of callbacks (previously a list nested inside a list).
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.TensorBoard(log_dir=PSCRATCH_DIR + '/HPT/' + training_unique_name),
             checkpointer,
             reduce_lr]

print("Training the model...")

# Train the model using train_x, train_y, train_m and val_x, val_y, val_m
results = model.fit(train_x, train_y, validation_data=(val_x, val_y, val_m),
                    batch_size=BS, epochs=epochs, verbose=2,
                    callbacks=callbacks, sample_weight=train_m,
                    shuffle=False)

Training the model...
Epoch 1/64

Epoch 1: val_loss improved from inf to 3524599421801070592.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 8s - loss: 81.4837 - val_loss: 3524599421801070592.0000 - lr: 0.1000 - 8s/epoch - 2s/step
Epoch 2/64

Epoch 2: val_loss improved from 3524599421801070592.00000 to 3300407902208.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 1s - loss: 13.9630 - val_loss: 3300407902208.0000 - lr: 0.1000 - 1s/epoch - 246ms/step
Epoch 3/64

Epoch 3: val_loss improved from 3300407902208.00000 to 637459968.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 1s - loss: 14.8164 - val_loss: 637459968.0000 - lr: 0.1000 - 1s/epoch - 295ms/step
Epoch 4/64

Epoch 4: val_loss improved from 637459968.00000 to 414714.87500, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5

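Although train_x = val_x and train_y = val_y, the values of loss and val_loss differ, which is surprising at first. This is because loss is a running average over the batches of the epoch, computed while the weights are still being updated, whereas val_loss is computed once at the end of the epoch over the whole validation data. (If the network contains layers such as batch normalization or dropout, these also behave differently in training and inference mode, which can widen the gap further.)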

Experiment 2) Now, let's assume one of the samples in `train_y` and `val_y` is NaN, and we have replaced it with -999. In `train_m`, we have assigned a weight of 0 to this sample to exclude it from training, but we have not done the same in `val_m`.

We expect the loss values to be unaffected, while the val_loss values should be affected.

In [35]:
import random

# Seed every RNG involved so the experiment is reproducible. The inputs are
# drawn with np.random (which random.seed alone does not affect) and the
# network weights are initialised by TensorFlow.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Inputs: two constant channels (all 1s and all 2s) followed by
# n_channels - 2 channels of uniform noise, stacked on the last axis.
random_data = np.random.rand(n_samples, x, y, n_channels - 2)
channel1_data = np.ones((n_samples, x, y))
channel2_data = np.ones((n_samples, x, y)) * 2
train_x = np.concatenate((channel1_data[..., np.newaxis],
                          channel2_data[..., np.newaxis],
                          random_data), axis=-1)

# Target: constant 10 everywhere, except sample 5 which is entirely the
# -999 no-data placeholder.
train_y = np.full((n_samples, x, y, 1), 10, dtype=np.float32)
train_y[5, ...] = -999.

# Validation data is an independent copy of the training data.
val_x = train_x.copy()
val_y = train_y.copy()

# Sample 5 gets zero weight in training only; validation still weights
# every sample with 1.
train_m = np.ones((n_samples, x, y, 1), dtype=np.float32)
train_m[5, ...] = 0
val_m = np.ones((n_samples, x, y, 1), dtype=np.float32)

training_unique_name = "experiment_sample_weight"
model = Func_Train.UNET(xpixels, ypixels, n_channels, Filters)

# Define optimizer and compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, name='Adam')
model.compile(optimizer=optimizer, loss=loss, weighted_metrics=[])

# Checkpointing, early stopping, TensorBoard logging and LR scheduling.
# NOTE(review): every experiment in this notebook writes to the same
# checkpoint file / log directory; consider a unique name per experiment.
model_path = PSCRATCH_DIR + '/HPT/' + training_unique_name + '.h5'
checkpointer = tf.keras.callbacks.ModelCheckpoint(model_path, verbose=2, save_best_only=True, monitor='val_loss')
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=lr_factor, patience=lr_patience,
    # min_lr must be numeric; fall back to the Keras default if unset.
    min_lr=0.0 if min_LR is None else min_LR)
# One flat list of callbacks (previously a list nested inside a list).
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.TensorBoard(log_dir=PSCRATCH_DIR + '/HPT/' + training_unique_name),
             checkpointer,
             reduce_lr]

print("Training the model...")

# Train the model using train_x, train_y, train_m and val_x, val_y, val_m
results = model.fit(train_x, train_y, validation_data=(val_x, val_y, val_m),
                    batch_size=BS, epochs=epochs, verbose=2,
                    callbacks=callbacks, sample_weight=train_m,
                    shuffle=False)

Training the model...
Epoch 1/64

Epoch 1: val_loss improved from inf to 7648386492661760.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 7s - loss: 73.4162 - val_loss: 7648386492661760.0000 - lr: 0.1000 - 7s/epoch - 1s/step
Epoch 2/64

Epoch 2: val_loss improved from 7648386492661760.00000 to 251752267776.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 1s - loss: 14.2294 - val_loss: 251752267776.0000 - lr: 0.1000 - 1s/epoch - 265ms/step
Epoch 3/64

Epoch 3: val_loss improved from 251752267776.00000 to 31888140288.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 1s - loss: 11.8853 - val_loss: 31888140288.0000 - lr: 0.1000 - 1s/epoch - 260ms/step
Epoch 4/64

Epoch 4: val_loss improved from 31888140288.00000 to 134961056.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 

The results from Experiment 2 were exactly as expected. 

Experiment 3) Now, let's also set the weight of that sample to 0 in `val_m`.

In [36]:
import random

# Seed every RNG involved so the experiment is reproducible. The inputs are
# drawn with np.random (which random.seed alone does not affect) and the
# network weights are initialised by TensorFlow.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Inputs: two constant channels (all 1s and all 2s) followed by
# n_channels - 2 channels of uniform noise, stacked on the last axis.
random_data = np.random.rand(n_samples, x, y, n_channels - 2)
channel1_data = np.ones((n_samples, x, y))
channel2_data = np.ones((n_samples, x, y)) * 2
train_x = np.concatenate((channel1_data[..., np.newaxis],
                          channel2_data[..., np.newaxis],
                          random_data), axis=-1)

# Target: constant 10 everywhere, except sample 5 which is entirely the
# -999 no-data placeholder.
train_y = np.full((n_samples, x, y, 1), 10, dtype=np.float32)
train_y[5, ...] = -999.

# Validation data is an independent copy of the training data.
val_x = train_x.copy()
val_y = train_y.copy()

# Sample 5 gets zero weight in both training and validation.
train_m = np.ones((n_samples, x, y, 1), dtype=np.float32)
train_m[5, ...] = 0
val_m = np.ones((n_samples, x, y, 1), dtype=np.float32)
val_m[5, ...] = 0

training_unique_name = "experiment_sample_weight"
model = Func_Train.UNET(xpixels, ypixels, n_channels, Filters)

# Define optimizer and compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, name='Adam')
model.compile(optimizer=optimizer, loss=loss, weighted_metrics=[])

# Checkpointing, early stopping, TensorBoard logging and LR scheduling.
# NOTE(review): every experiment in this notebook writes to the same
# checkpoint file / log directory; consider a unique name per experiment.
model_path = PSCRATCH_DIR + '/HPT/' + training_unique_name + '.h5'
checkpointer = tf.keras.callbacks.ModelCheckpoint(model_path, verbose=2, save_best_only=True, monitor='val_loss')
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=lr_factor, patience=lr_patience,
    # min_lr must be numeric; fall back to the Keras default if unset.
    min_lr=0.0 if min_LR is None else min_LR)
# One flat list of callbacks (previously a list nested inside a list).
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.TensorBoard(log_dir=PSCRATCH_DIR + '/HPT/' + training_unique_name),
             checkpointer,
             reduce_lr]

print("Training the model...")

# Train the model using train_x, train_y, train_m and val_x, val_y, val_m
results = model.fit(train_x, train_y, validation_data=(val_x, val_y, val_m),
                    batch_size=BS, epochs=epochs, verbose=2,
                    callbacks=callbacks, sample_weight=train_m,
                    shuffle=False)

Training the model...
Epoch 1/64

Epoch 1: val_loss improved from inf to 99659389337600.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 7s - loss: 73.2670 - val_loss: 99659389337600.0000 - lr: 0.1000 - 7s/epoch - 1s/step
Epoch 2/64

Epoch 2: val_loss improved from 99659389337600.00000 to 18863952.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 1s - loss: 13.9240 - val_loss: 18863952.0000 - lr: 0.1000 - 1s/epoch - 296ms/step
Epoch 3/64

Epoch 3: val_loss improved from 18863952.00000 to 10351.16406, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 1s - loss: 11.8845 - val_loss: 10351.1641 - lr: 0.1000 - 1s/epoch - 259ms/step
Epoch 4/64

Epoch 4: val_loss did not improve from 10351.16406
5/5 - 1s - loss: 1.8368 - val_loss: 18751.5781 - lr: 0.1000 - 1s/epoch - 207ms/step
Epoch 5/64

Epoch 5: val_loss improved from 10351.16406 to 188

The results from Experiment 3 were exactly as expected. 

Experiment 4) Now, let's apply the same missing-data (NaN) problem, but this time varying in time and space: certain (t, x, y) points are not available, and we want to neither train nor validate on them. We want to test whether `sample_weight` is able to mask out those points.

In [37]:
import random

# Seed every RNG involved so the experiment is reproducible. The inputs are
# drawn with np.random (which random.seed alone does not affect) and the
# network weights are initialised by TensorFlow.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Inputs: two constant channels (all 1s and all 2s) followed by
# n_channels - 2 channels of uniform noise, stacked on the last axis.
random_data = np.random.rand(n_samples, x, y, n_channels - 2)
channel1_data = np.ones((n_samples, x, y))
channel2_data = np.ones((n_samples, x, y)) * 2
train_x = np.concatenate((channel1_data[..., np.newaxis],
                          channel2_data[..., np.newaxis],
                          random_data), axis=-1)

# Target: constant 10, with several regions overwritten by the -999
# no-data placeholder: one patch in sample 5, one patch in sample 1, and a
# band of rows 50..59 in every sample.
train_y = np.full((n_samples, x, y, 1), 10, dtype=np.float32)
train_y[5, 10:30, 25:150, 0] = -999.
train_y[1, 0:30, 50:180, 0] = -999.
train_y[:, 50:60, :, 0] = -999.

# Validation data is an independent copy of the training data.
val_x = train_x.copy()
val_y = train_y.copy()

# Per-pixel weights: 0 wherever the target holds the placeholder, 1 elsewhere.
train_m = np.ones((n_samples, x, y, 1), dtype=np.float32)
train_m[train_y == -999.] = 0
val_m = np.ones((n_samples, x, y, 1), dtype=np.float32)
val_m[val_y == -999.] = 0

training_unique_name = "experiment_sample_weight"
model = Func_Train.UNET(xpixels, ypixels, n_channels, Filters)

# Define optimizer and compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, name='Adam')
model.compile(optimizer=optimizer, loss=loss, weighted_metrics=[])

# Checkpointing, early stopping, TensorBoard logging and LR scheduling.
# NOTE(review): every experiment in this notebook writes to the same
# checkpoint file / log directory; consider a unique name per experiment.
model_path = PSCRATCH_DIR + '/HPT/' + training_unique_name + '.h5'
checkpointer = tf.keras.callbacks.ModelCheckpoint(model_path, verbose=2, save_best_only=True, monitor='val_loss')
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=lr_factor, patience=lr_patience,
    # min_lr must be numeric; fall back to the Keras default if unset.
    min_lr=0.0 if min_LR is None else min_LR)
# One flat list of callbacks (previously a list nested inside a list).
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.TensorBoard(log_dir=PSCRATCH_DIR + '/HPT/' + training_unique_name),
             checkpointer,
             reduce_lr]

print("Training the model...")

# Train the model using train_x, train_y, train_m and val_x, val_y, val_m
results = model.fit(train_x, train_y, validation_data=(val_x, val_y, val_m),
                    batch_size=BS, epochs=epochs, verbose=2,
                    callbacks=callbacks, sample_weight=train_m,
                    shuffle=False)

Training the model...
Epoch 1/64

Epoch 1: val_loss improved from inf to 220793950320489136128.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 7s - loss: 77.2956 - val_loss: 220793950320489136128.0000 - lr: 0.1000 - 7s/epoch - 1s/step
Epoch 2/64

Epoch 2: val_loss improved from 220793950320489136128.00000 to 824738544877568.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 1s - loss: 18.2377 - val_loss: 824738544877568.0000 - lr: 0.1000 - 1s/epoch - 255ms/step
Epoch 3/64

Epoch 3: val_loss improved from 824738544877568.00000 to 1538992111616.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/experiment_sample_weight.h5
5/5 - 1s - loss: 13.2547 - val_loss: 1538992111616.0000 - lr: 0.1000 - 1s/epoch - 274ms/step
Epoch 4/64

Epoch 4: val_loss improved from 1538992111616.00000 to 7288320000.00000, saving model to /p/scratch/deepacf/kiste/patakchiyousefi1//HPT/exp

So far, so good.

Experiment 5) Now, let's add an 'mse' metric and see whether it takes the sample weights into account when computing the training and validation MSE.

In [None]:
import random

# Seed every RNG involved so the experiment is reproducible. The inputs are
# drawn with np.random (which random.seed alone does not affect) and the
# network weights are initialised by TensorFlow.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Inputs: two constant channels (all 1s and all 2s) followed by
# n_channels - 2 channels of uniform noise, stacked on the last axis.
random_data = np.random.rand(n_samples, x, y, n_channels - 2)
channel1_data = np.ones((n_samples, x, y))
channel2_data = np.ones((n_samples, x, y)) * 2
train_x = np.concatenate((channel1_data[..., np.newaxis],
                          channel2_data[..., np.newaxis],
                          random_data), axis=-1)

# Target: constant 10, with several regions overwritten by the -999
# no-data placeholder: one patch in sample 5, one patch in sample 1, and a
# band of rows 50..59 in every sample.
train_y = np.full((n_samples, x, y, 1), 10, dtype=np.float32)
train_y[5, 10:30, 25:150, 0] = -999.
train_y[1, 0:30, 50:180, 0] = -999.
train_y[:, 50:60, :, 0] = -999.

# Validation data is an independent copy of the training data.
val_x = train_x.copy()
val_y = train_y.copy()

# Per-pixel weights: 0 wherever the target holds the placeholder, 1 elsewhere.
train_m = np.ones((n_samples, x, y, 1), dtype=np.float32)
train_m[train_y == -999.] = 0
val_m = np.ones((n_samples, x, y, 1), dtype=np.float32)
val_m[val_y == -999.] = 0

training_unique_name = "experiment_sample_weight"
model = Func_Train.UNET(xpixels, ypixels, n_channels, Filters)

# Define optimizer and compile the model. 'mse' is registered as a weighted
# metric so it can be compared against the (weighted) loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, name='Adam')
model.compile(optimizer=optimizer, loss=loss, weighted_metrics=['mse'])

# Checkpointing, early stopping, TensorBoard logging and LR scheduling.
# NOTE(review): every experiment in this notebook writes to the same
# checkpoint file / log directory; consider a unique name per experiment.
model_path = PSCRATCH_DIR + '/HPT/' + training_unique_name + '.h5'
checkpointer = tf.keras.callbacks.ModelCheckpoint(model_path, verbose=2, save_best_only=True, monitor='val_loss')
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=lr_factor, patience=lr_patience,
    # min_lr must be numeric; fall back to the Keras default if unset.
    min_lr=0.0 if min_LR is None else min_LR)
# One flat list of callbacks (previously a list nested inside a list).
callbacks = [tf.keras.callbacks.EarlyStopping(patience=patience, monitor='val_loss'),
             tf.keras.callbacks.TensorBoard(log_dir=PSCRATCH_DIR + '/HPT/' + training_unique_name),
             checkpointer,
             reduce_lr]

print("Training the model...")

# Train the model using train_x, train_y, train_m and val_x, val_y, val_m
results = model.fit(train_x, train_y, validation_data=(val_x, val_y, val_m),
                    batch_size=BS, epochs=epochs, verbose=2,
                    callbacks=callbacks, sample_weight=train_m,
                    shuffle=False)