In [1]:
import os
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import time
import tensorflow_datasets as tfds
import numpy as np

# Load the Cityscapes dataset
dataset, info = tfds.load('cityscapes', split='train', with_info=True)
val_dataset = tfds.load('cityscapes', split='validation')
test_dataset = tfds.load('cityscapes', split='test')

# Define preprocessing functions

def preprocess_image(features):
    image = features['image_left']
    label = features['segmentation_label']
    image = tf.image.resize(image, (256, 512))
    label = tf.image.resize(label, (256, 512), method='nearest')
    image = tf.cast(image, tf.float32) / 255.0
    label = tf.cast(label, tf.int32)
    return image, label
    


# Preprocess the dataset
dataset = dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(2).prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.batch(2).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset = test_dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(1).prefetch(tf.data.experimental.AUTOTUNE)

2024-07-30 20:32:28.957124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 20:32:28.978346: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 20:32:28.984913: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-30 20:32:29.000723: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1722364351.570256    1525 cuda_executor.c

In [2]:
# Define the U-Net model architecture
def unet(input_size=(256, 512, 3)):
    inputs = tf.keras.Input(input_size)
    
    # Encoder
    c1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    c1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(c1)
    p1 = layers.MaxPooling2D((2, 2))(c1)
    
    c2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(p1)
    c2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(c2)
    p2 = layers.MaxPooling2D((2, 2))(c2)
    
    c3 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(p2)
    c3 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(c3)
    p3 = layers.MaxPooling2D((2, 2))(c3)
    
    c4 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(p3)
    c4 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(c4)
    p4 = layers.MaxPooling2D((2, 2))(c4)
    
    # Bottleneck
    c5 = layers.Conv2D(1024, (3, 3), activation='relu', padding='same')(p4)
    c5 = layers.Conv2D(1024, (3, 3), activation='relu', padding='same')(c5)
    
    # Decoder
    u6 = layers.Conv2DTranspose(512, (2, 2), strides=(2, 2), padding='same')(c5)
    u6 = layers.concatenate([u6, c4])
    c6 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(u6)
    c6 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(c6)
    
    u7 = layers.Conv2DTranspose(256, (2, 2), strides=(2, 2), padding='same')(c6)
    u7 = layers.concatenate([u7, c3])
    c7 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(u7)
    c7 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(c7)
    
    u8 = layers.Conv2DTranspose(128, (2, 2), strides=(2, 2), padding='same')(c7)
    u8 = layers.concatenate([u8, c2])
    c8 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(u8)
    c8 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(c8)
    
    u9 = layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(c8)
    u9 = layers.concatenate([u9, c1])
    c9 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(u9)
    c9 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(c9)
    
    outputs = layers.Conv2D(34, (1, 1), activation='softmax')(c9)  # 34 classes in Cityscapes dataset
    
    model = models.Model(inputs=[inputs], outputs=[outputs])
    return model

# Create the U-Net model
model = unet(input_size=(256, 512, 3))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

In [3]:
tf.keras.backend.clear_session()
# Define callbacks
checkpoint_filepath = 'unet_best_model.weights.h5'
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Train the model and measure the time taken
start_time = time.time()
history = model.fit(dataset, epochs=10, validation_data=val_dataset, callbacks=[model_checkpoint_callback])
end_time = time.time()

# Print the time taken to train the model
print(f"Time taken to train the model: {end_time - start_time:.2f} seconds")

Epoch 1/10


I0000 00:00:1722347220.844548    1013 service.cc:146] XLA service 0x7f50b40096c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722347220.844622    1013 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1050 Ti, Compute Capability 6.1
2024-07-30 15:48:00.460282: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
W0000 00:00:1722347281.424825    1013 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2024-07-30 15:48:02.572787: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
2024-07-30 15:48:27.602252: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.30GiB with freed_by_count=0. The caller indicates that this

[1m1487/1488[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 512ms/step - accuracy: 0.4731 - loss: 1.9349

W0000 00:00:1722348086.186057    1016 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2024-07-30 16:01:36.796169: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.15GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 527ms/step - accuracy: 0.4732 - loss: 1.9346

W0000 00:00:1722348112.164844    1015 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2024-07-30 16:01:53.732594: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.27GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-30 16:01:56.958370: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 32.52GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-30 16:01:59.168987: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.17GiB with freed_by_count=0. The caller i

[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m975s[0m 570ms/step - accuracy: 0.4733 - loss: 1.9343 - val_accuracy: 0.7023 - val_loss: 0.9996
Epoch 2/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m847s[0m 564ms/step - accuracy: 0.7138 - loss: 0.9696 - val_accuracy: 0.7463 - val_loss: 0.8746
Epoch 3/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m843s[0m 560ms/step - accuracy: 0.7535 - loss: 0.8609 - val_accuracy: 0.7751 - val_loss: 0.7780
Epoch 4/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m854s[0m 570ms/step - accuracy: 0.7902 - loss: 0.7483 - val_accuracy: 0.7869 - val_loss: 0.7400
Epoch 5/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m846s[0m 564ms/step - accuracy: 0.8103 - loss: 0.6872 - val_accuracy: 0.7991 - val_loss: 0.7054
Epoch 6/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m866s[0m 577ms/step - accuracy: 0.8231 - loss: 0.6461 - val_accuracy: 0.8213 - val_loss: 0.6323
Epo

In [4]:
tf.keras.backend.clear_session()
model.load_weights('unet_best_model.weights.h5')

dummy_batch = next(iter(test_dataset))
dummy_image = dummy_batch[0]
model.predict(dummy_image, verbose=0)  # Warm-up the model

total_time=0

for i in range(5):
    start_time = time.time()

    for batch in test_dataset:
        image = batch[0]
        predictions = model.predict(image, verbose=0)

    end_time = time.time()
    print("run", i+1)
    print(f"Inference Time: {end_time - start_time:.2f} seconds")
    print(f"Number of Images: {len(test_dataset)}")
    print(f"Average Inference Time per Image: {(end_time - start_time)/len(test_dataset)*1000:.0f}ms")
    print("----------------")
    total_time += end_time - start_time

print(f"Average Inference Time: {(end_time - start_time)/5:.2f} seconds")
print(f"Number of Images: {len(test_dataset)}")
print(f"Average Inference Time per Image: {total_time/5/len(test_dataset)*1000:.0f}ms")

  saveable.load_own_variables(weights_store.get(inner_path))
I0000 00:00:1722364436.411557    1602 service.cc:146] XLA service 0x7fee0c00d930 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722364436.411639    1602 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1050 Ti, Compute Capability 6.1
2024-07-30 20:33:56.753705: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-30 20:33:57.596282: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
2024-07-30 20:34:05.036784: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 16.55GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-30 20:34:05.84

run 1
Inference Time: 233.99 seconds
Number of Images: 1525
Average Inference Time per Image: 153ms
----------------


2024-07-30 20:43:10.793958: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


run 2
Inference Time: 303.73 seconds
Number of Images: 1525
Average Inference Time per Image: 199ms
----------------
run 3
Inference Time: 293.18 seconds
Number of Images: 1525
Average Inference Time per Image: 192ms
----------------


2024-07-30 20:52:48.085173: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


run 4
Inference Time: 284.11 seconds
Number of Images: 1525
Average Inference Time per Image: 186ms
----------------
run 5
Inference Time: 329.91 seconds
Number of Images: 1525
Average Inference Time per Image: 216ms
----------------
Average Inference Time: 65.98 seconds
Number of Images: 1525
Average Inference Time per Image: 189ms
