In [1]:
import os
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import time
import tensorflow_datasets as tfds
import numpy as np

# Load the Cityscapes dataset
dataset, info = tfds.load('cityscapes', split='train', with_info=True)
val_dataset = tfds.load('cityscapes', split='validation')
test_dataset = tfds.load('cityscapes', split='test')

# Define preprocessing functions

def preprocess_image(features):
    image = features['image_left']
    label = features['segmentation_label']
    image = tf.image.resize(image, (256, 512))
    label = tf.image.resize(label, (256, 512), method='nearest')
    image = tf.cast(image, tf.float32) / 255.0
    label = tf.cast(label, tf.int32)
    return image, label
    

# Preprocess the dataset
dataset = dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(2).prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.batch(2).prefetch(tf.data.experimental.AUTOTUNE)

2024-07-31 23:31:29.140131: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 23:31:29.817024: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 23:31:29.932913: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-31 23:31:30.715088: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1722461510.284955  100702 cuda_executor.c

In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models

def initial_block(input_tensor):
    x = layers.Conv2D(13, (3, 3), strides=2, padding='same')(input_tensor)
    x = layers.PReLU()(x)
    pool = layers.MaxPooling2D((2, 2), strides=2)(input_tensor)
    concat = layers.concatenate([x, pool], axis=-1)
    return concat

def bottleneck_block(input_tensor, filters, kernel_size=3, downsample=False, dilated=False, asym=False, dilation_rate=(1, 1)):
    stride = 2 if downsample else 1
    
    # Initial 1x1 Convolution
    x = layers.Conv2D(filters // 4, (1, 1), strides=stride, padding='same')(input_tensor)
    x = layers.BatchNormalization()(x)
    x = layers.PReLU()(x)
    
    # Main Convolution Path
    if dilated:
        x = layers.Conv2D(filters // 4, (kernel_size, kernel_size), padding='same', dilation_rate=dilation_rate)(x)
    elif asym:
        x = layers.Conv2D(filters // 4, (kernel_size, 1), padding='same')(x)
        x = layers.Conv2D(filters // 4, (1, kernel_size), padding='same')(x)
    else:
        x = layers.Conv2D(filters // 4, (kernel_size, kernel_size), padding='same')(x)
    
    x = layers.BatchNormalization()(x)
    x = layers.PReLU()(x)
    
    # Final 1x1 Convolution
    x = layers.Conv2D(filters, (1, 1), padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.SpatialDropout2D(0.01 if filters < 128 else 0.1)(x)
    
    # Adjust input_tensor if downsampling
    if downsample:
        input_tensor = layers.Conv2D(filters, (1, 1), strides=stride, padding='same')(input_tensor)
    
    # Align dimensions for addition
    if x.shape[1:] != input_tensor.shape[1:]:
        input_tensor = layers.Conv2D(filters, (1, 1), padding='same')(input_tensor)
    
    # Skip connection
    x = layers.add([x, input_tensor])
    x = layers.PReLU()(x)
    return x

def build_enet(input_shape, num_classes):
    input_tensor = layers.Input(shape=input_shape)
    
    x = initial_block(input_tensor)
    
    x = bottleneck_block(x, 64, downsample=True)
    for _ in range(4):
        x = bottleneck_block(x, 64)
    
    x = bottleneck_block(x, 128, downsample=True)
    for _ in range(2):
        x = bottleneck_block(x, 128)
        x = bottleneck_block(x, 128, dilated=True, dilation_rate=(2**_, 2**_))
        x = bottleneck_block(x, 128, asym=True)
    
    x = bottleneck_block(x, 64, downsample=False)
    for _ in range(2):
        x = bottleneck_block(x, 64)
    
    x = bottleneck_block(x, 16, downsample=False)
    x = bottleneck_block(x, 16)
    
    x = layers.Conv2D(num_classes, (1, 1), padding='same')(x)
    x = layers.Conv2DTranspose(num_classes, kernel_size=(4, 4), strides=(4, 4), padding='same')(x)
    x = layers.Conv2DTranspose(num_classes, kernel_size=(4, 4), strides=(2, 2), padding='same')(x)
    x = layers.Activation('softmax')(x)
    
    model = models.Model(inputs=input_tensor, outputs=x)
    return model

input_shape = (256, 512, 3)
num_classes = 34  # number of classes

model = build_enet(input_shape, num_classes)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


In [None]:
tf.keras.backend.clear_session()
# Define callbacks
checkpoint_filepath = 'enet_best_model.weights.h5'
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Train the model and measure the time taken
start_time = time.time()
history = model.fit(dataset, epochs=10, validation_data=val_dataset, callbacks=[model_checkpoint_callback])
end_time = time.time()

# Print the time taken to train the model
print(f"Time taken to train the model: {end_time - start_time:.2f} seconds")

Epoch 1/10


I0000 00:00:1722451803.346010    4119 service.cc:146] XLA service 0x7f83ae1c39e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722451803.346077    4119 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1050 Ti, Compute Capability 6.1
2024-07-31 20:51:11.330101: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
W0000 00:00:1722451876.416774    4119 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2024-07-31 20:51:22.227362: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
I0000 00:00:1722451943.790918    4119 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1487/1488[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 160ms/step - accuracy: 0.6338 - loss: 1.2812

W0000 00:00:1722452198.622134    4118 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step - accuracy: 0.6338 - loss: 1.2810

W0000 00:00:1722452321.874400    4118 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 286ms/step - accuracy: 0.6339 - loss: 1.2809 - val_accuracy: 0.7641 - val_loss: 0.8492
Epoch 2/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 172ms/step - accuracy: 0.7813 - loss: 0.8038 - val_accuracy: 0.7690 - val_loss: 0.8219
Epoch 3/10
[1m 257/1488[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m8:28[0m 413ms/step - accuracy: 0.8066 - loss: 0.7105

In [5]:
test_dataset = test_dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(1).prefetch(tf.data.experimental.AUTOTUNE)

tf.keras.backend.clear_session()
model.load_weights('enet_best_model.weights.h5')

dummy_batch = next(iter(test_dataset))
dummy_image = dummy_batch[0]
model.predict(dummy_image, verbose=0)  # Warm-up the model

total_time=0

for i in range(5):
    start_time = time.time()

    for batch in test_dataset:
        image = batch[0]
        predictions = model.predict(image, verbose=0)

    end_time = time.time()
    print("run", i+1)
    print(f"Inference Time: {end_time - start_time:.2f} seconds")
    print(f"Number of Images: {len(test_dataset)}")
    print(f"Average Inference Time per Image: {(end_time - start_time)/len(test_dataset)*1000:.0f}ms")
    print("----------------")
    total_time += end_time - start_time

print(f"Average Inference Time: {(end_time - start_time)/5:.2f} seconds")
print(f"Number of Images: {len(test_dataset)}")
print(f"Average Inference Time per Image: {total_time/5/len(test_dataset)*1000:.0f}ms")

run 1
Inference Time: 230.37 seconds
Number of Images: 1525
Average Inference Time per Image: 151ms
----------------
run 2
Inference Time: 243.78 seconds
Number of Images: 1525
Average Inference Time per Image: 160ms
----------------


2024-08-01 00:06:06.342157: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


run 3
Inference Time: 286.64 seconds
Number of Images: 1525
Average Inference Time per Image: 188ms
----------------
run 4
Inference Time: 234.59 seconds
Number of Images: 1525
Average Inference Time per Image: 154ms
----------------
run 5
Inference Time: 242.33 seconds
Number of Images: 1525
Average Inference Time per Image: 159ms
----------------
Average Inference Time: 48.47 seconds
Number of Images: 1525
Average Inference Time per Image: 162ms
