In [1]:
import os
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import time
import tensorflow_datasets as tfds
import numpy as np

# Load the Cityscapes dataset
dataset, info = tfds.load('cityscapes', split='train', with_info=True)
val_dataset = tfds.load('cityscapes', split='validation')
test_dataset = tfds.load('cityscapes', split='test')

# Define preprocessing functions

def preprocess_image(features):
    image = features['image_left']
    label = features['segmentation_label']
    image = tf.image.resize(image, (256, 512))
    label = tf.image.resize(label, (256, 512), method='nearest')
    image = tf.cast(image, tf.float32) / 255.0
    label = tf.cast(label, tf.int32)
    return image, label
    


# Preprocess the dataset
dataset = dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(2).prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.batch(2).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset = test_dataset.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(1).prefetch(tf.data.experimental.AUTOTUNE)

2024-07-31 23:04:37.371866: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 23:04:37.940456: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 23:04:38.047982: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-31 23:04:38.830516: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1722459897.276232    2208 cuda_executor.c

In [2]:
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, ReLU, GlobalAveragePooling2D, Dense, Add, UpSampling2D, concatenate
from tensorflow.keras.models import Model

num_classes = 34  # Adjusted to match your model's output


# Define the model architecture
def SpatialPath(input_tensor):
    x = Conv2D(64, (7, 7), strides=2, padding='same')(input_tensor)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Conv2D(128, (3, 3), strides=2, padding='same')(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Conv2D(256, (3, 3), strides=2, padding='same')(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    return x

def AttentionRefinementModule(input_tensor):
    x = GlobalAveragePooling2D()(input_tensor)
    x = Dense(input_tensor.shape[-1], activation='sigmoid')(x)
    x = tf.keras.layers.Multiply()([input_tensor, x[:, tf.newaxis, tf.newaxis, :]])
    return x

def ContextPath(input_tensor):
    base_model = tf.keras.applications.Xception(include_top=False, weights='imagenet', input_tensor=input_tensor)
    feature_13 = base_model.get_layer('block13_sepconv2_bn').output
    feature_14 = base_model.get_layer('block14_sepconv2_act').output
    feature_13_arm = AttentionRefinementModule(feature_13)
    feature_14_arm = AttentionRefinementModule(feature_14)
    global_context = GlobalAveragePooling2D()(feature_14_arm)
    global_context = tf.keras.layers.Reshape((1, 1, -1))(global_context)
    global_context = tf.keras.layers.UpSampling2D(size=(tf.keras.backend.int_shape(feature_14_arm)[1], tf.keras.backend.int_shape(feature_14_arm)[2]), interpolation='nearest')(global_context)
    feature_14_arm = tf.keras.layers.Add()([feature_14_arm, global_context])
    feature_13_arm = UpSampling2D(size=(2, 2), interpolation='bilinear')(feature_13_arm)
    feature_14_arm = UpSampling2D(size=(4, 4), interpolation='bilinear')(feature_14_arm)
    return feature_13_arm, feature_14_arm

def FeatureFusionModule(spatial_out, context_out):
    concatenated = concatenate([spatial_out, context_out])
    x = Conv2D(256, (3, 3), padding='same')(concatenated)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    pooled = GlobalAveragePooling2D()(x)
    pooled = Dense(256, activation='relu')(pooled)
    pooled = Dense(256, activation='sigmoid')(pooled)
    pooled = tf.keras.layers.Reshape((1, 1, 256))(pooled)
    x = tf.keras.layers.Multiply()([x, pooled])
    return x

# Input layer
input_tensor = Input(shape=(256, 512, 3))

# Spatial Path
spatial_out = SpatialPath(input_tensor)

# Context Path
context_out_13, context_out_14 = ContextPath(input_tensor)

# Feature Fusion Module
fused_out = FeatureFusionModule(spatial_out, context_out_14)

# Upsample the final output
fused_out = UpSampling2D(size=(8, 8), interpolation='bilinear')(fused_out)

# Final classifier
output_tensor = Conv2D(34, (1, 1), activation='softmax')(fused_out)

# Create the model
model = Model(input_tensor, output_tensor)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

In [3]:
tf.keras.backend.clear_session()
# Define callbacks
checkpoint_filepath = 'bisenet_best_model.weights.h5'
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Train the model and measure the time taken
start_time = time.time()
history = model.fit(dataset, epochs=10, validation_data=val_dataset, callbacks=[model_checkpoint_callback])
end_time = time.time()

# Print the time taken to train the model
print(f"Time taken to train the model: {end_time - start_time:.2f} seconds")

Epoch 1/10


I0000 00:00:1722366158.593006  100400 service.cc:146] XLA service 0x7f04a8003760 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722366161.076444  100400 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1050 Ti, Compute Capability 6.1
2024-07-30 21:04:28.765330: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
W0000 00:00:1722366273.618073  100400 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2024-07-30 21:04:37.308221: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
2024-07-30 21:04:51.311886: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.45GiB with freed_by_count=0. The caller indicates that this

[1m1487/1488[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 379ms/step - accuracy: 0.7456 - loss: 0.9259

W0000 00:00:1722366885.358959  100397 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2024-07-30 21:14:56.322634: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.43GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 404ms/step - accuracy: 0.7456 - loss: 0.9258

W0000 00:00:1722367012.034015  100398 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m961s[0m 508ms/step - accuracy: 0.7457 - loss: 0.9257 - val_accuracy: 0.7389 - val_loss: 0.9220
Epoch 2/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m704s[0m 411ms/step - accuracy: 0.8309 - loss: 0.5705 - val_accuracy: 0.8016 - val_loss: 0.6688
Epoch 3/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m649s[0m 433ms/step - accuracy: 0.8501 - loss: 0.4857 - val_accuracy: 0.8137 - val_loss: 0.6001
Epoch 4/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m648s[0m 431ms/step - accuracy: 0.8637 - loss: 0.4309 - val_accuracy: 0.8031 - val_loss: 0.6493
Epoch 5/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m649s[0m 432ms/step - accuracy: 0.8766 - loss: 0.3876 - val_accuracy: 0.8065 - val_loss: 0.6695
Epoch 6/10
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m671s[0m 447ms/step - accuracy: 0.8832 - loss: 0.3618 - val_accuracy: 0.8358 - val_loss: 0.5366
Epo

In [3]:
tf.keras.backend.clear_session()
model.load_weights('bisenet_best_model.weights.h5')

dummy_batch = next(iter(test_dataset))
dummy_image = dummy_batch[0]
model.predict(dummy_image, verbose=0)  # Warm-up the model

total_time=0

for i in range(5):
    start_time = time.time()

    for batch in test_dataset:
        image = batch[0]
        predictions = model.predict(image, verbose=0)

    end_time = time.time()
    print("run", i+1)
    print(f"Inference Time: {end_time - start_time:.2f} seconds")
    print(f"Number of Images: {len(test_dataset)}")
    print(f"Average Inference Time per Image: {(end_time - start_time)/len(test_dataset)*1000:.0f}ms")
    print("----------------")
    total_time += end_time - start_time

print(f"Average Inference Time: {(end_time - start_time)/5:.2f} seconds")
print(f"Number of Images: {len(test_dataset)}")
print(f"Average Inference Time per Image: {total_time/5/len(test_dataset)*1000:.0f}ms")

  saveable.load_own_variables(weights_store.get(inner_path))
I0000 00:00:1722460104.187568    2388 service.cc:146] XLA service 0x7facac001e00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722460104.190271    2388 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1050 Ti, Compute Capability 6.1
2024-07-31 23:08:24.689046: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-31 23:08:26.577830: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
I0000 00:00:1722460116.212194    2388 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2024-07-31 23:12:02.163996: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


run 1
Inference Time: 205.89 seconds
Number of Images: 1525
Average Inference Time per Image: 135ms
----------------


2024-07-31 23:16:33.803920: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


run 2
Inference Time: 271.62 seconds
Number of Images: 1525
Average Inference Time per Image: 178ms
----------------
run 3
Inference Time: 276.89 seconds
Number of Images: 1525
Average Inference Time per Image: 182ms
----------------


2024-07-31 23:25:34.585213: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


run 4
Inference Time: 263.88 seconds
Number of Images: 1525
Average Inference Time per Image: 173ms
----------------
run 5
Inference Time: 299.17 seconds
Number of Images: 1525
Average Inference Time per Image: 196ms
----------------
Average Inference Time: 59.83 seconds
Number of Images: 1525
Average Inference Time per Image: 173ms
