# Object Detection with Street View House Numbers

### Loading Libraries

In [26]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
from PIL import Image
import PIL.Image as Image
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Warnings
import warnings

# OS & Path
import os
import sys
import tarfile
from pathlib import Path

# Scikit-Learn
from sklearn.metrics import confusion_matrix

# TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import (Flatten,
                                     Dense,
                                     BatchNormalization,
                                     Activation,
                                     Concatenate)



In [28]:
# Detect GPUs and let TensorFlow allocate GPU memory on demand instead of
# reserving it all up front.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')

if gpu_devices:
    print('Using GPU')
    # Memory growth has to be configured identically on every visible GPU,
    # so apply it to all of them rather than only the first device.
    for gpu_device in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)
else:
    print('Using CPU')

Using GPU


In [30]:
%matplotlib inline

In [32]:
# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Use seaborn's white grid theme for every figure.
sns.set_style('whitegrid')

In [34]:
# Directory (relative to the working directory) holding the SVHN .npy arrays.
data_path = Path('images') / 'svhn'

In [36]:
# Output directory for model checkpoints and figures.
results_path = Path('results', 'svhn')

# `parents=True` also creates the missing top-level `results` directory, and
# `exist_ok=True` makes the cell safe to re-run.
results_path.mkdir(parents=True, exist_ok=True)

#### Settings

In [39]:
# Side length (pixels) of the square input images.
IMG_SIZE = 32

# Model input shape: height x width x 3 channels.
IMG_SHAPE = (IMG_SIZE,) * 2 + (3,)

# Width of the `n_digits` output head.
SEQ_LENGTH = 4

# Number of classes per digit head (the first digit head uses N_CLASSES - 1).
N_CLASSES = 11

### Loading Data

In [44]:
# X_train = np.load(data_path / 'X_train.npy')

# y_train = np.load(data_path / 'y_train.npy')

In [48]:
# X_train = X_train[y_train[:, 0] < 5]

# y_train = y_train[y_train[:, 0] < 5, :5]
# y_train[:, 0] -= 1

In [50]:
# X_test = np.load(data_path / 'X_test.npy')

# y_test = np.load(data_path / 'y_test.npy')

In [52]:
# X_test = X_test[y_test[:, 0] < 5]

# y_test = y_test[y_test[:, 0] < 5, :5]
# y_test[:, 0] -= 1

In [54]:
# pd.DataFrame(y_train).nunique()

In [56]:
# y_train.shape

### Best Architecture

In [59]:
# Column range [start, end) of each digit head inside the concatenated model
# output; columns 0-3 belong to the `n_digits` head.
digit_pos = dict(zip(range(1, 5),
                     ([4, 14], [14, 25], [25, 36], [36, 47])))

In [61]:
def weighted_accuracy(y_true, y_pred):
    """Mean fraction of the five outputs predicted correctly per sample.

    `y_pred` is the concatenated model output: the first SEQ_LENGTH columns
    are the `n_digits` head and the ranges in `digit_pos` are the four digit
    heads. The argmax of each head is compared with the matching column of
    `y_true` (column 0, then columns 1-4).
    """
    # Predicted class of every head, in the column order of `y_true`.
    head_classes = [K.argmax(y_pred[:, :SEQ_LENGTH], axis=1)]
    for position in (1, 2, 3, 4):
        first, last = digit_pos[position]
        head_classes.append(K.argmax(y_pred[:, first:last], axis=1))

    preds = tf.dtypes.cast(tf.stack(head_classes, axis=1), tf.float32)

    # 1 where label and prediction agree, 0 otherwise.
    hits = tf.dtypes.cast(K.equal(y_true, preds), tf.int64)
    return K.mean(K.sum(hits, axis=1) / 5)

In [63]:
def weighted_entropy(y_true, y_pred):
    """Average sparse categorical cross-entropy over the five output heads.

    Column 0 of `y_true` is scored against the first SEQ_LENGTH columns of
    `y_pred`; columns 1-4 are scored against the slices listed in `digit_pos`.
    """
    cce = tf.keras.losses.SparseCategoricalCrossentropy()

    # Per-head probability slices, ordered like the columns of `y_true`.
    head_probs = [y_pred[:, :SEQ_LENGTH]]
    head_probs += [y_pred[:, digit_pos[position][0]:digit_pos[position][1]]
                   for position in (1, 2, 3, 4)]

    # Accumulate in head order, then average.
    total = cce(y_true[:, 0], head_probs[0])
    for column in range(1, 5):
        total = total + cce(y_true[:, column], head_probs[column])
    return total / 5

In [65]:
# ImageNet-pretrained VGG16 convolutional base without its classifier top,
# frozen for the first training phase.
vgg16 = VGG16(weights='imagenet', include_top=False, input_shape=IMG_SHAPE)

vgg16.trainable = False

# Trainable head: flatten + batch norm, then two Dense/BatchNorm/ReLU stages.
x = Flatten()(vgg16.output)
x = BatchNormalization()(x)
for units in (256, 128):
    x = Dense(units)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

# Five softmax heads sharing the same features. `d1` has one class fewer
# than `d2`-`d4`.
n_digits = Dense(SEQ_LENGTH, activation='softmax', name='n_digits')(x)
digit1 = Dense(N_CLASSES - 1, activation='softmax', name='d1')(x)
digit2, digit3, digit4 = (
    Dense(N_CLASSES, activation='softmax', name=f'd{position}')(x)
    for position in (2, 3, 4)
)

# One output tensor: 4 + 10 + 3 * 11 = 47 columns, matching `digit_pos`.
predictions = Concatenate()([n_digits, digit1, digit2, digit3, digit4])

model = Model(inputs=vgg16.input, outputs=predictions)

2025-05-03 09:18:36.568258: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Max
2025-05-03 09:18:36.568414: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2025-05-03 09:18:36.568418: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2025-05-03 09:18:36.568434: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-03 09:18:36.568450: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [67]:
# Train with the custom multi-head loss and report the custom accuracy.
model.compile(
    loss=weighted_entropy,
    metrics=[weighted_accuracy],
    optimizer='adam',
)

### Defining Callbacks

In [70]:
# File that receives the best model seen so far.
svhn_path = (results_path / 'svhn.weights.best.hdf5').as_posix()

# Save only when validation accuracy improves. `mode='max'` states the
# direction explicitly instead of relying on Keras inferring it from the
# name of a custom metric.
checkpointer = ModelCheckpoint(filepath=svhn_path,
                               verbose=1,
                               monitor='val_weighted_accuracy',
                               mode='max',
                               save_best_only=True)

In [76]:
# Stop once validation accuracy has not improved for 5 consecutive epochs.
# `weighted_accuracy` is a custom metric, so state that higher is better
# rather than relying on automatic direction inference.
early_stopping = EarlyStopping(monitor='val_weighted_accuracy',
                               mode='max',
                               patience=5)

### Training Transfer Model

In [79]:
# Upper bound on epochs for the first (frozen-base) training phase; early
# stopping may end training sooner.
epochs = 50

In [83]:
# Phase 1: train only the new head, holding out 10% of the training data
# for validation.
result = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=epochs,
    verbose=1,
    validation_split=.1,
    callbacks=[checkpointer, early_stopping],
)

#### Evaluating Results

In [94]:
# One row per completed epoch of the first training phase.
metrics = pd.DataFrame(result.history)

# Number of epochs actually run (early stopping may have cut training short).
initial_epochs = metrics.shape[0]

In [96]:
# Concatenated head probabilities for every test image.
y_pred = model.predict(x=X_test, verbose=1)

In [98]:
# Split the concatenated prediction back into its heads.
n_digits = y_pred[:, :SEQ_LENGTH]

digits = {digit: y_pred[:, start:end]
          for digit, (start, end) in digit_pos.items()}

In [100]:
# Share of test samples whose `n_digits` head matches label column 0.
np.mean(y_test[:, 0] == np.argmax(n_digits, axis=1))

In [102]:
# Confusion matrix of label column 0 against the `n_digits` head.
confusion_matrix(y_test[:, 0], n_digits.argmax(axis=1))

In [115]:
# Per-sample, per-output correctness flags (1 = correct), shaped like y_test.
accuracy = np.zeros_like(y_test)

# Column 0 is scored by the `n_digits` head, columns 1-4 by the digit heads.
head_probs = {0: n_digits, **{i: digits[i] for i in range(1, 5)}}
for column, probs in head_probs.items():
    accuracy[:, column] = y_test[:, column] == np.argmax(probs, axis=1)

In [117]:
# Accuracy of each output column taken on its own.
acc_by_output = {column: accuracy[:, column].sum() / len(accuracy)
                 for column in range(5)}

In [119]:
# Share of samples for which outputs 0..k are all correct, keyed by k.
acc_up_to_out = {last: accuracy[:, :last + 1].all(axis=1).mean()
                 for last in range(5)}

In [121]:
# Bars per output index: 'acc1' = per-output accuracy, 'acc2' = accuracy of
# all outputs up to and including that index.
acc_summary = pd.DataFrame({'acc1': acc_by_output, 'acc2': acc_up_to_out})
acc_summary.plot.bar()
sns.despine()
plt.show()

### Fine Tuning VGG16 Weights

In [124]:
# Index of the first VGG16 layer to fine-tune; earlier layers stay frozen.
start_fine_tuning_at = 1

# Unfreeze the base, then re-freeze everything before the cut-off.
vgg16.trainable = True

for frozen_layer in vgg16.layers[:start_fine_tuning_at]:
    frozen_layer.trainable = False

In [126]:
# Compile again, with the same loss and metric, after changing which layers
# are trainable.
model.compile(
    loss=weighted_entropy,
    metrics=[weighted_accuracy],
    optimizer='adam',
)

In [128]:
# Upper bound on additional epochs for the fine-tuning phase.
fine_tune_epochs = 50

# Final epoch index passed to `fit`: phase-1 epochs plus the fine-tuning budget.
total_epochs = fine_tune_epochs + initial_epochs

In [130]:
# Phase 2: continue training with the VGG16 base unfrozen.
# The first phase ran epochs 0..initial_epochs-1, so resume at
# `initial_epochs`; up to `fine_tune_epochs` more epochs then run before
# `total_epochs` is reached (fewer if early stopping triggers).
# The previous `history.epoch[-1]` referenced an undefined name: the first
# fit's History object is stored in `result`.
result_fine_tune = model.fit(x=X_train,
                              y=y_train,
                              validation_split=.1,
                              batch_size=32,
                              epochs=total_epochs,
                              initial_epoch=initial_epochs,
                              callbacks=[early_stopping])

In [132]:
# Stack the fine-tuning history under the phase-1 history with a continuous
# epoch index. `DataFrame.append` was removed in pandas 2.0, so use
# `pd.concat`, which works on old and new pandas alike.
metrics_tuned = pd.concat([metrics, pd.DataFrame(result_fine_tune.history)],
                          ignore_index=True)

In [134]:
# Summarize the combined training history (columns, dtypes, non-null counts).
metrics_tuned.info()

In [136]:
# Training curves for both phases: accuracy on the left, loss on the right.
fig, axes = plt.subplots(ncols=2, figsize=(15, 4))
metrics_tuned[['loss', 'val_loss']].plot(ax=axes[1], title='Cross-Entropy Loss')
metrics_tuned[['weighted_accuracy', 'val_weighted_accuracy']].plot(ax=axes[0], title=f'Accuracy (Best: {metrics_tuned.val_weighted_accuracy.max():.2%})')
axes[0].yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
axes[0].set_ylabel('Accuracy')
axes[1].set_ylabel('Loss')
for ax in axes:
    # Rows from index `initial_epochs` onward come from fine-tuning. Mark that
    # boundary from the data rather than a hard-coded epoch, since early
    # stopping makes the length of the first phase vary between runs.
    ax.axvline(initial_epochs, ls='--', lw=1, c='k')
    ax.legend(['Training', 'Validation', 'Start Fine Tuning'])
    ax.set_xlabel('Epoch')
fig.tight_layout()
fig.savefig(results_path / 'transfer_learning_svhn')
# `plt.show` without parentheses only referenced the function; call it.
plt.show()