# Module 3 — Program B: CNN Variants for Simple Object Detection (Synthetic MNIST Boxes)

**Aim:** Build **two CNN variants** for multi‑task learning (digit classification + bounding‑box regression) and compare performance.  
**Covers:** Deeper vs regular CNN backbones, padding/stride impact on localization, multi‑head loss (CE + SmoothL1/MSE), evaluation via **accuracy** and **IoU**.

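> Dataset: **Synthetic MNIST‑on‑Canvas** — each 64×64 image contains one MNIST digit (resized to 20×20 by default) placed at a random location, with a ground‑truth bounding box stored as normalized `[cx, cy, w, h]` (box centre and size as fractions of the canvas).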


In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Load the MNIST train/test splits through Keras: x_* hold the digit images,
# y_* the matching digit labels used as classification targets below.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

def make_canvas_set(x, y, n=12000, img_size=64, digit_size=20, seed=42):
    """Build a synthetic single-digit detection set from digit images.

    Every sample is a blank ``img_size`` x ``img_size`` canvas onto which one
    randomly chosen digit, resized to ``digit_size`` x ``digit_size``, is
    pasted at a random position that keeps it fully inside the canvas.

    Args:
        x: Array of source digit images indexed along axis 0. Pixel values
            are divided by 255 after resizing.
        y: Class labels aligned with ``x`` along axis 0.
        n: Number of canvases to generate.
        img_size: Side length of the square canvas, in pixels.
        digit_size: Side length the digit is resized to before pasting.
        seed: Seed of the NumPy random generator; the same seed reproduces
            the same set.

    Returns:
        Tuple ``(X, cls, bbox)`` where ``X`` is float32 with shape
        ``(n, img_size, img_size, 1)``, ``cls`` is int32 with shape ``(n,)``
        and ``bbox`` is float32 with shape ``(n, 4)`` holding
        ``[cx, cy, w, h]``: the centre and size of the pasted patch expressed
        as fractions of the canvas side.

    Raises:
        ValueError: If ``digit_size`` is larger than ``img_size``.
    """
    if digit_size > img_size:
        raise ValueError(
            f'digit_size ({digit_size}) must not exceed img_size ({img_size})'
        )

    rng = np.random.default_rng(seed)
    X = np.zeros((n, img_size, img_size), dtype='float32')
    cls = np.zeros((n,), dtype='int32')
    bbox = np.zeros((n, 4), dtype='float32')  # cx, cy, w, h normalized to [0,1]

    # Draw all random choices first. The per-sample order (source index, top,
    # left) is kept so a given seed consumes the random stream the same way
    # as drawing them right before each paste.
    src = np.empty(n, dtype=np.int64)
    tops = np.empty(n, dtype=np.int64)
    lefts = np.empty(n, dtype=np.int64)
    max_offset = img_size - digit_size + 1
    for i in range(n):
        src[i] = rng.integers(0, x.shape[0])
        tops[i] = rng.integers(0, max_offset)
        lefts[i] = rng.integers(0, max_offset)

    # Resize in batches: one TensorFlow call per chunk instead of one per
    # sample. Chunking bounds the size of the temporary float32 copies.
    chunk = 4096
    for start in range(0, n, chunk):
        stop = min(start + chunk, n)
        batch = x[src[start:stop]][..., None].astype('float32')
        digits = tf.image.resize(batch, (digit_size, digit_size)).numpy()[..., 0] / 255.0
        for offset, digit in enumerate(digits):
            i = start + offset
            top, left = tops[i], lefts[i]
            X[i, top:top + digit_size, left:left + digit_size] = digit

    cls[:] = np.asarray(y)[src]

    # The box is the pasted patch itself: centre = corner + half the side.
    half = digit_size / 2
    bbox[:, 0] = (lefts + half) / img_size
    bbox[:, 1] = (tops + half) / img_size
    bbox[:, 2:] = digit_size / img_size

    X = X[..., None]  # add the trailing channel axis
    return X, cls, bbox

# Build the synthetic train and test sets (fast demo sizes; increase for stronger results).
# No separate validation set is generated: validation data is split off the
# training set at fit time via validation_split.
Xtr, ctr, btr = make_canvas_set(x_train, y_train, n=12000)
Xte, cte, bte = make_canvas_set(x_test, y_test, n=2000, seed=7)

# Report the shapes of images, labels and boxes for each split
for split_label, arrays in (('Train:', (Xtr, ctr, btr)), ('Test:', (Xte, cte, bte))):
    print(split_label, *(arr.shape for arr in arrays))

# Quick visualization: the first three training canvases side by side
fig, ax = plt.subplots(1, 3, figsize=(9, 3))
for axis, canvas in zip(ax, Xtr[:3]):
    axis.imshow(canvas[..., 0], cmap='gray')
    axis.axis('off')
plt.show()

## 1) Model Variants: Baseline vs Deeper (with BN/Dropout)

In [None]:
from tensorflow.keras import layers, models

def head_multitask(x):
    """Attach the classification and bounding-box heads to shared features.

    Args:
        x: Feature tensor produced by a backbone; both heads read it directly.

    Returns:
        Tuple ``(cls, bbox)``: the output of a 10-unit softmax layer named
        ``'cls'`` and the output of a 4-unit sigmoid layer named ``'bbox'``
        (cx, cy, w, h squashed into [0,1]).
    """
    class_head = layers.Dense(units=10, activation='softmax', name='cls')
    box_head = layers.Dense(units=4, activation='sigmoid', name='bbox')
    # The layer names double as the keys used for losses, targets and metrics.
    return class_head(x), box_head(x)

def backbone_baseline(input_shape=(64,64,1)):
    """Build the baseline multi-task CNN.

    Two conv(3x3, relu, same padding) + max-pool stages with 16 and 32
    filters, a final 64-filter conv, global average pooling, then the two
    task heads from ``head_multitask``.

    Args:
        input_shape: Shape of one input image, without the batch axis.

    Returns:
        A Keras model named ``'baseline'`` whose outputs are ``[cls, bbox]``.
    """
    inp = layers.Input(input_shape)
    feat = inp
    # Pooled stages: each halves the spatial resolution after the conv.
    for filters in (16, 32):
        feat = layers.Conv2D(filters, 3, padding='same', activation='relu')(feat)
        feat = layers.MaxPool2D()(feat)
    feat = layers.Conv2D(64, 3, padding='same', activation='relu')(feat)
    feat = layers.GlobalAveragePooling2D()(feat)
    cls_out, bbox_out = head_multitask(feat)
    return models.Model(inp, [cls_out, bbox_out], name='baseline')

def backbone_deeper(input_shape=(64,64,1)):
    """Build the deeper multi-task CNN with batch norm and dropout.

    Two stages (32 then 64 filters), each made of a bias-free conv followed
    by batch normalization and relu, a second relu conv, and max pooling;
    then a 128-filter conv, global average pooling, dropout (rate 0.3) and
    the two task heads from ``head_multitask``.

    Args:
        input_shape: Shape of one input image, without the batch axis.

    Returns:
        A Keras model named ``'deeper_bn_do'`` whose outputs are
        ``[cls, bbox]``.
    """
    def conv_bn_relu(tensor, filters):
        # Bias is dropped because batch normalization follows the conv.
        tensor = layers.Conv2D(filters, 3, padding='same', use_bias=False)(tensor)
        tensor = layers.BatchNormalization()(tensor)
        return layers.Activation('relu')(tensor)

    def conv_relu(tensor, filters):
        return layers.Conv2D(filters, 3, padding='same', activation='relu')(tensor)

    inp = layers.Input(input_shape)
    feat = inp
    for filters in (32, 64):
        feat = conv_bn_relu(feat, filters)
        feat = conv_relu(feat, filters)
        feat = layers.MaxPool2D()(feat)
    feat = conv_relu(feat, 128)
    feat = layers.GlobalAveragePooling2D()(feat)
    feat = layers.Dropout(0.3)(feat)
    cls_out, bbox_out = head_multitask(feat)
    return models.Model(inp, [cls_out, bbox_out], name='deeper_bn_do')

def compile_multitask(model, lr=1e-3, bbox_loss='mse', lw_bbox=5.0):
    """Compile a two-output model for joint classification and box regression.

    Args:
        model: Model with outputs named ``'cls'`` and ``'bbox'``.
        lr: Learning rate passed to the Adam optimizer.
        bbox_loss: Loss used for the ``'bbox'`` output.
        lw_bbox: Weight of the bbox loss in the total loss; the
            classification loss has weight 1.0.

    Returns:
        The same ``model`` object, compiled in place.
    """
    per_output_loss = {
        'cls': 'sparse_categorical_crossentropy',
        'bbox': bbox_loss,
    }
    per_output_weight = {'cls': 1.0, 'bbox': lw_bbox}
    adam = tf.keras.optimizers.Adam(lr)
    model.compile(
        optimizer=adam,
        loss=per_output_loss,
        loss_weights=per_output_weight,
        metrics={'cls': 'accuracy'},  # accuracy is tracked for the class head only
    )
    return model

# Instantiate and compile both variants (baseline first, then deeper),
# then print their layer summaries in the same order.
m1 = compile_multitask(backbone_baseline())
m2 = compile_multitask(backbone_deeper())
for compiled in (m1, m2):
    compiled.summary()

## 2) Training & Evaluation (Accuracy + IoU)

In [None]:
def iou_boxes(b1, b2):
    """Compute the element-wise IoU of boxes given as ``[cx, cy, w, h]``.

    Both inputs are converted to corner form ``(x1, y1, x2, y2)`` and the
    intersection-over-union is computed pairwise along the leading axes.
    Boxes may be in any consistent unit (normalized or pixels); widths,
    heights and overlaps are only clamped from below at zero.

    Args:
        b1: Array-like whose last axis holds ``[cx, cy, w, h]``.
        b2: Array-like with the same layout, broadcastable against ``b1``.

    Returns:
        IoU values with the broadcast shape of the inputs minus the last
        axis. A small epsilon is added to the union, so two empty boxes
        yield 0 instead of a division by zero.
    """
    def to_xyxy(b):
        # np.asarray lets plain lists/tuples be indexed with `...`.
        b = np.asarray(b)
        cx, cy = b[..., 0], b[..., 1]
        half_w, half_h = b[..., 2] / 2, b[..., 3] / 2
        return cx - half_w, cy - half_h, cx + half_w, cy + half_h

    x1a, y1a, x2a, y2a = to_xyxy(b1)
    x1b, y1b, x2b, y2b = to_xyxy(b2)

    # Intersection rectangle: innermost of the two boxes on every side.
    xi1, yi1 = np.maximum(x1a, x1b), np.maximum(y1a, y1b)
    xi2, yi2 = np.minimum(x2a, x2b), np.minimum(y2a, y2b)

    # Negative extents mean "no overlap" (or a degenerate box) -> area 0.
    # No upper clamp: capping at 1 would corrupt boxes not in [0,1] units.
    inter = np.maximum(xi2 - xi1, 0) * np.maximum(yi2 - yi1, 0)
    area_a = np.maximum(x2a - x1a, 0) * np.maximum(y2a - y1a, 0)
    area_b = np.maximum(x2b - x1b, 0) * np.maximum(y2b - y1b, 0)

    union = area_a + area_b - inter + 1e-9  # epsilon guards against 0/0
    return inter / union

EPOCHS = 6
BATCH = 128

# Train each variant on the same data; the last 10% of the training arrays
# is held out for validation by Keras (validation_split).
hist = {}
for name, model in (('baseline', m1), ('deeper', m2)):
    targets = {'cls': ctr, 'bbox': btr}
    hist[name] = model.fit(
        Xtr,
        targets,
        validation_split=0.1,
        epochs=EPOCHS,
        batch_size=BATCH,
        verbose=1,
    )

# Evaluate on the synthetic test set: classification accuracy and mean IoU
results = {}
for name, model in (('baseline', m1), ('deeper', m2)):
    cls_prob, bbox_pred = model.predict(Xte, batch_size=256, verbose=0)
    cls_pred = cls_prob.argmax(axis=1)
    results[name] = {
        'acc': float(np.mean(cls_pred == cte)),
        'miou': float(np.mean(iou_boxes(bbox_pred, bte))),
    }

# Round only for display; `results` keeps the full-precision values
rounded = {}
for name, scores in results.items():
    rounded[name] = {metric: round(value, 4) for metric, value in scores.items()}
print('Results (approx):', rounded)

In [None]:
# Validation classification accuracy per epoch, one curve per variant
fig, ax = plt.subplots()
for name, h in hist.items():
    val_curve = h.history['val_cls_accuracy']
    ax.plot(val_curve, label=f'{name} cls')
ax.set_xlabel('Epoch')
ax.set_ylabel('Val Acc')
ax.set_title('Classification Accuracy (val)')
ax.legend()
plt.show()

# Grouped bar chart of the final test metrics
names = list(results)
accs = [results[key]['acc'] for key in names]
mious = [results[key]['miou'] for key in names]

positions = np.arange(len(names))
fig, ax = plt.subplots()
# Two bars per variant, offset left/right of the tick position
for shift, values, label in ((-0.15, accs, 'Acc'), (0.15, mious, 'mIoU')):
    ax.bar(positions + shift, values, width=0.3, label=label)
ax.set_xticks(positions)
ax.set_xticklabels(names)
ax.set_title('Variant Comparison: Accuracy vs mIoU (test)')
ax.legend()
plt.show()

### Result & Inference (to be written)
- Compare **baseline vs deeper**: which hits higher **accuracy** and **IoU** and why?
- Discuss how **stride/pooling** may affect localization quality.
- Suggest one improvement (e.g., focal loss, anchor‑based boxes, data augmentation).
