### **Mount your Google Drive.**
Make sure your workspace containing processed data is added to the drive. <br>

**Navigate to Runtime and change the settings to T4 GPU**

In [None]:
# Mount Google Drive at /content/drive so the workspace stored on the drive
# is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

---

#### **You can run the below cells to verify whether the folder exists on the drive**

In [None]:
# Change into the model_generation source folder on the mounted drive and
# list its contents to confirm the folder exists.
%cd /content/drive/MyDrive/hw10_workspace/src/model_generation
!ls

---

### **Install dependencies, and import necessary libraries**



In [None]:
import os, math, argparse, torch, random
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms as T
from PIL import Image

from dataset import make_loaders         # from dataset.py
from model import ImageOnlySteerNet      # from model.py

# Per-channel mean and standard deviation handed to T.Normalize by both the
# evaluation and the training transform pipelines.
# NOTE(review): these look like the usual ImageNet RGB statistics - confirm.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

---

### **Data Augmentation And Preprocessing Pipeline**
In this section, you define how the input images are transformed
before being fed into the neural network.
There are two parts:<br>
- `make_eval_tf()` - transformations for validation/test (NO augmentation)
- `make_train_tf()` - transformations for training (WITH augmentation)

The idea is to help the model generalize better to unseen lighting
conditions, camera shifts, or image distortions by applying random
transformations during training.
Run the cell below; you do not have to make any changes here.

In [None]:
# ------------------ Augmentations ------------------
class TopCrop(torch.nn.Module):
    """Remove a fraction of rows from the top of a PIL-style image.

    Args:
        frac: Fraction of the image height to discard from the top. Values
            outside ``[0, 1]`` are clamped into that range.
    """

    def __init__(self, frac: float):
        super().__init__()
        self.frac = max(0.0, min(1.0, float(frac)))

    def forward(self, im: "Image.Image"):
        """Return ``im`` with its top ``frac`` portion cropped away.

        The input object is returned unchanged when ``frac`` is zero.
        At least one pixel row is always kept, so ``frac=1.0`` (or a very
        small image) can no longer produce a zero-height crop.
        """
        if self.frac <= 0:
            return im
        w, h = im.size
        # Clamp the cut row so the crop box always has positive height.
        cut = max(0, min(int(h * self.frac), h - 1))
        return im.crop((0, cut, w, h))

def make_eval_tf(short_side: int, top_crop: float):
    """Build the deterministic preprocessing pipeline for validation/test.

    Steps, in order: crop the top ``top_crop`` fraction, resize so the short
    side equals ``short_side`` (aspect ratio kept), centre-crop to a
    ``short_side`` square, convert to a tensor, and normalise with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``. No random augmentation is applied.
    """
    steps = [
        TopCrop(top_crop),
        T.Resize(short_side),
        T.CenterCrop(short_side),   # square input for the network
        T.ToTensor(),
        T.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ]
    return T.Compose(steps)

def make_train_tf(args):
    """Build the training preprocessing pipeline with random augmentation.

    ``args`` must expose ``top_crop``, ``short_side``, the four colour-jitter
    strengths (``jitter_b``, ``jitter_c``, ``jitter_s``, ``jitter_h``) and the
    affine settings (``affine_deg``, ``affine_trans``, ``affine_scale_min``,
    ``affine_scale_max``).

    The geometric preprocessing matches the evaluation pipeline (top crop,
    resize, centre crop); colour jitter and a random affine warp are inserted
    before the tensor conversion and normalisation.
    """
    color_jitter = T.ColorJitter(
        brightness=args.jitter_b,
        contrast=args.jitter_c,
        saturation=args.jitter_s,
        hue=args.jitter_h,
    )
    random_affine = T.RandomAffine(
        degrees=args.affine_deg,
        translate=(args.affine_trans, args.affine_trans),
        scale=(args.affine_scale_min, args.affine_scale_max),
        interpolation=T.InterpolationMode.BILINEAR,
        fill=0,  # pixels exposed by the warp are filled with 0
    )
    pipeline = [
        TopCrop(args.top_crop),
        T.Resize(args.short_side),
        T.CenterCrop(args.short_side),
        color_jitter,
        random_affine,
        T.ToTensor(),
        T.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ]
    return T.Compose(pipeline)
# ---------------------------------------------------

---

### **Training Pipeline Set-Up**
#### MODEL EVALUATION FUNCTION
This helper function evaluates the trained model on a dataset.
It computes two metrics:
- Mean Absolute Error (MAE)
- Root Mean Squared Error (RMSE)

You do NOT need to modify this function.

#### Helper: get_lr()
Returns the current learning rate from the optimizer.
Used for TensorBoard logging.
You do NOT need to modify this function.

In [None]:
@torch.no_grad()
def evaluate(model, loader, device, mu, sigma):
    """Compute MAE and RMSE of ``model`` over ``loader`` in raw label units.

    Args:
        model: Module called as ``model(x)``. It is switched to eval mode
            here and is left in eval mode on return.
        loader: Iterable of 5-tuples ``(x, _, y_std, y_raw, _)``. Only ``x``
            and ``y_raw`` are read; the standardised label is not needed
            because errors are measured in raw units.
        device: Device the input batch is moved to before the forward pass.
        mu: Offset added when mapping predictions back to raw units.
        sigma: Scale applied when mapping predictions back to raw units.

    Returns:
        ``(mae, rmse)`` as Python floats. Both are ``0.0`` for an empty
        loader.
    """
    model.eval()
    abs_err_sum = 0.0
    sq_err_sum = 0.0
    n = 0
    for x, _, _, y_raw, _ in loader:
        x = x.to(device, non_blocking=True)
        yhat_std = model(x)                 # [B]
        yhat_raw = yhat_std * sigma + mu    # [B], undo label standardisation
        diff = yhat_raw.cpu() - y_raw       # [B], compared on the CPU
        abs_err_sum += diff.abs().sum().item()
        sq_err_sum += (diff ** 2).sum().item()
        n += y_raw.shape[0]
    denom = max(1, n)  # guards the division when the loader is empty
    mae = abs_err_sum / denom
    rmse = math.sqrt(sq_err_sum / denom)
    return mae, rmse

def get_lr(optimizer):
    """Return the ``"lr"`` value of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups or when the
    first group carries no ``"lr"`` key.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group.get("lr")

---

### **EXPERIMENT SETUP**

The main training script below is where we: <br>
- Define hyperparameters
- Prepare dataloaders
- Initialize model, optimizer, and loss function
- Configure TensorBoard logging

For the first iteration, you can keep the default values of the hyperparameters and train the model.


In [None]:
# ------------------ TODO: Define Hyperparameters ------------------
# The defaults sit inside the suggested ranges so that the first run trains
# sensibly; tune them afterwards.
epochs = 1                 # Number of training epochs; try values from 10 to 40
lr = 1e-4                  # Learning rate; try values from 1e-3 down to 1e-5
wd = 1e-4                  # Weight decay; try values from 1e-3 down to 1e-5
dropout = 0.1              # Dropout value handed to the model constructor
# -----------------------------------------------------------------
bs = 64                    # Batch size for training
use_aug = True             # Use the augmented training transform when True
seed = 42                  # Random seed
fp16 = True                # Enables autocast (mixed precision) in the training loop
pretrained = True          # Forwarded to the model constructor
freeze_backbone = False    # Forwarded to the model constructor

logdir = "runs/image_only"   # TensorBoard log directory
ckpt_out = "ckpt_best.pt"    # File the best checkpoint is written to

# Seed Python's and PyTorch's (CPU and CUDA) random number generators.
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Snapshot of the run settings; stored inside the checkpoint later on.
args = argparse.Namespace(
    epochs=epochs,
    lr=lr,
    batch_size=bs,
    wd=wd,
    dropout=dropout,
    fp16=fp16,
)


The dataset setup function is loaded from the dataset.py file. Add the path of the merged_dataset **index_smooth.json** file and the path of the **root directory of merged_dataset** in the code snippet below.

In [None]:
# ============================================================
# Dataset Setup
# ============================================================
# omega_sign is forwarded to make_loaders and stored in the checkpoint.
# NOTE(review): presumably flips the sign convention of the label - confirm in dataset.py.
omega_sign = +1.0

# Keyword arguments for make_loaders (defined in dataset.py).
# Replace the two placeholder paths with your own locations.
loader_kwargs = dict(
    index_json="path/to/your/index_smooth.json",
    root="path/to/your/merged_dataset",
    bs=bs,
    hist_len=0,
    omega_sign=omega_sign,
    short_side=224,
    top_crop_frac=0.2,
)
train_dl, val_dl, test_dl, stats = make_loaders(**loader_kwargs)

# Label standardisation statistics, used to map predictions back to raw units.
mu = stats["mu"]
sigma = stats["sigma"]
print(f"Label standardization: mu={mu:.6f}, sigma={sigma:.6f}")

Let us now apply data augmentation to increase the variability in the training data.

*   top crop: Crops the top part of the frame, as most of the track is in the bottom of the image. Try experimenting with values within the provided range. You have to define the fraction of the frame you want to crop.
*   You can also try out different degrees of affine transformations
*   Among the available data augmentation techniques (rotation, translation, scaling, color jitter, and flipping), which transformations are expected to be most effective for improving model generalization in a vision-based self-driving context, and why?

* In particular, would applying vertical or horizontal flips be appropriate for this dataset, given the robot’s camera viewpoint and driving environment?

* Similarly, is scaling a suitable augmentation, or could it distort geometric relationships that are important for predicting steering angles?

In [None]:
# ============================================================
# Data Augmentations
# ============================================================
eval_tf  = make_eval_tf(224, 0.2)
train_tf = make_train_tf(argparse.Namespace(
    top_crop=...., #  0.1-0.3
    short_side=224,
    jitter_b= ...., # try values between 0.10-0.20
    jitter_c= ...., # try values between 0.10-0.20
    jitter_s= ...., # try values between 0.10-0.20
    jitter_h=0.02, # try values between 0.0 -0.5
    affine_deg=...., # try values between 0 - 10
    affine_trans=0.02, #
    affine_scale_min=0.95,
    affine_scale_max=1.05
)) if use_aug else eval_tf

train_dl.dataset.img_tf = train_tf
if val_dl:
    val_dl.dataset.img_tf = eval_tf
if test_dl:
    test_dl.dataset.img_tf = eval_tf

print(f"Augmentations: {'ON' if use_aug else 'OFF'} | top_crop=20% | short_side=224")

The definition of the model is provided below. Fill in the details on the optimizer to be used, loss function and the scaler.

In [None]:
# ============================================================
# Model / Optimizer / Loss / AMP
# ============================================================
model = ImageOnlySteerNet(
    out_len=1,
    pretrained=pretrained,
    freeze_backbone=freeze_backbone,
    dropout=dropout,
).to(device)

opt     = ##### initialize an optimizer for ur model. (Hint: AdamW is commonly used for Vision models)#####
loss_fn = ##### import a suitable regression loss function from pytorch #####
scaler  = ##### initialize gradscaler function#####

writer = SummaryWriter(logdir)
writer.add_text("hparams", str({
    "lr": lr, "bs": bs, "epochs": epochs,
    "use_aug": use_aug, "dropout": dropout
}))
writer.add_scalar("data/mu", mu, 0)
writer.add_scalar("data/sigma", sigma, 0)

best_mae = float("inf")
global_step = 0


### **TRAINING LOOP**
In this section, you will complete the model training loop.
The code below iterates through multiple epochs and performs:<br>
- Forward pass (model prediction)
- Loss computation
- Backward pass (gradient computation)
- Optimizer step (parameter update)
- Logging metrics to TensorBoard

You are expected to:
- Implement the forward pass and loss calculation (TODOs below)
- Understand how mixed precision (autocast) and GradScaler work
- Track MAE and loss across batches

 ============================================================

### **VALIDATION, CHECKPOINT, AND TEST EVALUATION**
After each epoch of training, we evaluate model performance
on the validation (or training) set, log results to TensorBoard,
and save the best model checkpoint based on lowest MAE.
Finally, once training completes, we evaluate the final model
on the test dataset to estimate its generalization performance.
You must:<br>
• Understand how evaluate() is used to measure MAE/RMSE<br>
• Observe when and why checkpoints are saved<br>
• Record test metrics and analyze how well the model learned

In [None]:
# ============================================================
# Training Loop
# ============================================================
for epoch in range(1, epochs + 1):
    model.train()
    running_loss = 0.0
    running_mae  = 0.0
    n_seen = 0

    # Iterate over all batches
    for x, _, y_std, y_raw, _ in train_dl:
        x = x.to(device, non_blocking=True)
        y_std = y_std.to(device, non_blocking=True)

        opt.zero_grad(set_to_none=True)
        with autocast(enabled=fp16):
                yhat_std = ...... # pass input images to perform forward pass
                loss = ........ # compute loss

        ## Perform backward pass and update steps #####

        # Log current batch loss and learning rate
        writer.add_scalar("train/step_loss", loss.item(), global_step)
        lr_now = get_lr(opt)
        if lr_now is not None:
            writer.add_scalar("train/lr", lr_now, global_step)

        # Compute metrics (MAE in raw units)
        with torch.no_grad():
            yhat_raw = (yhat_std * sigma + mu).cpu()
            mae_batch = (yhat_raw - y_raw).abs().sum().item()
            running_mae  += mae_batch
            running_loss += loss.item() * y_std.shape[0]
            n_seen += y_std.shape[0]

        global_step += 1

    # ============================================================
    # Compute epoch averages
    # ============================================================
    train_loss = running_loss / max(1, n_seen)
    train_mae  = running_mae  / max(1, n_seen)

    writer.add_scalar("train/epoch_loss", train_loss, epoch)
    writer.add_scalar("train/epoch_MAE",  train_mae,  epoch)

    # ---- Validate ----
    if val_dl is not None:
            val_mae, val_rmse = evaluate(model, val_dl, device, mu, sigma)
    else:
            val_mae, val_rmse = evaluate(model, train_dl, device, mu, sigma)

    writer.add_scalar("val/MAE",  val_mae,  epoch)
    writer.add_scalar("val/RMSE", val_rmse, epoch)

    print(f"[{epoch:03d}/{epochs}] "
              f"train_loss={train_loss:.4f}  train_MAE={train_mae:.4f}  "
              f"val_MAE={val_mae:.4f}  val_RMSE={val_rmse:.4f}")

    # ---- Checkpoint best by MAE ----
    if val_mae < best_mae:
            best_mae = val_mae
            torch.save({
                "model_state": model.state_dict(),
                "mu": mu,
                "sigma": sigma,
                "omega_sign": omega_sign,
                "epoch": epoch,
                "val_mae": val_mae,
                "val_rmse": val_rmse,
                "args": vars(args),
            }, ckpt_out)
            print(f"  ↳ Saved best checkpoint to {ckpt_out} (MAE={val_mae:.4f})")
            writer.add_scalar("ckpt/best_MAE", best_mae, epoch)


You have successfully completed the training of the model. You can now vary the values of the following hyperparameters and study their effect on the model performance: <br>
- Epochs
- Learning Rate
- Weight Decay
- Dropout

The values of the other hyper parameters need not be modified.

Select the combination of hyperparameters that you think gives the best performance in terms of **Training Loss** and **Training Time**.

---

In [None]:
   # ---- Test ----
# Evaluate the model as it stands after the last epoch on the held-out test
# split, when one exists, and log the result at step `args.epochs`.
if test_dl is None:
    print("No test split found; skipping test evaluation.")
else:
    test_mae, test_rmse = evaluate(model, test_dl, device, mu, sigma)
    print(f"[TEST] MAE={test_mae:.4f}  RMSE={test_rmse:.4f}")
    for tag, value in (("test/MAE", test_mae), ("test/RMSE", test_rmse)):
        writer.add_scalar(tag, value, args.epochs)

# Flush and close the TensorBoard writer.
writer.close()