## LSTM method for Motion prediction

The objective of the competition is to predict the future trajectories of other vehicles from past information (a bird's-eye view of the scene containing the agents detected by the perception system, past trajectories, lane information, traffic lights, etc.).


Since this is a sequential problem, I decided to try LSTM-based models. The basic idea is as follows:

[LSTM_baseline idea](https://www.kaggle.com/suryajrrafl/lstm-baseline-weights?select=lstm+baseline+idea.jpg)

This is my first attempt at a Kaggle competition, and my first time using PyTorch and LSTM models. Suggestions are most welcome.


**REFERENCES**

Some helper functions in this notebook were taken from the great public kernels available.

[Great reference notebook using Resnet model](https://www.kaggle.com/huanvo/lyft-complete-train-and-prediction-pipeline)

[Pytorch baseline train](https://www.kaggle.com/pestipeti/pytorch-baseline-train)

## Import libraries

In [1]:
# torch imports
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, SubsetRandomSampler
from torchvision.models.resnet import resnet50, resnet18, resnet34, resnet101
import torch.functional as F

# l5kit imports
import l5kit
from l5kit.configs import load_config_data
from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_pred_csv, compute_metrics_csv, read_gt_csv, create_chopped_dataset
from l5kit.evaluation.chop_dataset import MIN_FUTURE_STEPS
from l5kit.evaluation.metrics import neg_multi_log_likelihood, time_displace
from l5kit.geometry import transform_points
from l5kit.visualization import PREDICTED_POINTS_COLOR, TARGET_POINTS_COLOR, draw_trajectory

# common imports
import os
import random
import time
import pandas as pd
from typing import Dict
from tempfile import gettempdir
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from prettytable import PrettyTable
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Display the version string of the imported l5kit package
l5kit.__version__

'1.1.0'

In [3]:
# Display whether PyTorch reports a usable CUDA device
torch.cuda.is_available()

False

## Helper functions

### Loss function

In [4]:
# --- Function utils ---
# Original code from https://github.com/lyft/l5kit/blob/20ab033c01610d711c3d36e1963ecec86e8b85b6/l5kit/l5kit/evaluation/metrics.py
from torch import Tensor


def pytorch_neg_multi_log_likelihood_batch(
    gt: Tensor, pred: Tensor, confidences: Tensor, avails: Tensor
) -> Tensor:
    """
    Compute the negative log-likelihood for the multi-modal scenario, averaged over the batch.

    For every sample and mode the squared displacement between `pred` and `gt` is
    summed over the available timesteps and coordinates, combined with the log of
    the mode confidence, and the modes are reduced with a log-sum-exp so the
    computation neither underflows nor overflows. For background see:
    https://en.wikipedia.org/wiki/LogSumExp#log-sum-exp_trick_for_log-domain_calculations
    https://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick/
    https://leimao.github.io/blog/LogSumExp/

    Args:
        gt (Tensor): array of shape (bs)x(time)x(2D coords)
        pred (Tensor): array of shape (bs)x(modes)x(time)x(2D coords)
        confidences (Tensor): array of shape (bs)x(modes) with a confidence for each mode in each sample;
            each row must sum to 1
        avails (Tensor): array of shape (bs)x(time) with the availability for each gt timestep
    Returns:
        Tensor: scalar tensor, the mean negative log-likelihood over the batch
    Raises:
        AssertionError: if a shape does not match the layout above, the confidences
            of a sample do not sum to 1, or any input contains a non-finite value
    """
    assert len(pred.shape) == 4, f"expected 4D (BSxMxTxC) array for pred, got {pred.shape}"
    batch_size, num_modes, future_len, num_coords = pred.shape

    assert gt.shape == (batch_size, future_len, num_coords), f"expected 3D (BSxTimexCoords) array for gt, got {gt.shape}"
    assert confidences.shape == (batch_size, num_modes), f"expected 2D (BSxModes) array for confidences, got {confidences.shape}"
    assert torch.allclose(torch.sum(confidences, dim=1), confidences.new_ones((batch_size,))), "confidences should sum to 1"
    assert avails.shape == (batch_size, future_len), f"expected 2D (BSxTime) array for avails, got {avails.shape}"
    # assert all data are valid
    assert torch.isfinite(pred).all(), "invalid value found in pred"
    assert torch.isfinite(gt).all(), "invalid value found in gt"
    assert torch.isfinite(confidences).all(), "invalid value found in confidences"
    assert torch.isfinite(avails).all(), "invalid value found in avails"

    # broadcast gt and avails against pred: (batch_size, num_modes, future_len, num_coords)
    gt = torch.unsqueeze(gt, 1)  # add modes
    avails = avails[:, None, :, None]  # add modes and coords

    # error (batch_size, num_modes, future_len): masked squared distance per timestep
    error = torch.sum(((gt - pred) * avails) ** 2, dim=-1)  # reduce coords and use availability

    # error (batch_size, num_modes): per-mode log-likelihood.
    # A confidence of exactly 0 yields log(0) = -inf, which the log-sum-exp below
    # treats as a zero contribution from that mode.
    error = torch.log(confidences) - 0.5 * torch.sum(error, dim=-1)  # reduce time

    # error (batch_size, 1): numerically stable reduction over modes
    error = -torch.logsumexp(error, dim=-1, keepdim=True)
    return torch.mean(error)


def pytorch_neg_multi_log_likelihood_single(
    gt: Tensor, pred: Tensor, avails: Tensor
) -> Tensor:
    """
    Negative log-likelihood for a single-trajectory prediction.

    The prediction is treated as a one-mode multi-modal prediction with
    confidence 1 and delegated to `pytorch_neg_multi_log_likelihood_batch`.

    Args:
        gt (Tensor): array of shape (bs)x(time)x(2D coords)
        pred (Tensor): array of shape (bs)x(time)x(2D coords)
        avails (Tensor): array of shape (bs)x(time) with the availability for each gt timestep
    Returns:
        Tensor: the value returned by the batch loss for the one-mode prediction
    """
    # Unpacking exactly three dims also rejects a `pred` that is not 3-D
    n_samples, _, _ = pred.shape

    # (bs)x(time)x(coords) --> (bs)x(mode=1)x(time)x(coords)
    one_mode_pred = pred.unsqueeze(1)
    # one mode per sample, so its confidence is 1: shape (bs)x(mode=1)
    unit_confidences = pred.new_ones((n_samples, 1))

    return pytorch_neg_multi_log_likelihood_batch(gt, one_mode_pred, unit_confidences, avails)

### Random seed generation function

In [5]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with `seed`.

    The seed is also written to the PYTHONHASHSEED environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Same seeding calls, in the same order, applied one after another
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)


set_seed(42)

### Resnet forward function

In [6]:
def resnet_forward(backbone, x):
    """Run `x` through the backbone's stages up to the flattened pooled features.

    The stages `conv1`, `bn1`, `relu`, `maxpool`, `layer1`..`layer4` and
    `avgpool` are applied in that order with gradient tracking disabled, and
    the result is flattened from dim 1 onwards.
    """
    stage_names = (
        "conv1", "bn1", "relu", "maxpool",
        "layer1", "layer2", "layer3", "layer4",
        "avgpool",
    )
    with torch.no_grad():
        features = x
        for stage_name in stage_names:
            features = getattr(backbone, stage_name)(features)
        features = torch.flatten(features, 1)
    return features

### Number of trainable parameters in model

In [7]:
def find_no_of_trainable_params(model):
    """Return the total number of elements in the parameters of `model` that require gradients."""
    trainable_count = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_count += parameter.numel()
    return trainable_count

### LSTM input creation function

In [8]:
def LSTM_batch_transform(image_data, base_model):
    """
    Convert a batch of raster images into a per-frame feature sequence for the LSTM.

    The channel axis of `image_data` is read as three consecutive groups:
    the first NUMBER_OF_HISTORY_FRAMES channels (agent frames), the channels
    from NUMBER_OF_HISTORY_FRAMES up to the last three (ego frames), and the
    last three channels (lanes / traffic-light data).

    Args:
        image_data: tensor indexed as (batch, channels, height, width). The
            code requires height == width == RASTER_IMG_SIZE and that the ego
            and agent groups have the same number of channels, since they are
            added element-wise.
        base_model: backbone handed to `resnet_forward` (run without gradients).

    Returns:
        Tensor of shape (batch, frames, 2 * feature_dim): for every history
        frame, the backbone features of the combined ego+agent frame
        concatenated with the backbone features of the lanes/traffic-light
        channels (the latter repeated for every frame).
    """
    batch_size = image_data.shape[0]

    # --- lanes / traffic-light encoding: one feature vector per sample,
    # repeated so every history frame carries the same map context
    infra_data = image_data[:, -3:, :, :]
    infra_data = resnet_forward(base_model, infra_data)
    infra_data = torch.repeat_interleave(infra_data.unsqueeze(1), NUMBER_OF_HISTORY_FRAMES, dim=1)

    # --- ego / agent encoding
    agent_data = image_data[:, 0:NUMBER_OF_HISTORY_FRAMES, :, :]
    ego_data = image_data[:, NUMBER_OF_HISTORY_FRAMES:-3, :, :]

    # merge ego and agent masks per frame, then repeat each frame three times
    # so it can be fed to a backbone expecting 3-channel input
    vehicle_data = torch.repeat_interleave(ego_data + agent_data, 3, dim=1)

    # fold the frame axis into the batch axis: (batch * frames, 3, H, W)
    vehicle_data = vehicle_data.view(-1, 3, RASTER_IMG_SIZE, RASTER_IMG_SIZE)

    # encode every frame, then unfold back to (batch, frames, feature_dim).
    # The feature width is taken from the backbone output instead of being
    # fixed at 512, so backbones with a different width work as well.
    history_vehicle_data = resnet_forward(base_model, vehicle_data)
    feature_dim = history_vehicle_data.shape[-1]
    history_vehicle_data = history_vehicle_data.view(batch_size, -1, feature_dim)

    # per-frame vehicle features followed by the shared map features
    LSTM_input = torch.cat((history_vehicle_data, infra_data), dim=-1)

    return LSTM_input

### Model Forward pass function

In [9]:
def forward(data, model, hidden_state, device, criterion = pytorch_neg_multi_log_likelihood_batch):
    """
    Run one batch through the encoder + recurrent model and compute its loss.

    Args:
        data: batch mapping with the keys "image", "target_availabilities"
            and "target_positions", each holding a tensor.
        model: recurrent model called as `model(LSTM_input, hidden_state)` and
            returning `(output, hidden_state)`.
        hidden_state: initial hidden state passed to `model` (None lets the
            recurrent layer start from its default state).
        device: device the batch tensors are moved to.
        criterion: loss called as
            `criterion(targets, prediction, confidences, target_availabilities)`.

    Returns:
        Tuple `(loss, hidden_state, prediction, confidences)` where
        `hidden_state` is detached from the graph, `prediction` has shape
        (batch, NUM_MODES, NUMBER_OF_FUTURE_FRAMES, 2) and `confidences` has
        shape (batch, NUM_MODES) with rows that sum to 1.
    """
    inputs = data["image"].to(device)
    target_availabilities = data["target_availabilities"].to(device)
    targets = data["target_positions"].to(device)
    batch_size = inputs.shape[0]

    # converting image data to sequential data for LSTM model
    # (uses the module-level `encoding_model` as the frozen feature extractor)
    LSTM_input = LSTM_batch_transform(inputs, encoding_model)

    # recurrent model prediction
    prediction, hidden_state = model(LSTM_input, hidden_state)

    # cut the returned state out of the autograd graph so a later backward pass
    # does not reach into this batch; an LSTM returns a (h, c) tuple, a plain
    # RNN a single tensor
    if isinstance(hidden_state, tuple):
        hidden_state = tuple(state.detach() for state in hidden_state)
    else:
        hidden_state = hidden_state.detach()

    # the model output holds all trajectory coordinates followed by one raw
    # confidence score per mode
    num_coords = 2
    trajectory_width = NUM_MODES * NUMBER_OF_FUTURE_FRAMES * num_coords
    prediction, confidences = torch.split(prediction, [trajectory_width, NUM_MODES], dim=1)
    prediction = prediction.view(batch_size, NUM_MODES, NUMBER_OF_FUTURE_FRAMES, num_coords)
    confidences = torch.softmax(confidences, dim=1)

    # loss is computed with the caller-supplied criterion (NLL by default)
    loss = criterion(targets, prediction, confidences, target_availabilities)

    return loss, hidden_state, prediction, confidences

## Base LSTM Structure

In [10]:
class RNN(nn.Module):
    """Stacked recurrent layer (LSTM or vanilla RNN) followed by one linear layer.

    The recurrent outputs of all time steps are flattened and fed to the
    linear layer, so the input sequence length must equal `time_steps`.

    Args:
        input_size: number of features per time step.
        output_size: number of values produced per sample.
        hidden_dim: hidden size of the recurrent layer.
        n_layers: number of stacked recurrent layers.
        time_steps: sequence length the linear layer is sized for.
        use_LSTM: build an `nn.LSTM` when truthy, otherwise an `nn.RNN`.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers, time_steps, use_LSTM = False):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.time_steps = time_steps

        # batch_first means that the first dim of the input and output will be the batch_size
        recurrent_cls = nn.LSTM if use_LSTM else nn.RNN
        self.rnn = recurrent_cls(input_size, hidden_dim, n_layers, batch_first=True)

        # last, fully-connected layer; consumes the outputs of all time steps at once
        self.fc = nn.Linear(time_steps * hidden_dim, output_size)

    def forward(self, x, hidden):
        """Return `(output, hidden)` for a batch of sequences.

        Args:
            x: tensor of shape (batch_size, time_step, input_size).
            hidden: initial state for the recurrent layer, or None.

        Returns:
            output: tensor of shape (batch_size, output_size).
            hidden: state returned by the recurrent layer.
        """
        batch_size = x.size(0)

        # r_out (batch_size, time_step, hidden_dim)
        r_out, hidden = self.rnn(x, hidden)
        # flatten the time axis: (batch_size, time_step * hidden_dim)
        r_out = r_out.reshape(batch_size, -1)

        # get final output
        output = self.fc(r_out)

        return output, hidden

## Configs

In [12]:
# --- Lyft configs ---
# Single configuration dictionary for this notebook; it is also handed as a
# whole to build_rasterizer and AgentDataset.
cfg = {
    'format_version': 4,
    'data_path': "../../dataset",  # exported as L5KIT_DATA_FOLDER before the data manager is created
    'model_params': {
        'model_architecture': 'LSTM',
        'history_num_frames': 10,  # NUMBER_OF_HISTORY_FRAMES is this value + 1
        'history_step_size': 1,
        'history_delta_time': 0.1,
        'future_num_frames': 50,  # read into NUMBER_OF_FUTURE_FRAMES
        'future_step_size': 1,
        'future_delta_time': 0.1,
        'model_name': "LSTM_baseline_r34",  # prefix of the checkpoint and CSV file names
        'weight_path': "/kaggle/input/lstm-baseline-weights/LSTM_baseline_r34_9750.pth",  # state dict loaded into the LSTM model
        'lr': 1e-3,  # learning rate given to the Adam optimizer
        'train': True,  # the training loop only runs when this is truthy
        'predict': False  # not read by the code visible in this notebook
    },

    'raster_params': {
        'raster_size': [224, 224],  # first entry is read into RASTER_IMG_SIZE
        'pixel_size': [0.5, 0.5],
        'ego_center': [0.25, 0.5],
        'map_type': 'py_semantic',
        'satellite_map_key': 'aerial_map/aerial_map.png',
        'semantic_map_key': 'semantic_map/semantic_map.pb',
        'dataset_meta_key': 'meta.json',
        'filter_agents_threshold': 0.5
    },

    # 'key', 'batch_size' and 'num_workers' are used for the training DataLoader;
    # 'shuffle' is not passed to it (an explicit sampler is used instead)
    'train_data_loader': {
        'key': 'scenes/train.zarr',
        'batch_size': 16,
        'shuffle': True,
        'num_workers': 4
    },
    
    'test_data_loader': {
        'key': 'scenes/test.zarr',
        'batch_size': 32,
        'shuffle': False,
        'num_workers': 4
    },

    'sample_data_loader': {
        'key': 'scenes/sample.zarr',
        'batch_size': 16,
        'shuffle': True,
        'num_workers': 4
    },

    'train_params': {
        'train_start_index' : 9751,  # read into TRAIN_START_INDICES (first step number when resuming)
        'max_num_steps': 12002,  # number of training iterations run by the training loop
        'checkpoint_every_n_steps': 500,  # a checkpoint is written when the step number is a multiple of this
    }
}

### Constants used for training

In [13]:
# Number of trajectory modes; fixed here rather than read from cfg
NUM_MODES = 3

# Values pulled out of cfg once so the helper functions can use short names
RASTER_IMG_SIZE = cfg['raster_params']['raster_size'][0]
NUMBER_OF_FUTURE_FRAMES = cfg['model_params']['future_num_frames']
NUMBER_OF_HISTORY_FRAMES = 1 + cfg['model_params']['history_num_frames']

# Step number to resume from; change cfg['train_params']['train_start_index']
# when continuing a previous run
TRAIN_START_INDICES = cfg['train_params']['train_start_index']

## Load the training data

In [14]:
# set env variable for data: the dataset root from cfg is exported as
# L5KIT_DATA_FOLDER before the data manager is created
DIR_INPUT = cfg["data_path"]
os.environ["L5KIT_DATA_FOLDER"] = DIR_INPUT
# No explicit folder is passed here — presumably l5kit then falls back to the
# environment variable set above (confirm against the l5kit documentation)
dm = LocalDataManager(None)
# Rasterizer built from the full cfg; it is later given to the dataset
rasterizer = build_rasterizer(cfg, dm)

In [15]:
# ===== INIT TRAIN DATASET============================================================
# Loader settings for the training split
train_cfg = cfg["train_data_loader"]
# Resolve the zarr key through the data manager and open the chunked dataset
train_zarr = ChunkedDataset(dm.require(train_cfg["key"])).open()
# Per-agent dataset over the opened zarr, rendered with the rasterizer built earlier
train_dataset = AgentDataset(cfg, train_zarr, rasterizer)

starting pool...


16265/16265: : 16265it [11:38, 23.28it/s]


collecting results..
start report for ../../dataset/scenes/train.zarr
{   'reject_th_AV_distance': 40138602,
    'reject_th_agent_filter_probability_threshold': 214185074,
    'reject_th_extent': 13797837,
    'reject_th_yaw': 824397,
    'th_agent_filter_probability_threshold': 0.5,
    'th_distance_av': 50,
    'th_extent_ratio': 1.1,
    'th_yaw_degree': 30,
    'total_agent_frames': 320124624,
    'total_reject': 268945910}


computing past/future table: 100%|██████████| 4/4 [00:21<00:00,  5.45s/it]


+-------------+-----------+----------+----------+---------+
| past/future |     0     |    10    |    30    |    50   |
+-------------+-----------+----------+----------+---------+
|      0      | 320124624 | 23355429 | 13208858 | 8725660 |
|      10     |  23355429 | 17003687 | 10609787 | 7335740 |
|      30     |  13208858 | 10609787 | 7335740  | 5400884 |
|      50     |  8725660  | 7335740  | 5400884  | 4073267 |
+-------------+-----------+----------+----------+---------+
end report for ../../dataset/scenes/train.zarr


In [16]:
# Print the number of samples in the training dataset, then the dataset's own summary
print('Length of Train dataset is ' ,len(train_dataset))
print("==================================TRAIN DATA==================================")
print(train_dataset)

Length of Train dataset is  22496709
+------------+------------+------------+---------------+-----------------+----------------------+----------------------+----------------------+---------------------+
| Num Scenes | Num Frames | Num Agents | Num TR lights | Total Time (hr) | Avg Frames per Scene | Avg Agents per Frame | Avg Scene Time (sec) | Avg Frame frequency |
+------------+------------+------------+---------------+-----------------+----------------------+----------------------+----------------------+---------------------+
|   16265    |  4039527   | 320124624  |    38735988   |      112.19     |        248.36        |        79.25         |        24.83         |        10.00        |
+------------+------------+------------+---------------+-----------------+----------------------+----------------------+----------------------+---------------------+


In [17]:
# Display the number of samples in the training dataset
len(train_dataset)

22496709

In [18]:
# Draw every dataset index exactly once (size == population, replace=False),
# i.e. a random ordering of all sample indices
sampled_indices = np.random.choice(len(train_dataset), size = len(train_dataset), replace = False)
print('Before slicing, start indices are ', sampled_indices[0:10])

Before slicing, start indices are  [ 4544818 11783688  2821987 10532781  1993967 15711361  7726097   547516
 17556178  7275034]


In [19]:
# Display the configured resume step
TRAIN_START_INDICES

9751

In [20]:
# Drop the first TRAIN_START_INDICES entries of the random ordering.
# NOTE(review): TRAIN_START_INDICES is also the first *step* number of the
# training loop, and every step consumes a whole batch of samples — confirm
# that skipping this many samples (rather than steps * batch_size) is the
# intended way to resume a previous run.
sampled_indices = sampled_indices[TRAIN_START_INDICES:]
print('After slicing, start indices are ', sampled_indices[0:10])

After slicing, start indices are  [17691590  1626288 17600386  5600948 14398697 14287128  1693504 13532906
 14185562 22001214]


In [21]:
# Sampler restricted to the remaining indices.
# NOTE(review): SubsetRandomSampler presumably draws from these indices in its
# own random order, so the order of `sampled_indices` would not be the order
# samples are visited in — confirm against the PyTorch documentation.
Datasampler = SubsetRandomSampler(sampled_indices)

In [22]:
# Training loader: sample order comes from Datasampler; batch size and worker
# count come from the train_data_loader section of cfg
train_dataloader = DataLoader(train_dataset, sampler=Datasampler, batch_size=train_cfg["batch_size"], 
                             num_workers=train_cfg["num_workers"])

## CUDA device && encoding model

In [23]:
# ==== INIT MODEL=================
# Use the first CUDA device when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('device {}'.format(device))

device cpu


In [24]:
# Pretrained ResNet-34, used only as a fixed feature extractor
encoding_model = resnet34(pretrained=True)
encoding_model.to(device);

# Turn off gradients for every weight so nothing backpropagates through the encoder
encoding_model.requires_grad_(False)

# Report how many weights are still trainable after freezing
Total_trainable_params = find_no_of_trainable_params(encoding_model)
print(f'There are {Total_trainable_params} trainable parameters in the model')

# Switch to evaluation mode
encoding_model.eval();

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /Users/jamestaylor/.cache/torch/hub/checkpoints/resnet34-333f7ec4.pth


HBox(children=(FloatProgress(value=0.0, max=87306240.0), HTML(value='')))


There are 0 trainable parameters in the model


## LSTM model, Optimiser, criterion

In [25]:
# decide on hyperparameters
# input_size: LSTM_batch_transform concatenates two 512-wide feature vectors per frame (512 + 512)
input_size   = 1024 
# output_size: forward() splits the output into 300 trajectory values, viewed
# as (3 modes, 50 steps, 2 coords), plus 3 confidence scores (300 + 3)
output_size  = 303
hidden_dim   = 64
n_layers     = 2

In [26]:
# instantiate the LSTM variant of the RNN wrapper.
# Its time_steps argument must equal the sequence length produced by
# LSTM_batch_transform, so the shared NUMBER_OF_HISTORY_FRAMES constant is
# used instead of repeating the literal.
model = RNN(input_size, output_size, hidden_dim, n_layers, NUMBER_OF_HISTORY_FRAMES, use_LSTM=True)
model.to(device)

# report the number of trainable weights of the freshly built model
total_params = find_no_of_trainable_params(model)
print(f'There are {total_params} parameters in the LSTM model')

There are 525935 parameters in the LSTM model


In [27]:
## loading the pretrained weights
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU run can also be loaded on a CPU-only machine
model.load_state_dict(torch.load(cfg['model_params']['weight_path'], map_location=device))

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/lstm-baseline-weights/LSTM_baseline_r34_9750.pth'

In [29]:
## Adam optimiser function
# Optimizes the parameters of `model` only, with the learning rate from cfg
optimizer = optim.Adam(model.parameters(), lr=cfg["model_params"]["lr"])

## Training

In [30]:
# ==== TRAINING LOOP =========================================================
# Runs max_num_steps optimisation steps numbered from TRAIN_START_INDICES,
# writes a checkpoint every checkpoint_every_n_steps steps and stores the
# collected metrics as CSV files (also when the loop is interrupted).
if cfg["model_params"]["train"]:

    tr_it = iter(train_dataloader)
    progress_bar = tqdm(range(TRAIN_START_INDICES,
                              TRAIN_START_INDICES + cfg["train_params"]["max_num_steps"]))
    num_iter = cfg["train_params"]["max_num_steps"]
    checkpoint_every = cfg['train_params']['checkpoint_every_n_steps']
    losses_train = []
    iterations = []
    metrics = []
    times = []
    model_name = cfg["model_params"]["model_name"]
    start = time.time()
    # NOTE(review): the state returned for one batch is fed in as the initial
    # state of the next batch, although batches come from a random sampler,
    # and a final partial batch would change the batch dimension of that
    # state — confirm this carry-over is intended.
    hidden_state = None
    running_loss_sum = 0.0

    # Loop-invariant setup: training mode and gradient tracking
    model.train()
    torch.set_grad_enabled(True)

    try:
        for i in progress_bar:
            # restart the loader when it is exhausted
            try:
                data = next(tr_it)
            except StopIteration:
                tr_it = iter(train_dataloader)
                data = next(tr_it)

            # Forward pass
            loss, hidden_state, _, _ = forward(data, model, hidden_state, device)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            losses_train.append(loss_value)
            # running mean instead of re-averaging the whole history every step
            running_loss_sum += loss_value
            loss_avg = running_loss_sum / len(losses_train)

            progress_bar.set_description(f"loss: {loss_value} loss(avg): {loss_avg}")
            if i % checkpoint_every == 0:
                # `i` already starts at TRAIN_START_INDICES, so it is the global
                # step number; adding the offset again would mislabel the file
                torch.save(model.state_dict(), f'{model_name}_{i}.pth')
                iterations.append(i)
                metrics.append(loss_avg)
                times.append((time.time()-start)/60)
    finally:
        # Persist whatever was collected, even if training stopped early
        # (e.g. KeyboardInterrupt); the exception itself still propagates
        results = pd.DataFrame({'iterations': iterations, 'metrics (avg)': metrics, 'elapsed_time (mins)': times})
        results.to_csv(f"train_metrics_{model_name}_{num_iter}.csv", index = False)
        train_losses_csv = pd.DataFrame({'iteration': TRAIN_START_INDICES + np.arange(len(losses_train)),
                                         'losses_train': losses_train})
        train_losses_csv.to_csv(f"train_losses_{model_name}_{num_iter}.csv", index = False)
        print(f"Total training time is {(time.time()-start)/60} mins")
        print(results.head())

KeyboardInterrupt: 

In [None]:
# Scratch cell: create a fresh iterator over the training loader and pull one batch
tr_it = iter(train_dataloader)

next(tr_it)