In [1]:
from pycocotools.coco import COCO
import matplotlib
import matplotlib.pyplot as plt
import os
import open3d as o3d
import cv2
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as functions
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import re

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
DATADIR = "cocodoom/"

dataSplit, run = "run-full-test", "run3"

annFile = '{}{}.json'.format(DATADIR,dataSplit)

In [3]:
coco = COCO(annFile)

loading annotations into memory...
Done (t=13.70s)
creating index...
index created!


<h1>Heatmap Code</h1>
This code is used to generate the heatmap - the only function you need to use is generateHeatmap, which takes in the most recent depth map and a motion vector - and returns the heatmap and also a point cloud of only the points visible from the new location (which is very helpful for verification).

Providing the depth map and motion vector to this function can be done quite easily from the example at the bottom of the notebook.

In [4]:
def getSegmentationMask(rgb_filename):
    return rgb_filename.replace("rgb", "objects")

def getDepthMask(rgb_filename):
    return rgb_filename.replace("rgb", "depth")

In [5]:
def toPointCloud(depth):
    # https://github.com/mmatl/pyrender/issues/14#issuecomment-485881479 was used as reference
    height, width = depth.shape
    #print(depth.shape)
    fy = 200 / np.tan(1.5708 * 0.5)
    fx = 320 / np.tan(1.5708 * 0.5)

    depth = depth / 64.0
    mask = np.where(depth > 0)

    #print(depth.max(axis=1))

    x = mask[1]
    y = mask[0]

    normalized_x = (x.astype(np.float32) - width * 0.5) #/ width
    normalized_y = (y.astype(np.float32) - height * 0.5) #/ height
    
    world_x = normalized_x * depth[y, x] / fx #* 1000
    world_y = normalized_y * depth[y, x] / fy #* 1000
    world_z = depth[y, x]
    ones = np.ones(world_z.shape[0], dtype=np.float32)

    return np.vstack((world_x, world_y, world_z)).T

In [6]:
def visiblePoints(point_cloud, motion_vector):
    """
    Takes in the projected point cloud and motion vector and 
    generates a heatmap on the original image. Also produces
    a point cloud of only the points visible after movement.
    """
    angle = motion_vector[3]
    rotation_matrix = np.array([
        [np.cos(angle), 0, -1 * np.sin(angle)],[0, 1, 0],[np.sin(angle), 0, np.cos(angle)]
        ])
    #print(point_cloud.shape)
    translated_points = point_cloud - np.array([motion_vector[1], motion_vector[2], motion_vector[0]])
    #print(translated_points.shape)
    #print(rotation_matrix.shape)
    transformed_points = np.zeros(translated_points.shape)
    # This has been split up as the kernel kept dying
    for i, point in enumerate(translated_points):
        transformed_points[i] = rotation_matrix @ point

    fy = 200 / np.tan(1.5708 * 0.5)
    fx = 320 / np.tan(1.5708 * 0.5)
    x_proj = (transformed_points[:, 0] * fx / transformed_points[:, 2]) + 320 * 0.5
    y_proj = (transformed_points[:, 1] * fy / transformed_points[:, 2]) + 200 * 0.5

    #print("Made it here")

    output = np.zeros((200, 320)) # These might need to be switched
    visible_point = []
    for i, (x, y, z) in enumerate(transformed_points):
        if z > 0 and 0 <= x_proj[i] < 320 and 0 <= y_proj[i] < 200:
            output[i // 320, i % 320] = 1
            visible_point.append(transformed_points[i])

    return output, np.array(visible_point)

In [7]:
def generateHeatmap(depth, motion_vector):
    """
    Takes in the path to the depth map and the predicted or 
    actual motion vector to generate a heatmap.

    The path to the depth map should be structured as:
    runX/mapXX/depth/XXXXXX.png
    """

    point_cloud = toPointCloud(cv2.imread(DATADIR + getDepthMask(depth), cv2.IMREAD_UNCHANGED))
    heatmap, visible_point_cloud = visiblePoints(point_cloud, motion_vector)

    return heatmap, visible_point_cloud

In [8]:
#heatmap, visible_point_cloud = generateHeatmap(img["file_name"], combined_motion_vectors)
#plt.imsave('binary_image.png', heatmap, cmap='gray', format='png')

NameError: name 'img' is not defined

<h1>Dataset</h1>
This code converts the various CocoDoom inputs into better formats to use - example usage of this is provided below.

In [9]:
player_positions = {"run1":[], "run2":[], "run3":[]}
motion_vectors = {"run1":[], "run2":[], "run3":[]}
USED_RUNS = ["run3"]
for run in USED_RUNS:
    with open(DATADIR+run+"/log.txt", 'r') as log_file:
        for line in log_file:
            if "player" in line:
                line = line.strip()
                tic, stats = line.split("player:")
                x, y, z, angle = stats.split(",")
    
                # Store position in the dictionary
                player_positions[run].append([float(x), float(y), float(z), float(angle)])
                if len(player_positions[run]) >= 2:
                    player_position = player_positions[run][-1]
                    prev_player_position = player_positions[run][-2]
                    
                    dx = player_position[0] - prev_player_position[0]
                    dy = player_position[1] - prev_player_position[1]
                    dz = player_position[2] - prev_player_position[2]
                    dangle = np.pi - abs(abs(player_position[3] - prev_player_position[3]) - np.pi)
                    
                    dx_relative = dx * np.cos(2 * np.pi - prev_player_position[3]) + dy * np.cos(prev_player_position[3] - 1/2 * np.pi)
                    dy_relative = dx * np.sin(2 * np.pi - prev_player_position[3]) + dy * np.sin(prev_player_position[3] - 1/2 * np.pi)
                    motion_vector = [dx_relative, dy_relative, dz, dangle]
                    motion_vectors[run].append(motion_vector)

In [10]:
class DoomMotionDataset(Dataset):
    def __init__(self, coco, run, input_window, prediction_window, transform=None):
        self.coco = coco
        self.run = run
        self.img_ids = self.coco.getImgIds()
        self.transform = transform
        self.input_window = input_window
        self.prediction_window = prediction_window

    def __len__(self):
        return len(self.img_ids)

    def fullSegmentationFormat(self, rgb_filename):
        seg_image = self.load_image(self.getSegmentationMask(DATADIR + rgb_filename))
        if seg_image == None:
            return seg_image
        seg_class_map = self.color_to_index(seg_image)
        seg_class_one_hot = functions.one_hot(seg_class_map, num_classes=4).to(dtype=torch.float).permute(2, 0, 1)
        return seg_class_one_hot

    def fullDepthFormat(self, rgb_filename):
        depth_mask = self.load_image(self.getDepthMask(DATADIR + rgb_filename))
        if depth_mask == None:
            return depth_mask
        depth_mask = torch.tensor(depth_mask, dtype=torch.float32)
        return depth_mask

    def getSegmentationMask(self, rgb_filename):
        return rgb_filename.replace("rgb", "objects")

    def getDepthMask(self, rgb_filename):
        return rgb_filename.replace("rgb", "depth")

    def color_to_index(self, segmentation_image):
        # Map colors to class indices
        r, g, b = segmentation_image
        pixel_values = r + (g *  2**8) + (b * 2**16)  # From cocodoom documentation, converts to an object id

        class_map = torch.full_like(pixel_values, 3, dtype=torch.long)

        sky = (1 << 23) + 0
        horizontal = (1 << 23) + 1
        vertical = (1 << 23) + 2
        
        class_map[x == sky] = 0
        class_map[x == horizontal] = 1
        class_map[x == vertical] = 2
        return class_map

    def load_image(self, path):
        if os.path.exists(path):
            img = Image.open(path)
            return transforms.ToTensor()(img)
        return None

    def __getitem__(self, idx):
        # Load the RGB image
        rgb_filename = self.coco.loadImgs(self.img_ids[idx])[0]['file_name']
        #print(rgb_filename)
        tic = int(rgb_filename.replace(".png", "").split("/")[-1])
        next_tic = tic+1
        previous_tic = tic-1
        prev_motion_vectors = []
        next_motion_vectors = []
        prev_seg = []
        prev_dep = []

        for t in range(input_window, 0, -1):
            if tic-t < 0:
                prev_motion_vectors.append(motion_vectors[self.run][0])
                prev_filename = self.coco.loadImgs(self.img_ids[0])[0]['file_name']
                seg = self.fullSegmentationFormat(prev_filename)
                dep = self.fullDepthFormat(prev_filename)
                prev_seg.append(seg)
                prev_dep.append(dep)
                continue
            elif tic-t >= len(motion_vectors[self.run]):
                prev_motion_vectors.append(motion_vectors[self.run][-1])
                prev_filename = self.coco.loadImgs(self.img_ids[-1])[0]['file_name']
                seg = self.fullSegmentationFormat(prev_filename)
                dep = self.fullDepthFormat(prev_filename)
                prev_seg.append(seg)
                prev_dep.append(dep)
                continue
            prev_motion_vectors.append(motion_vectors[self.run][tic-t])
            prev_filename = rgb_filename[:-10] + str(max(tic - t, 2)).rjust(6, "0") + ".png"
            # run1/map01/rgb/000002.png
            if os.path.exists(DATADIR + prev_filename):
                seg = self.fullSegmentationFormat(prev_filename)
                #print(f"seg shape: {seg.shape}")
                dep = self.fullDepthFormat(prev_filename)
                #print(f"dep shape: {dep.shape}")
                prev_seg.append(seg)
                prev_dep.append(dep)
            else:
                prev_seg.append(torch.zeros((4, 200, 320)))
                prev_dep.append(torch.zeros((1, 200, 320)))
                

        for t in range(1, prediction_window+1):
            if tic+t >= len(motion_vectors[self.run]):
                next_motion_vectors.append(motion_vectors[self.run][-1])
                continue
            next_motion_vectors.append(motion_vectors[self.run][tic+t])

            
        prev_motion_vectors = torch.tensor(prev_motion_vectors, dtype=torch.float32)
        next_motion_vectors = torch.tensor(next_motion_vectors, dtype=torch.float32)
        #print(len(prev_seg))
        prev_seg = torch.stack(prev_seg)
        prev_dep = torch.stack(prev_dep)
        
        return {"prev_motion" : prev_motion_vectors, "next_motion" : next_motion_vectors, "previous_seg" : prev_seg, "previous_dep" : prev_dep}


<h1>Model</h1>
This is the code that defines the main model to be used. Example usage of this to predict is provided below.

In [11]:
class NeuralNetwork(nn.Module):
  def __init__(self, batch_size, input_length, sequence_length, activation_function=functions.relu, device=torch.device("cpu")):
    super(NeuralNetwork, self).__init__()
    self.batch_size = batch_size
    self.input_length = input_length
    self.sequence_length = sequence_length

    # Encoder
    # Conv layers
    self.conv_seg = nn.Conv2d(4, 1, kernel_size=3, stride=2, padding=1, bias=False).to(device)
    self.conv_dep = nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1, bias=False).to(device)

    self.motion_fc = nn.Linear(4, 32).to(device)
      
    # Pre-fusion LSTMs
    self.vis_LSTM = nn.LSTM(input_size=32000, hidden_size=256, batch_first=True).to(device)
    self.inertia_LSTM = nn.LSTM(input_size=32, hidden_size=256, batch_first=True).to(device)

    # Fusion LSTM
    self.fusion_LSTM = nn.LSTM(input_size=512, hidden_size=256, batch_first=True).to(device)

    # Decoder
    self.de_motion_fc = nn.Linear(4, 32).to(device)
    self.de_vis_LSTM = nn.LSTM(input_size=32, hidden_size=256, batch_first=True).to(device) #Unsure what the input size of this should be as it actually receives nothing
    self.de_inertia_LSTM = nn.LSTM(input_size=32, hidden_size=256, batch_first=True).to(device)
    self.de_fusion_LSTM = nn.LSTM(input_size=512, hidden_size=256, batch_first=True).to(device)
    self.output_fc = nn.Linear(256, 4).to(device)

  def forward(self, segmentation, depth, prev_motion):
    hidden_vis = None
    hidden_inert = None
    hidden_fus = None
    
    for t in range(self.input_length):
        #print(segmentation.shape)
        seg = self.conv_seg(segmentation[:,t])
        #print(seg.shape)
        dep = self.conv_dep(depth[:,t])
        #print(dep.shape)
        mot = self.motion_fc(prev_motion[:,t])
        vis = torch.cat((seg, dep), dim=1)
        vis = torch.flatten(vis, start_dim=1)
        #print(vis.shape)
        if hidden_vis != None:
            output_vis, hidden_vis = self.vis_LSTM(vis, hidden_vis)
        else:
            output_vis, hidden_vis = self.vis_LSTM(vis)
        if hidden_inert != None:
            output_inert, hidden_inert = self.inertia_LSTM(mot, hidden_inert)
        else:
            output_inert, hidden_inert = self.inertia_LSTM(mot)
        combined = torch.cat((output_vis, output_inert), dim=1)
        if hidden_fus != None:
            _, hidden_fus = self.fusion_LSTM(combined, hidden_fus)
        else:
            _, hidden_fus = self.fusion_LSTM(combined)

    #print("Prev motion: " + str(prev_motion.shape))
    de_mot = prev_motion[:,-1]
    output_tensor = torch.zeros(self.sequence_length, segmentation.size(0), 4).to(segmentation.device)
    for t in range(self.sequence_length):
        #print(de_mot.shape)
        de_mot = self.de_motion_fc(de_mot)
        de_output_inert, hidden_inert = self.de_inertia_LSTM(de_mot, hidden_inert)
        de_output_vis, hidden_vis = self.de_vis_LSTM(torch.zeros(segmentation.size(0), 32).to(segmentation.device), hidden_vis)
        #print(de_output_vis.shape, de_output_inert.shape)
        combined = torch.cat((de_output_vis, de_output_inert), dim=1)
        de_output_fus, hidden_fus = self.de_fusion_LSTM(combined, hidden_fus)
        #print("de_output_fus: " + str(de_output_fus.shape))
        output_t = self.output_fc(de_output_fus)
        #print("output_t: " + str(output_t.shape))
        #output_t = output_t.unsqueeze(0)
        de_mot = output_t
        output_tensor[t] = output_t.unsqueeze(0)
        
    return output_tensor

<h1>Example loading from the dataset and making predictions</h1>

This is an example way to load from the dataset and make predictions from the model.

Key parameters here are the input window (how many frames before prediction to give as context to the model), and the prediction window (how many frames into the future to predict).

The output of the model is a prediction window amount of motion vectors. The first motion vector in the prediction window represents the motion vector between the last seen frame and the first prediction, the second motion vector in the prediction window represents the motion vector between the last prediction, and the next prediction. This is important to distinguish from each motion vector output representing the difference between the last observed frame and position. Of course calculating this value just consists of adding up the predicted motion vectors up to and including the one we are interested in.

Using this summed value of the previous predicted motion vectors needs to be done when generating a heatmap - as otherwise it will not properly represent the predicted movement between the last observed frame.

The next_motion variable provides the ground-truth motion vectors, these are structured in the same way as the output of the model.

In [27]:
batch_size = 1
input_window = 5
prediction_window = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("cuda" if torch.cuda.is_available() else "cpu")

model = NeuralNetwork(batch_size, input_window, prediction_window, device=device).to(device)
model.load_state_dict(torch.load("multimodal_seq2seq.pth", weights_only=True))

test_dataset = DoomMotionDataset(coco, run, input_window, prediction_window)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()

progress_bar = tqdm(test_loader, desc="Testing", unit="batch")

with torch.no_grad():  # Disable gradient calculations for evaluation
    for batch_idx, batch in enumerate(progress_bar):
        prev_motion, next_motion, previous_seg, previous_dep = batch["prev_motion"], batch["next_motion"], batch["previous_seg"], batch["previous_dep"]
        prev_motion, next_motion, previous_seg, previous_dep = prev_motion.to(device), next_motion.to(device), previous_seg.to(device), previous_dep.to(device)

        if prev_motion.size(0) != next_motion.size(0) != previous_seg.size(0) != previous_dep.size(0):
                continue
            
        outputs = model(previous_seg, previous_dep, prev_motion)
        outputs = outputs.permute(1, 0, 2)

        depth_map = previous_dep[0][-1]

        # Generate heatmaps here
        
        progress_bar.set_postfix({
            "batch_index": batch_idx + 1,
            "batch_size": prev_motion.size(0)
        })

cuda


  depth_mask = torch.tensor(depth_mask, dtype=torch.float32)
Testing:   0%| | 158/118138 [00:09<2:01:33, 16.18batch/s, batch_index=158, batch


KeyboardInterrupt: 