# Synthetic multimodal data
This notebook generates synthetic indoor point clouds and S3DIS-like images to experiment with multimodal mapping and visualize results. This can be helpful if you want to visualize how the mapping works, the impact of image transforms and how 3D and 2D subsampling affect the mappings.

In [None]:
# Select your GPU (index passed to torch.cuda.set_device in the next cell)
I_GPU = 0

In [None]:
# Uncomment to use autoreload
# %load_ext autoreload
# %autoreload 2

import os
import sys
import numpy as np
import torch
import glob

# Make I_GPU the current CUDA device
torch.cuda.set_device(I_GPU)
# DIR is the parent of the current working directory and ROOT is DIR's own
# parent. Both are prepended to sys.path (DIR ends up first) before importing
# torch_points3d below -- presumably the notebook lives in a subfolder of the
# repository; TODO confirm
DIR = os.path.dirname(os.getcwd())
ROOT = os.path.join(DIR, "..")
sys.path.insert(0, ROOT)
sys.path.insert(0, DIR)

from torch_points3d.visualization.multimodal_data import visualize_mm_data

# Generate Data samples

Generates room-like boxes with different colors for walls, ceilings and floor, to facilitate subsequent visualizations. Note that you can tune this to 3D spaces of your taste and change the voxel resolution.

In [None]:
from torch_points3d.core.multimodal import *
from torch_points3d.core.data_transform.transforms import RandomNoise
from torch_geometric.data import Data

n_data = 1        # number of "rooms" to generate
n_points = 10**5  # number of points sampled on each of the 6 faces of a "room"
height = 2.5      # floor-to-ceiling height
width = 4         # side length of the square "room" footprint

# RGB color (in [0, 1]) of each face; the row index is also the face label
face_colors = torch.Tensor([
    [1, 0, 0],   # floor   -> red
    [0, 1, 0],   # ceiling -> green
    [0, 0, 1],   # wall_1  -> blue
    [1, 1, 0],   # wall_2  -> yellow
    [0, 1, 1],   # wall_3  -> cyan
    [1, 0, 1]])  # wall_4  -> magenta


def rand_col(extent):
    """Column of `n_points` coordinates drawn uniformly in [0, extent)."""
    return torch.rand(n_points, 1) * extent


def const_col(value):
    """Column of `n_points` coordinates all equal to `value`."""
    return torch.ones(n_points, 1) * value


data_list = []
for room_idx in range(n_data):

    # Each room is shifted along the XY diagonal by twice its width
    shift = torch.Tensor([[width * 2 * room_idx, width * 2 * room_idx, 0]])

    # XYZ: one (x, y, z) column triplet per face, listed in label order
    faces = [
        [rand_col(width), rand_col(width), const_col(0)],       # floor
        [rand_col(width), rand_col(width), const_col(height)],  # ceiling
        [rand_col(width), const_col(0), rand_col(height)],      # wall_1
        [rand_col(width), const_col(width), rand_col(height)],  # wall_2
        [const_col(0), rand_col(width), rand_col(height)],      # wall_3
        [const_col(width), rand_col(width), rand_col(height)]]  # wall_4
    xyz = torch.cat([torch.cat(cols, dim=1) for cols in faces], dim=0) + shift

    # RGB: every point of a face carries the face color
    rgb = face_colors.repeat_interleave(n_points, dim=0)

    # Y: face index in [0, 5], in the same order as the faces above
    y = torch.arange(6, dtype=torch.long).repeat_interleave(n_points)

    # Wrap everything in a Data object and pass it through RandomNoise
    data_list.append(
        RandomNoise(sigma=0.001, clip=0.05)(Data(pos=xyz, rgb=rgb, y=y)))

# Preprocess 3D Data

Basic preprocessing steps required before computing the mappings and the mapping features.

In [None]:
from torch_points3d.core.data_transform import *

voxel = 0.05  # voxel grid resolution

# 3D preprocessing pipeline; each transform is applied, in order, to the
# whole list of clouds
preprocessing_3d = [
    GridSampling3D(size=voxel, setattr_full_pos=True, quantize_coords=True, mode='last'),
    SaveOriginalPosId(key='mapping_index'),
    PCAComputePointwise(num_neighbors=50, r=2*voxel, use_full_pos=True),
    EigenFeatures(norm=True, linearity=True, planarity=True, scattering=True, temperature=None),
    RemoveAttributes(attr_names=['full_pos', 'eigenvalues', 'eigenvectors']),
    AddFeatsByKeys(list_add_to_x=[True] * 4, feat_names=['norm', 'linearity', 'planarity', 'scattering'], delete_feats=[True] * 4),
]
for transform in preprocessing_3d:
    data_list = transform(data_list)
# data_list = AddFeatsByKeys(list_add_to_x=[False, False, True, False], feat_names=['norm', 'linearity', 'planarity', 'scattering'], delete_feats=[False] * 4)(data_list)

# Generate synthetic ImageData

Generate random camera poses.

In [None]:
ref_size = (512, 256)  # pixel resolution of the images
n_images_in = 4        # number of images inside of each "room"
n_images_out = 4       # number of images outside of each "room"
n_images = n_images_in + n_images_out

image_data_list = []
for i in range(n_data):

    # Recover the offset of the cloud, taken as its minimum XYZ corner
    offset = data_list[i].pos.min(dim=0).values

    # Images inside of the rooms: X and Y are drawn in the central 4/5 of the
    # width (a width / 10 margin on each side) and Z in the central 2/3 of the
    # height (a height / 6 margin), so no camera sits on a wall, the floor or
    # the ceiling
    xyz_in = torch.cat([
        torch.rand(n_images_in, 1) * width * 4 / 5 + width / 10,
        torch.rand(n_images_in, 1) * width * 4 / 5 + width / 10,
        torch.rand(n_images_in, 1) * height * 2 / 3 + height / 6], dim=1)
    xyz_in += offset

    # Images outside of the rooms: drawn at a random angle on a ring of radius
    # in [1.5 * width, 1.5 * width + 1) around the cloud's minimum corner, at
    # a height in [0, 1.5 * height)
    radius = 1.5 * width + torch.rand(n_images_out)
    theta = torch.rand(n_images_out) * 2 * np.pi
    z = torch.rand(n_images_out, 1) * height * 3 / 2
    xyz_out = offset.view(1, -1) + torch.cat([
        (torch.cos(theta) * radius).view(-1, 1),
        (torch.sin(theta) * radius).view(-1, 1),
        z], dim=1)

    # Convert to SameSettingImageData. The images are synthetic, so the paths
    # are empty strings and all orientations (opk) are zero
    path = np.array([''] * n_images)
    pos = torch.cat([xyz_in, xyz_out], dim=0)
    opk = torch.zeros(n_images, 3)
    image_data_list.append(SameSettingImageData(
        path=path, pos=pos, opk=opk, ref_size=ref_size, proj_upscale=2, 
        downscale=1, voxels=voxel, r_max=30, r_min=0.3, growth_k=0.2,
        growth_r=100))

# Project 3D Data onto ImageData to generate mappings

This is where the mapping and associated features are actually computed.

In [None]:
from torch_points3d.core.data_transform.multimodal.image import *

# Mapping settings
r_max = 10                        # maximum point-camera distance for the mappings
r_min = 0.2                       # minimum point-camera distance for the mappings
k_list = [50]                     # number of neighbors used for neighborhood-based mapping features (eg density, occlusion)
exact = True                      # False: points are mapped to their whole z-buffering patch (denser mapping). True: only to the center (more accurate mapping)
use_cuda = False                  # whether to use cuda to accelerate mapping computation
camera = 's3dis_equirectangular'  # camera model used (keep s3dis_equirectangular for this notebook)

# Step 1: compute the point-image mappings
map_images = MapImages(r_min=r_min, r_max=r_max, exact=exact, use_cuda=use_cuda, camera=camera)
data_list, image_data_list = map_images(data_list, image_data_list)
# image_data_list[0].mappings.features = None  # uncomment to visualize only densities and occlusions

# Step 2: compute the neighborhood-based mapping features (density, occlusion)
neighborhood_features = NeighborhoodBasedMappingFeatures(
    k=k_list, voxel=voxel, density=True, occlusion=True, use_faiss=False,
    use_cuda=use_cuda)
data_list, image_data_list = neighborhood_features(data_list, image_data_list)

# Populate the image features with the 3D points' RGB

Create synthetic views of the point cloud from the generated camera poses, by z-buffering the voxel splats. In other words, the mapping is used to propagate the points' RGB colors to pixels and thus create a rendering of the 3D scene for each pose.

In [None]:
for cloud, views in zip(data_list, image_data_list):

    # Start from black uint8 images, one (3, H, W) canvas per camera pose
    img_w, img_h = views.img_size[0], views.img_size[1]
    views.x = torch.zeros(n_images, 3, img_h, img_w).byte()

    # 8-bit RGB color of each point; identical for every view of this cloud
    point_rgb = (cloud.rgb * 255).type(torch.uint8)

    for view_idx, view in enumerate(views):
        mappings = view.mappings

        # Pixel indexing of all the mappings of this view. The first index is
        # overwritten with the position of the view in `views`, so the
        # assignment below writes into the right image of `views.x`
        pixel_idx = mappings.feature_map_indexing
        pixel_idx = (torch.full_like(pixel_idx[0], view_idx), *pixel_idx[1:])

        # Repeat each point color once per mapped pixel, in two steps: first
        # by the consecutive differences of the mappings' pointers, then by
        # those of `mappings.values[1]` (presumably the point -> view and
        # view -> pixel levels of a nested CSR structure; TODO confirm)
        outer_counts = mappings.pointers[1:] - mappings.pointers[:-1]
        inner_pointers = mappings.values[1].pointers
        inner_counts = inner_pointers[1:] - inner_pointers[:-1]
        pixel_rgb = torch.repeat_interleave(point_rgb, outer_counts, dim=0)
        pixel_rgb = torch.repeat_interleave(pixel_rgb, inner_counts, dim=0)

        # Paint the mapped pixels with their point's color
        views.x[pixel_idx] = pixel_rgb

# Preprocess the ImageData

Apply some transforms to the images.

In [None]:
# Image preprocessing pipeline; each transform is applied, in order, to the
# (3D data, image data) pair
image_transforms = [
    RandomHorizontalFlip(p=0.5),
    ToFloatImage(),
    CenterRoll(angular_res=16),
    CropImageGroups(padding=2, min_size=32),
    PickImagesFromMemoryCredit(img_size=ref_size, n_img=4, k_coverage=0),
]
for transform in image_transforms:
    data_list, image_data_list = transform(data_list, image_data_list)

# Visualize the MMData

Check out the results in an interactive visualization.

In [None]:
from torch_points3d.visualization.multimodal_data import visualize_mm_data

# Single source of truth for the label visualization: (class name, RGB color)
# pairs, listed in label order. The last entry is the extra 'unlabelled' class.
_LABEL_PALETTE = (
    ('floor', (95, 156, 196)),       # blue
    ('ceiling', (233, 229, 107)),    # yellow
    ('wall_1', (179, 116, 81)),      # brown
    ('wall_2', (108, 135, 75)),      # dark green
    ('wall_3', (41, 49, 101)),       # dark blue
    ('wall_4', (223, 52, 52)),       # red
    ('unlabelled', (0, 0, 0)),       # black
)

# Per-class RGB colors, as a list of [R, G, B] lists
OBJECT_COLOR = [list(rgb) for _, rgb in _LABEL_PALETTE]

# Per-class names, aligned with OBJECT_COLOR
CLASSES = [name for name, _ in _LABEL_PALETTE]

In [None]:
# Pair each cloud with its images, then gather all the pairs in one batch
mm_data_list = [
    MMData(data, image=images)
    for data, images in zip(data_list, image_data_list)]
mm_data = MMBatch.from_mm_data_list(mm_data_list)

# Interactive 3D + 2D visualization
viz_options = dict(
    class_names=CLASSES, class_colors=OBJECT_COLOR, figsize=1000, voxel=0.05,
    show_3d=True, show_2d=True, color_mode='light', alpha=0.3, pointsize=5)
visualize_mm_data(mm_data, **viz_options)

# Image features Multi-View Pooling visualization

We propose to visualize here the impact of a simple mean-pool on the image features of each point's multiple views. As opposed to our DeepViewAgg, this approach does not learn to use the observation conditions (mapping features) to address the multi-view problem.

In [None]:
from torch_points3d.modules.multimodal.modules import UnimodalBranch
from torch_points3d.modules.multimodal.fusion import BimodalFusion
from torch_points3d.modules.multimodal.pooling import BimodalCSRPool

# Branch made of a max pool, a mean pool and a residual fusion. Its first
# argument is None (presumably: no convolution on the images; TODO confirm)
max_pool = BimodalCSRPool(mode='max')
mean_pool = BimodalCSRPool(mode='mean')
fusion = BimodalFusion(mode='residual')
branch = UnimodalBranch(None, max_pool, mean_pool, fusion)

# Input dictionary: all-zero 3D features shaped like the point colors, and a
# copy of the image modality so that mm_data itself is left untouched
mm_data_dict = dict(
    x_3d=torch.zeros_like(mm_data.data.rgb),
    x_seen=None,
    modalities={'image': mm_data.modalities['image'].clone()})
mm_data_dict = branch.forward(mm_data_dict, 'image')

In [None]:
# Work on a copy of the batch and use the branch output as point colors
mm_data_out = mm_data.clone()
mm_data_out.data.rgb = mm_data_dict['x_3d']

viz_options = dict(
    class_names=CLASSES, class_colors=OBJECT_COLOR, figsize=1000, voxel=0.05,
    show_3d=True, show_2d=True, color_mode='light', alpha=0.3, pointsize=5)
visualize_mm_data(mm_data_out, **viz_options)

# 3D downsampling visualization with TorchSparse module

Visualize the impact of 3D downsampling on the mapping. Here the downsampling is performed using a 3D strided conv with kernel `downsampling_3d x downsampling_3d x downsampling_3d`.

In [None]:
from torch_points3d.modules.multimodal.modules import MultimodalBlockDown, UnimodalBranch
from torch_points3d.modules.multimodal.fusion import BimodalFusion
from torch_points3d.modules.multimodal.pooling import BimodalCSRPool
from torch_points3d.modules.SparseConv3d.modules import ResNetDown
from torch_points3d.modules.SparseConv3d.nn.torchsparse import SparseTensor, Conv3d

# 3D resolution downscaling factor
downsampling_3d = 8


class SomeDownConv3D(torch.nn.Module):
    """Two chained sparse 3D convolutions with hand-set, non-learned kernels.

    The first convolution has kernel size 1 and stride 1 and is set to the
    channel-wise identity. The second has kernel size and stride both equal
    to `stride`, and every spatial offset of its kernel is set to the
    channel-wise identity, so each output channel is the sum of the same
    input channel over the kernel window.

    Args:
        stride: kernel size and stride of the second (downsampling) conv.
        bias: forwarded to both Conv3d constructors.
    """

    def __init__(self, stride, bias=False):
        super(SomeDownConv3D, self).__init__()

        conv_1 = Conv3d(in_channels=3, out_channels=3, kernel_size=1, stride=1, bias=bias)
        conv_2 = Conv3d(in_channels=3, out_channels=3, kernel_size=stride, stride=stride, bias=bias)

        with torch.no_grad():
            conv_1.kernel = torch.nn.Parameter(torch.zeros_like(conv_1.kernel))
            conv_2.kernel = torch.nn.Parameter(torch.zeros_like(conv_2.kernel))
            for i in range(3):
                # The ellipsis covers every leading (spatial offset) dimension
                # of the kernel, if any. A cubic kernel of size `stride` has
                # stride**3 offsets, so looping over range(stride**2) would
                # leave most of them at zero
                conv_1.kernel[..., i, i] = 1
                conv_2.kernel[..., i, i] = 1

        self.conv_1 = conv_1
        self.conv_2 = conv_2

    def forward(self, x, *args, **kwargs):
        """Apply both convolutions to `x`; extra arguments are ignored."""
        return self.conv_2(self.conv_1(x))
    
# 3D downsampling module
conv_3d = SomeDownConv3D(downsampling_3d)

# Image branch: max pool, mean pool and residual fusion, with None as first
# argument (presumably: no convolution on the images; TODO confirm)
max_pool = BimodalCSRPool(mode='max')
mean_pool = BimodalCSRPool(mode='mean')
fusion = BimodalFusion(mode='residual')
branch = UnimodalBranch(None, max_pool, mean_pool, fusion)

# Multimodal block combining the 3D conv and the image branch, moved to GPU
mm_block = MultimodalBlockDown(conv_3d, None, image=branch).to('cuda')

# Prepare data with the expected dictionary format: an all-zero sparse tensor
# built on the cloud's coords and batch indices, and a copy of the images
mm_data_cuda = mm_data.clone().to('cuda')
device = mm_data_cuda.device
zero_feats = torch.zeros(mm_data.data.rgb.shape, device=device)
x_3d_in = SparseTensor(
    zero_feats, mm_data_cuda.data.coords, mm_data_cuda.data.batch, device)
mm_data_dict = dict(
    x_3d=x_3d_in,
    x_seen=None,
    modalities={'image': mm_data_cuda.modalities['image'].clone()})

# Forward the 3D downsampling
mm_data_dict_out = mm_block.forward(mm_data_dict)
sparse_out = mm_data_dict_out['x_3d']
feats, coords = sparse_out.F, sparse_out.C
n_out = feats.shape[0]

# Repackage the sparse output as a Batch for visualization: the last column
# of the coordinates is used as batch index and the first three, scaled by
# the voxel size, as positions. Colors and labels are placeholders
out_3d = Batch(
    x=feats,
    batch=coords[:, -1].long().to(feats.device),
    pos=(coords[:, :3] * voxel).float().to(feats.device),
    mapping_index=torch.arange(n_out),
    rgb=torch.zeros(n_out, 3, dtype=torch.uint8),
    y=torch.zeros(n_out, dtype=torch.long))

mm_data_out = MMData(out_3d, **mm_data_dict_out['modalities']).to('cpu')

In [None]:
# Interactive 3D + 2D visualization of the 3D-downsampled multimodal data
viz_options = dict(
    class_names=CLASSES, class_colors=OBJECT_COLOR, figsize=1000, voxel=0.05,
    show_3d=True, show_2d=True, color_mode='light', alpha=0.3, pointsize=5)
visualize_mm_data(mm_data_out, **viz_options)

# 2D downsampling visualization with image module

Visualize the impact of 2D downsampling on the mapping. Here the downsampling is performed using a 2D strided conv with kernel `downsampling_2d x downsampling_2d`.

In [None]:
from torch_points3d.modules.multimodal.modules import UnimodalBranch
from torch_points3d.modules.multimodal.fusion import BimodalFusion
from torch_points3d.modules.multimodal.pooling import BimodalCSRPool
from torch_points3d.modules.multimodal.modalities.image import ResNetDown

# 2D resolution downscaling factor
downsampling_2d = 8


# Manually create a conv layer with averaging kernels
class SomeDownConv(torch.nn.Module):
    """Strided 2D convolution with hand-set, per-channel averaging kernels.

    The convolution maps 3 channels to 3 channels with a 2x2 kernel. Its
    weights are zero except on the channel diagonal, where every kernel entry
    equals 1 / 4: each output channel is the mean of the same input channel
    over the 2x2 window, with no channel mixing. Note that the kernel is 2x2
    whatever the stride.

    Args:
        stride: stride of the convolution, i.e. the downscaling factor.
        bias: forwarded to torch.nn.Conv2d; the bias, if any, is left at its
            default initialization.
    """

    def __init__(self, stride, bias=False):
        super(SomeDownConv, self).__init__()
        kernel_size = 2
        conv_2d = torch.nn.Conv2d(3, 3, kernel_size, stride=stride, bias=bias)
        with torch.no_grad():
            conv_2d.weight = torch.nn.Parameter(torch.zeros_like(conv_2d.weight))
            for i in range(3):
                # Normalize by the kernel area so the window is averaged, not
                # summed: image values keep their original range
                conv_2d.weight[i, i] = 1 / kernel_size ** 2
        self.conv_2d = conv_2d

    def forward(self, x, *args, **kwargs):
        """Apply the convolution to `x`; extra arguments are ignored."""
        return self.conv_2d(x)

# Image branch: 2D downsampling conv, max pool, mean pool, residual fusion
image_conv = SomeDownConv(downsampling_2d)
max_pool = BimodalCSRPool(mode='max')
mean_pool = BimodalCSRPool(mode='mean')
fusion = BimodalFusion(mode='residual')
branch = UnimodalBranch(image_conv, max_pool, mean_pool, fusion)

# Prepare data with the expected dictionary format: all-zero 3D features
# shaped like the point colors, and a copy of the image modality
mm_data_dict = dict(
    x_3d=torch.zeros_like(mm_data.data.rgb),
    x_seen=None,
    modalities={'image': mm_data.modalities['image'].clone()})

# Forward the 2D downsampling and unpack the 3D features and the images
mm_data_dict_out = branch.forward(mm_data_dict, 'image')
x_mod = mm_data_dict_out['modalities']['image']
x_3d = mm_data_dict_out['x_3d']

In [None]:
# Work on a copy of the batch so that mm_data is left untouched
mm_data_out = mm_data.clone()

# Min-max normalize the 3D features into [0, 1) to use them as point colors.
# The range (max - min) is the denominator; dividing by max alone is only
# correct when the minimum is 0. The epsilon guards against a zero range
x_min, x_max = x_3d.min(), x_3d.max()
mm_data_out.data.rgb = (x_3d - x_min) / (x_max - x_min + 1e-6)
mm_data_out.modalities['image'] = x_mod

visualize_mm_data(mm_data_out, class_names=CLASSES, class_colors=OBJECT_COLOR, figsize=1000, voxel=0.05, show_3d=True, show_2d=True, color_mode='light', alpha=0.3, pointsize=5)

# 3D-2D downsampling visualization in MultiModalBlock

Visualize the impact of 2D and 3D subsampling at once.

In [None]:
from torch_points3d.modules.multimodal.modules import MultimodalBlockDown, UnimodalBranch
from torch_points3d.modules.multimodal.fusion import BimodalFusion
from torch_points3d.modules.multimodal.pooling import BimodalCSRPool
from torch_points3d.modules.SparseConv3d.modules import ResNetDown
from torch_points3d.modules.SparseConv3d.nn.torchsparse import SparseTensor, Conv3d

# 2D-3D resolution downscaling factor
downsampling_2d = 4
downsampling_3d = 4


# Manually create a conv layer with averaging kernels
class SomeDownConv(torch.nn.Module):
    """Strided 2x2 convolution averaging each RGB channel independently.

    The weights are zero except on the channel diagonal, where all four
    kernel entries equal 0.25: output channel c is the mean of input channel
    c over the 2x2 window.

    Args:
        stride: stride of the convolution.
        bias: forwarded to torch.nn.Conv2d.
    """

    def __init__(self, stride, bias=False):
        super().__init__()
        self.conv_2d = torch.nn.Conv2d(3, 3, 2, stride=stride, bias=bias)

        # (3, 3, 2, 2) weights: 0.25 wherever in-channel == out-channel
        channel_mean = torch.eye(3).view(3, 3, 1, 1).expand(3, 3, 2, 2) * 0.25
        with torch.no_grad():
            self.conv_2d.weight.copy_(channel_mean)

    def forward(self, x, *args, **kwargs):
        """Apply the convolution to `x`; extra arguments are ignored."""
        return self.conv_2d(x)

# Image branch: 2D downsampling conv, max pool, mean pool, residual fusion
image_conv = SomeDownConv(downsampling_2d)
max_pool = BimodalCSRPool(mode='max')
mean_pool = BimodalCSRPool(mode='mean')
fusion = BimodalFusion(mode='residual')
branch = UnimodalBranch(image_conv, max_pool, mean_pool, fusion)

# 3D module: a stride-1 conv followed by a strided conv. Their kernels keep
# the default initialization of Conv3d
stem_conv = Conv3d(in_channels=3, out_channels=3, kernel_size=3, stride=1)
down_conv = Conv3d(in_channels=3, out_channels=3, kernel_size=2, stride=downsampling_3d)
conv_3d = torch.nn.Sequential(stem_conv, down_conv)

# Multimodal block combining the 3D convs and the image branch, moved to GPU
mm_block = MultimodalBlockDown(conv_3d, None, image=branch).to('cuda')

# Prepare data with the expected dictionary format: an all-zero sparse tensor
# built on the cloud's coords and batch indices, and a copy of the images
mm_data_cuda = mm_data.clone().to('cuda')
device = mm_data_cuda.device
zero_feats = torch.zeros(mm_data.data.rgb.shape, device=device)
x_3d_in = SparseTensor(
    zero_feats, mm_data_cuda.data.coords, mm_data_cuda.data.batch, device)
mm_data_dict = dict(
    x_3d=x_3d_in,
    x_seen=None,
    modalities={'image': mm_data_cuda.modalities['image'].clone()})

# Forward the 2D and 3D downsampling
mm_data_dict_out = mm_block.forward(mm_data_dict)
sparse_out = mm_data_dict_out['x_3d']
feats, coords = sparse_out.F, sparse_out.C
n_out = feats.shape[0]

# Repackage the sparse output as a Batch for visualization: the last column
# of the coordinates is used as batch index and the first three, scaled by
# the voxel size, as positions. Colors and labels are placeholders
out_3d = Batch(
    x=feats,
    batch=coords[:, -1].long().to(feats.device),
    pos=(coords[:, :3] * voxel).float().to(feats.device),
    mapping_index=torch.arange(n_out),
    rgb=torch.zeros(n_out, 3, dtype=torch.uint8),
    y=torch.zeros(n_out, dtype=torch.long))

mm_data_out = MMData(out_3d, **mm_data_dict_out['modalities']).to('cpu')

In [None]:
# Interactive 3D + 2D visualization of the 2D- and 3D-downsampled data
viz_options = dict(
    class_names=CLASSES, class_colors=OBJECT_COLOR, figsize=1000, voxel=0.05,
    show_3d=True, show_2d=True, color_mode='light', alpha=0.3, pointsize=5)
visualize_mm_data(mm_data_out, **viz_options)