In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports edited
# modules before each cell executes so code changes are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn.functional as F
import tqdm

import raymarching2

# Seed the RNG so the random directions, densities and colors -- and therefore
# the sums printed by the benchmark cells -- are identical on every fresh run.
SEED = 0
torch.manual_seed(SEED)

DEVICE = "cuda"
NUM_RAYS = 2000             # rays in the benchmark batch
MAX_SAMPLES_PER_RAY = 1024  # width of the per-ray sigma / rgb buffers

# Every ray starts at (0.1, 0.1, 0.1) and points in a random unit direction.
rays_o = torch.full((NUM_RAYS, 3), 0.1, device=DEVICE)
rays_d = torch.randn((NUM_RAYS, 3)).to(DEVICE)
rays_d = F.normalize(rays_d, dim=-1)

# Every byte is 255, i.e. every bit is set.
# NOTE(review): presumably 5 levels of a 128^3 occupancy grid packed 8 cells
# per byte -- confirm against raymarching2.
density_bitfield = torch.full(
    (5, 128 ** 3 // 8), 255, dtype=torch.uint8, device=DEVICE
)

# Six scalars; the helpers defined in this notebook read them as
# (x_min, y_min, z_min, x_max, y_max, z_max).
aabb = torch.tensor([0., 0., 0., 1., 1., 1.], device=DEVICE)
torch.cuda.synchronize()

# Leaf tensors so that .grad is populated by the benchmark's backward passes.
sigmas = torch.rand(
    (NUM_RAYS, MAX_SAMPLES_PER_RAY), device=rays_o.device, requires_grad=True
)
rgbs = torch.rand(
    (NUM_RAYS, MAX_SAMPLES_PER_RAY, 3), device=rays_o.device, requires_grad=True
)
bkgd_rgb = torch.rand(3).to(DEVICE)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch.profiler import profile, record_function, ProfilerActivity

In [10]:
# Initial sampling pass: its `indices` rows say how many samples each ray
# produced, which determines the size of the packed sigma / rgb inputs below.
indices, positions, dirs, deltas, ts = raymarching2.generate_training_samples(
    rays_o, rays_d, aabb, density_bitfield, 1024 * rays_o.shape[0]
)

# For every (ray_id, sample_id, sample_cnt) row take the first `sample_cnt`
# entries of that ray and concatenate them into flat tensors.
packed_slices = [
    (sigmas[ray_id, 0:sample_cnt], rgbs[ray_id, 0:sample_cnt])
    for ray_id, _, sample_cnt in indices
]
sigmas_collector = torch.cat([sigma_slice for sigma_slice, _ in packed_slices])
rgbs_collector = torch.cat([rgb_slice for _, rgb_slice in packed_slices])

# Turn the packed copies into fresh leaf tensors so gradients are stored on
# them instead of flowing back into `sigmas` / `rgbs`.
sigmas_collector = sigmas_collector.detach().clone().requires_grad_(True)
rgbs_collector = rgbs_collector.detach().clone().requires_grad_(True)

# `prof` is rebound on every iteration, so the table printed afterwards
# describes the last iteration only. Gradients are NOT cleared inside the
# loop: the printed grad sums are accumulated over all 1000 backward passes.
for _ in tqdm.tqdm(range(1000)):
    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof, \
            record_function("volumetric_rendering"):
        indices, positions, dirs, deltas, ts = raymarching2.generate_training_samples(
            rays_o, rays_d, aabb, density_bitfield, 1024 * rays_o.shape[0]
        )
        rendered = raymarching2.volumetric_rendering(
            indices, positions, deltas, ts,
            sigmas_collector, rgbs_collector,
            bkgd_rgb,
        )
        (
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            accumulated_position,
        ) = rendered
        accumulated_color.sum().backward()
    torch.cuda.synchronize()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
print("accumulated_color", accumulated_color.sum())
print("grad sigmas", sigmas_collector.grad.sum())
print("grad rgbs", rgbs_collector.grad.sum())

# Reset the accumulated gradients so a re-run starts from zero.
for packed_input in (sigmas_collector, rgbs_collector):
    packed_input.grad.zero_()



100%|██████████| 1000/1000 [00:08<00:00, 122.06it/s]

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
void volumetric_rendering_kernel<float>(unsigned int...         0.00%       0.000us         0.00%       0.000us       0.000us       2.251ms        35.94%       2.251ms       2.251ms             1  
void kernel_generate_training_samples<float>(unsigne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.081ms        33.22%       2.081ms       2.081ms             1  
void volu




In [11]:
def generate_training_samples(
    rays_o: torch.Tensor,
    rays_d: torch.Tensor,
    aabb: torch.Tensor,
    max_samples: int = 10_000,
):
    """March every ray with a fixed step and flag the samples inside `aabb`.

    Args:
        rays_o: ray origins, any shape ending in 3; flattened to [num_rays, 3].
        rays_d: ray directions, same layout as `rays_o`.
        aabb: six values ordered (x_min, y_min, z_min, x_max, y_max, z_max).
        max_samples: currently unused; kept so existing calls keep working.
    Returns:
        points: [num_rays, 1024, 3] sample positions `rays_o + rays_d * t`.
        selector: [num_rays, 1024] bool mask, True where the sample lies
            inside the box (boundaries included).
        t_vals: [num_rays, 1024] distance of each sample along its ray.
    """
    device = rays_o.device
    NERF_STEPS = 1024
    # 1.7320508075688772 == sqrt(3), the diagonal of a unit cube, so the
    # marched distance spans that diagonal in NERF_STEPS equal steps.
    STEPSIZE = 1.7320508075688772 / NERF_STEPS

    rays_o = rays_o.reshape(-1, 3)
    rays_d = rays_d.reshape(-1, 3)

    # The same distances are shared by all rays (expand makes a view, no copy).
    t_vals = torch.arange(0.0, NERF_STEPS, device=device) * STEPSIZE
    t_vals = t_vals[None, :].expand((rays_o.shape[0], -1))
    points = rays_o[:, None, :] + rays_d[:, None, :] * t_vals[:, :, None]

    selector = (
        (points[..., 0] >= aabb[0]) &
        (points[..., 1] >= aabb[1]) &
        (points[..., 2] >= aabb[2]) &
        (points[..., 0] <= aabb[3]) &
        (points[..., 1] <= aabb[4]) &
        (points[..., 2] <= aabb[5])
    )
    # The dense points are returned together with the mask; callers apply the
    # mask themselves, so no gathered copy is built here.
    return points, selector, t_vals


def volumetric_rendering(rgb, density, t_vals, dirs, color_bkgd):
    """Volumetric Rendering Function.

    Composites per-sample colors along each ray using weights
    `alpha * transmittance` derived from the densities, then blends the
    background color into whatever opacity is left over.

    Args:
        rgb: torch.Tensor(float32), color, [batch_size, num_samples, 3]
        density: torch.Tensor(float32), density, [batch_size, num_samples, 1].
        t_vals: torch.Tensor(float32), [batch_size, num_samples].
        dirs: torch.Tensor(float32), [batch_size, 3].
        color_bkgd: torch.Tensor(float32), [3].
    Returns:
        comp_rgb: torch.Tensor(float32), [batch_size, 3].
        depth: torch.Tensor(float32), [batch_size]. Sum of `weights * t_vals`
            (not normalised by `acc`).
        acc: torch.Tensor(float32), [batch_size]. Sum of the weights.
        weights: torch.Tensor(float32), [batch_size, num_samples]
    """
    # Spacing between consecutive samples. The last sample has no successor,
    # so it reuses the first interval as its own length.
    steps = t_vals[..., 1:] - t_vals[..., :-1]
    t_dists = torch.cat([steps, steps[..., :1]], -1)
    # Scale by the direction norm so non-unit directions give real distances.
    delta = t_dists * torch.linalg.norm(dirs[..., None, :], dim=-1)

    # Note that we're quietly turning density from [..., 0] to [...].
    density_delta = density[..., 0] * delta

    alpha = 1 - torch.exp(-density_delta)
    # Transmittance in front of each sample: exp(-sum of density * delta over
    # all earlier samples); the first sample sees an unoccluded ray (exp(0)).
    trans = torch.exp(
        -torch.cat(
            [
                torch.zeros_like(density_delta[..., :1]),
                torch.cumsum(density_delta[..., :-1], dim=-1),
            ],
            dim=-1,
        )
    )
    weights = alpha * trans

    comp_rgb = (weights[..., None] * rgb).sum(dim=-2)
    acc = weights.sum(dim=-1)
    depth = (weights * t_vals).sum(dim=-1)

    # Whatever opacity the samples did not use up shows the background.
    comp_rgb = comp_rgb + color_bkgd * (1.0 - acc[..., None])
    return comp_rgb, depth, acc, weights

# Drain any pending GPU work before the timed loop starts.
torch.cuda.synchronize()

# `prof` is rebound on every iteration, so the table printed below describes
# the last iteration only. Gradients on `sigmas` / `rgbs` are not cleared
# inside the loop, so the printed sums are accumulated over all 1000 passes.
for _ in tqdm.tqdm(range(1000)):
    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof, \
            record_function("volumetric_rendering"):
        points, selector, t_vals = generate_training_samples(rays_o, rays_d, aabb)
        # Multiplying by the boolean mask zeroes the density of samples
        # that fall outside the bounding box.
        comp_rgb, depth, acc, weights = volumetric_rendering(
            rgbs,
            (sigmas * selector).unsqueeze(-1),
            t_vals,
            rays_d,
            bkgd_rgb,
        )
        comp_rgb.sum().backward()

    torch.cuda.synchronize()

print("comp_rgb", comp_rgb.sum())
print("grad sigmas", sigmas.grad.sum())
print("grad rgbs", rgbs.grad.sum())
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Reset the accumulated gradients so a re-run starts from zero.
for leaf in (sigmas, rgbs):
    leaf.grad.zero_()


100%|██████████| 1000/1000 [00:08<00:00, 116.70it/s]

comp_rgb tensor(3263.0479, device='cuda:0', grad_fn=<SumBackward0>)
grad sigmas tensor(-75984.8594, device='cuda:0')
grad rgbs tensor(882290.8750, device='cuda:0')
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     386.000us        15.16%     386.000us      48.250us             8  
void at::native::elementwise_kernel<128, 4,




In [None]:
import torch
import torch.nn.functional as F

import raymarching2


# This cell rebinds rays_o / rays_d / density_bitfield / aabb. The rays now
# have a (10, 100, 3) layout instead of the flat batch set up at the top of
# the notebook, so the benchmark cells must not be re-run after it without
# re-running the setup cell first.
ray_shape = (10, 100, 3)
rays_o = torch.full(ray_shape, 0.1).to("cuda")
rays_d = F.normalize(torch.randn(ray_shape).to("cuda"), dim=-1)

# Every byte is 255, i.e. every bit is set.
density_bitfield = torch.full((5, 128 ** 3 // 8), 255, dtype=torch.uint8).to("cuda")

aabb = torch.tensor([0., 0., 0., 1., 1., 1.]).to("cuda")

# NOTE(review): the benchmark cell calls this function with a fifth argument
# and unpacks (indices, positions, dirs, deltas, ts); here it is called with
# four arguments and unpacked as (..., nears, fars). Confirm which signature
# the current raymarching2 module exposes.
positions, dirs, deltas, nears, fars = raymarching2.generate_training_samples(
    rays_o, rays_d, aabb, density_bitfield
)
torch.cuda.synchronize()

In [None]:
from tava.utils.plotly import Trimesh, plot_scene, PointCloud

def aabb_to_mesh(aabb):
    """Build a closed 12-triangle box mesh from an axis-aligned bounding box.

    Args:
        aabb: 1-D tensor of six values ordered
            (x_min, y_min, z_min, x_max, y_max, z_max).
    Returns:
        vertices: [8, 3] tensor with the box corners, same dtype/device as
            `aabb`.
        faces: [12, 3] int32 tensor of vertex indices, two triangles for each
            of the six sides, on `aabb`'s device.
    """
    # Each row picks the (x, y, z) entries of `aabb` that form one corner.
    corner_index = [
        [0, 1, 2],  # 0: (x_min, y_min, z_min)
        [3, 1, 2],  # 1: (x_max, y_min, z_min)
        [0, 4, 2],  # 2: (x_min, y_max, z_min)
        [0, 1, 5],  # 3: (x_min, y_min, z_max)
        [3, 4, 2],  # 4: (x_max, y_max, z_min)
        [0, 4, 5],  # 5: (x_min, y_max, z_max)
        [3, 1, 5],  # 6: (x_max, y_min, z_max)
        [3, 4, 5],  # 7: (x_max, y_max, z_max)
    ]
    vertices = torch.stack([aabb[index] for index in corner_index])
    # Two triangles per side. The z_max side uses the same winding as the
    # other five; previously the y_max pair was listed twice and the z_max
    # side was missing, leaving the box open.
    faces = torch.tensor([
        [0, 1, 4], [0, 4, 2],  # z = z_min
        [0, 3, 6], [0, 6, 1],  # y = y_min
        [1, 6, 4], [4, 6, 7],  # x = x_max
        [2, 4, 7], [2, 7, 5],  # y = y_max
        [2, 5, 0], [0, 5, 3],  # x = x_min
        [3, 5, 7], [3, 7, 6],  # z = z_max
    ], dtype=torch.int32, device=aabb.device)
    return vertices, faces

vertices, faces = aabb_to_mesh(aabb)

# Semi-transparent box so the samples inside it stay visible.
bbox_mesh = Trimesh(
    vertices.cpu().numpy(),
    faces.cpu().numpy(),
)

# Keep only samples whose coordinates sum to a positive value.
# NOTE(review): presumably this drops unused / zero-filled sample slots --
# confirm against what raymarching2 writes into `positions`.
sample_mask = positions.sum(dim=-1) > 0
sample_cloud = PointCloud(positions[sample_mask].cpu().numpy())

plot_scene(
    {
        "bbox": {"struct": bbox_mesh, "mesh_opacity": 0.7},
        "samples": {"struct": sample_cloud},
    }
)

In [None]:
# Number of samples whose coordinate sum is positive.
coord_sums = positions.sum(dim=-1)
(coord_sums > 0).sum()