In [2]:
import sys
from pathlib import Path

# Optional path tweak, currently disabled: it would append the sibling `src`
# directory of the working directory's parent to sys.path. `sys` and `Path` are
# only referenced by the commented-out line below.
# sys.path.append(str(Path().resolve().parent / 'src'))

import torch
import numpy as np

# Project-local packages (`models`, `ext`). With the path tweak above disabled,
# they must already be importable from the kernel's environment.
# NOTE(review): JEPA3DEncoderWrapper is not used by the cells visible here.
from models.encoders.jepa3d_wrapper import JEPA3DEncoderWrapper
from ext.jepa3d.models.encoder_3djepa import Encoder3DJEPA
from ext.point2vec.tokenizer import PointCloudTokenizer

# Reaching this line means every import above succeeded.
print("✅ Both imports resolve!")




✅ Both imports resolve!


In [None]:
# Load the point cloud data
# np.load on an .npz returns a lazily-read archive that keeps the file open;
# the context manager closes it as soon as the array has been read into memory.
with np.load('../data/pointclouds/02691156_1021a0914a7207aff927ed529ad90a11_2048.npz') as data:
    pointcloud_np = data['points']

# Convert to PyTorch tensor
pointcloud_tensor = torch.from_numpy(pointcloud_np)
print(f"Pointcloud shape: {pointcloud_tensor.shape}")

# Add a leading batch dimension; the number of points is taken from the data
# instead of being hard-coded, so other point-cloud sizes work unchanged.
pointcloud_batch = pointcloud_tensor.reshape(1, -1, 3)
num_points = pointcloud_batch.shape[1]

# Get the p2v(point2vec) tokenizer features and centers
# One group(patch) = one point, each with 1536 dimensional feature
# Positional arguments are presumably (num_groups, group_size, group_radius,
# token_dim) - TODO confirm against PointCloudTokenizer's signature.
# NOTE(review): no weights are loaded here, so the tokenizer presumably runs
# with its freshly initialised parameters; seed torch or load a checkpoint if
# reproducible tokens are needed.
tokenizer = PointCloudTokenizer(num_points, 1, None, 1536)
# The tokens are only consumed as fixed features further down, so skip
# building an autograd graph for this forward pass.
with torch.no_grad():
    tokens, centers = tokenizer(pointcloud_batch)

# Print some statistics about the tokenizer features and centers
print(f"Tokens shape: {tokens.shape}")
print(f"Centers shape: {centers.shape}")

# The statistics below are computed on the group centers returned by the tokenizer.
print("Pointcloud statistics:")
print(f"  Mean: {centers.mean().item():.4f}")
print(f"  Std: {centers.std().item():.4f}")
print(f"  Min: {centers.min().item():.4f}")
print(f"  Max: {centers.max().item():.4f}")

# Drop the batch dimension: per-point features and per-point positions.
p2v_output = tokens.squeeze(0)
p2v_positions = centers.squeeze(0)


Pointcloud shape: torch.Size([2048, 3])
Tokens shape: torch.Size([1, 2048, 1536])
Centers shape: torch.Size([1, 2048, 3])
Pointcloud statistics:
  Mean: 0.0280
  Std: 0.1147
  Min: -0.3595
  Max: 0.3609


In [None]:
def create_featurized_scene_dict(p2v_output, p2v_positions, num_points=2048, device='cpu', model=None, seed=None):
    """
    Build the featurized scene dictionary that is fed to the 3D-JEPA encoder.

    The per-point feature matrix is split column-wise into a "CLIP" part (the
    first 768 columns) and a "DINO" part (all remaining columns), the point
    positions are multiplied by a constant scale factor, and a synthetic RGB
    colour is generated for every point.

    Args:
        p2v_output: 2-D tensor with one row of features per point
            (``num_points`` rows). The notebook passes 1536 columns so both
            halves are 768 wide.
        p2v_positions: 2-D tensor with one row of coordinates per point
            (``num_points`` rows).
        num_points: Number of points in the scene; must equal the number of
            rows of ``p2v_output`` and ``p2v_positions``.
        device: Device on which all returned tensors are created.
        model: Unused. Kept so existing callers that pass it keep working.
        seed: Optional integer. When given, the synthetic colours are drawn
            from a dedicated generator seeded with this value, making the
            result reproducible without touching the global RNG. When None
            (default) the global torch RNG is used, as before.

    Returns:
        dict: featurized_scene_dict with keys ``"features_clip"``,
        ``"features_dino"``, ``"rgb"`` and ``"points"``.

    Raises:
        ValueError: If ``p2v_output`` or ``p2v_positions`` is not 2-D or does
            not have ``num_points`` rows.
    """
    clip_feat_dim = 768  # columns [0, 768) go to "features_clip", the rest to "features_dino"
    scene_scale = 7      # ShapeNet object -> scale assumed to be expected by 3djepa

    # Fail early: a mismatch would otherwise produce a dictionary whose entries
    # have different numbers of rows.
    for name, tensor in (("p2v_output", p2v_output), ("p2v_positions", p2v_positions)):
        if tensor.dim() != 2 or tensor.shape[0] != num_points:
            raise ValueError(
                f"{name} must be 2-D with {num_points} rows (num_points), "
                f"got shape {tuple(tensor.shape)}"
            )

    generator = None
    if seed is not None:
        generator = torch.Generator(device=device)
        generator.manual_seed(seed)

    def _rand(*size):
        # Uniform [0, 1) samples on the target device, optionally from the seeded generator.
        return torch.rand(*size, device=device, generator=generator)

    # Randomly tag ~30% of the points as "floor" and ~20% of the rest as "ceiling".
    floor_mask = _rand(num_points) < 0.3
    ceiling_mask = (~floor_mask) & (_rand(num_points) < 0.2)

    # Create RGB colors (0-1 range; per the original note the model multiplies by 255)
    rgb = _rand(num_points, 3)
    floor_base = torch.tensor([0.4, 0.3, 0.2], device=device)
    ceiling_base = torch.tensor([0.8, 0.8, 0.8], device=device)
    rgb[floor_mask] = floor_base + _rand(int(floor_mask.sum()), 3) * 0.3
    rgb[ceiling_mask] = ceiling_base + _rand(int(ceiling_mask.sum()), 3) * 0.2
    rgb = torch.clamp(rgb, 0, 1)

    # Split the point2vec output into the CLIP and DINO feature slots.
    # clone().detach() gives copies that are cut off from any autograd graph.
    features_clip = p2v_output[:, :clip_feat_dim].clone().detach().to(device)
    features_dino = p2v_output[:, clip_feat_dim:].clone().detach().to(device)

    # Scale the shapenet object to the assumed expected scale by 3djepa
    xyz = p2v_positions.clone().detach().to(device) * scene_scale

    # Create the featurized scene dictionary
    featurized_scene_dict = {
        "features_clip": features_clip,      # Shape: (num_points, clip_feat_dim)
        "features_dino": features_dino,      # Shape: (num_points, remaining columns)
        "rgb": rgb,                          # Shape: (num_points, 3) in [0,1] range
        "points": xyz,                       # Shape: same as p2v_positions
    }

    return featurized_scene_dict


In [55]:
# Load the pretrained 3djepa encoder and run it on the featurized scene.
# Single source of truth for the target device (the cell requires a GPU, as before).
device = torch.device('cuda')

model_3djepa = Encoder3DJEPA.from_pretrained("facebook/3d-jepa")
model_3djepa = model_3djepa.to(device)
# Inference only: make sure layers such as dropout / batch-norm are in
# evaluation mode (a no-op if the loader already did this).
model_3djepa.eval()

# zero_token is moved by hand; presumably it is a plain tensor attribute that
# Module.to() does not relocate - TODO confirm in Encoder3DJEPA.
if hasattr(model_3djepa, 'zero_token'):
    model_3djepa.zero_token = model_3djepa.zero_token.to(device)

featurized_scene_dict = create_featurized_scene_dict(
    p2v_output=p2v_output,
    p2v_positions=p2v_positions,
    num_points=p2v_output.shape[0],  # derive from the data instead of repeating 2048
    model=model_3djepa,
    device=device,
)

# No gradients are needed for feature extraction; this avoids keeping the
# autograd graph alive on the GPU.
with torch.no_grad():
    output = model_3djepa(featurized_scene_dict)

[DEBUG] features.shape = torch.Size([1, 2048, 1536])
[DEBUG] zero_token.shape = torch.Size([1536])


In [None]:
# Analysis of the output of the 3D-JEPA encoder: collect the report line by
# line, then emit it with a single print.
report_lines = [f"Model output keys: {output.keys()}"]
if 'features' in output:
    report_lines.append(f"Features shape: {output['features'].shape}")
if 'points' in output:
    report_lines.append(f"Points shape: {output['points'].shape}")

# Summary statistics over all entries of the feature tensor
if 'features' in output:
    features = output['features']
    report_lines.extend([
        f"\nFeature statistics:",
        f"  Mean: {features.mean().item():.4f}",
        f"  Std: {features.std().item():.4f}",
        f"  Min: {features.min().item():.4f}",
        f"  Max: {features.max().item():.4f}",
    ])

print("\n".join(report_lines))

Model output keys: dict_keys(['features', 'points'])
Features shape: torch.Size([2048, 256])
Points shape: torch.Size([2048, 3])
Success! Generated embeddings from synthetic featurized scene.

Feature statistics:
  Mean: 0.0875
  Std: 1.9679
  Min: -12.3838
  Max: 13.7755
